diff --git a/prebuilt-sdk/x86_64_linux/VERSION b/prebuilt-sdk/x86_64_linux/VERSION index 79d5c1795..91123ad1e 100644 --- a/prebuilt-sdk/x86_64_linux/VERSION +++ b/prebuilt-sdk/x86_64_linux/VERSION @@ -1 +1 @@ -6.4.14_CL650117A_D650117_A648302_R647402_T648811_O646970 \ No newline at end of file +6.4.15_CL690884A_D690855_A690484_R690194_T690259_O688896 \ No newline at end of file diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h index 48f824f65..c49800a9f 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h @@ -1340,6 +1340,21 @@ VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeCallback(vx_node node, vx_nodecom */ VX_API_ENTRY vx_nodecomplete_f VX_API_CALL vxRetrieveNodeCallback(vx_node node); +/*! \brief Assigns a callback to a node. + * If a callback already exists in this node, this function must return an error + * and the user may clear the callback by passing a NULL pointer as the callback. + * \param [in] node The reference to the node. + * \param [in] callback The callback to associate with completion of this + * specific node. + * \warning This must be used with extreme caution as it can \e ruin + * optimizations in the power/performance efficiency of a graph. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Callback assigned; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE node is not a valid \ref vx_node reference. + * \ingroup group_node_callback + */ +VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeQueryCallback(vx_node node, vx_nodequery_f callback); + /*! \brief Sets the node target to the provided value. A success invalidates the graph * that the node belongs to (\ref vxVerifyGraph must be called before the next execution) * \param [in] node The reference to the \ref vx_node object. 
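The new vxAssignNodeQueryCallback entry point follows the same assign/clear contract as vxAssignNodeCallback. Below is a minimal, hedged usage sketch; the helper name is hypothetical, and the vx_nodequery_f typedef is not shown in this hunk, so the callback is treated here as an opaque value.

```c
#include <VX/vx.h>

/* Hypothetical helper: register a query callback on a node, clearing any
 * previously registered callback if the initial assignment is rejected. */
static vx_status attach_query_callback(vx_node node, vx_nodequery_f callback)
{
    vx_status status = vxAssignNodeQueryCallback(node, callback);
    if (status != VX_SUCCESS)
    {
        /* Per the documentation above, assignment fails when a callback already
         * exists; passing NULL clears it, after which the assignment can be retried. */
        if (vxAssignNodeQueryCallback(node, NULL) == VX_SUCCESS)
        {
            status = vxAssignNodeQueryCallback(node, callback);
        }
    }
    return status;
}
```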
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h index d35396074..8a2ac76b1 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h @@ -503,6 +503,40 @@ enum vx_kernel_e { VX_KERNEL_NN_BATCH_GEMM_RELU_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x33, + VX_KERNEL_NN_FUSED_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x34, + + VX_KERNEL_NN_CONVOLUTION_RELU_POOLING_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x35, + + VX_KERNEL_NN_LAYER_NORMALIZATION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x36, + + VX_KERNEL_NN_INSTANCE_NORMALIZATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x37, + + VX_KERNEL_NN_GROUP_NORMALIZATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x38, + + VX_KERNEL_NN_LOGICAL_OPS_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x39, + + VX_KERNEL_NN_LOGICAL_NOT_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x40, + + VX_KERNEL_NN_RELATIONAL_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x41, + + VX_KERNEL_NN_TENSOR_REDUCE_MAX = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x42, + + VX_KERNEL_NN_MAXIMUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x43, + + VX_KERNEL_NN_MINIMUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x44, + + VX_KERNEL_NN_TENSOR_SELECT_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x45, + + VX_KERNEL_NN_REDUCE_SUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x46, + + VX_KERNEL_NN_GRU_CELL_ACTIVATION_Z_H_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x47, + + VX_KERNEL_NN_GRU_CELL_H_TIMES_ACTIVATION_R_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x48, + + VX_KERNEL_NN_GRU_CELL_RESET_AFTER_ACTIVATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x49, + + VX_KERNEL_NN_LSTM_ACTIVATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x50, + VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */ }; diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h index f3f019113..ec5d069ed 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h @@ -214,7 +214,7 @@ VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can suppor 1: support */ #ifndef VX_STREAM_PROCESSOR_SUPPORT -#define VX_STREAM_PROCESSOR_SUPPORT 0 +#define VX_STREAM_PROCESSOR_SUPPORT 1 #endif /* @@ -258,5 +258,144 @@ VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can suppor #define VX_ACTIVATION_EXT2_SUPPORT 1 #endif +/* + VX_TENSORVIEW_ON_ANY_DIM is used to declare that ovxlib can do optimization for all concat node(all dimision) to tensor view if possiable, not only channel. + [value] + 0: disable + 1: enable +*/ +#ifndef VX_TENSORVIEW_ON_ANY_DIM +#define VX_TENSORVIEW_ON_ANY_DIM 0 +#endif + +/* +VX_DEPTH2SPACE_CRD_MODE_SUPPORT is used to declare that SPACE2DEPTH can support CRD mode + [value] + 0: not support + 1: support +*/ +#ifndef VX_DEPTH2SPACE_CRD_MODE_SUPPORT +#define VX_DEPTH2SPACE_CRD_MODE_SUPPORT 1 +#endif + +/* + VX_LAYER_NORMALIZATION_VX_SUPPORT is used to declare driver support layer normalization layer. 
+ [value] + 0: not support + 1: support +*/ +#ifndef VX_LAYER_NORMALIZATION_VX_SUPPORT +#define VX_LAYER_NORMALIZATION_VX_SUPPORT 1 +#endif + +/* + VX_INSTANCE_NORMALIZATION_VX_SUPPORT is used to declare that the driver supports the instance normalization layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_INSTANCE_NORMALIZATION_VX_SUPPORT +#define VX_INSTANCE_NORMALIZATION_VX_SUPPORT 1 +#endif + +/* + VX_GROUP_NORMALIZATION_VX_SUPPORT is used to declare that the driver supports the group normalization layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_GROUP_NORMALIZATION_VX_SUPPORT +#define VX_GROUP_NORMALIZATION_VX_SUPPORT 1 +#endif + +/* + VX_LOGICAL_VX_SUPPORT is used to declare that the driver supports the logical operation layers. + [value] + 0: not support + 1: support +*/ +#ifndef VX_LOGICAL_VX_SUPPORT +#define VX_LOGICAL_VX_SUPPORT 1 +#endif + +/* + VX_RELATIONAL_OPS_VX_SUPPORT is used to declare that the driver supports the relational operation layers. + [value] + 0: not support + 1: support +*/ +#ifndef VX_RELATIONAL_OPS_VX_SUPPORT +#define VX_RELATIONAL_OPS_VX_SUPPORT 1 +#endif + +/* + VX_REDUCE_MAX_VX_SUPPORT is used to declare that the driver supports the reduce max layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_REDUCE_MAX_VX_SUPPORT +#define VX_REDUCE_MAX_VX_SUPPORT 1 +#endif + +/* + VX_REDUCE_MEAN_VX_SUPPORT is used to declare that the driver supports the reduce mean layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_REDUCE_MEAN_VX_SUPPORT +#define VX_REDUCE_MEAN_VX_SUPPORT 1 +#endif + +/* + VX_REDUCE_SUM_VX_SUPPORT is used to declare that the driver supports the reduce sum layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_REDUCE_SUM_VX_SUPPORT +#define VX_REDUCE_SUM_VX_SUPPORT 1 +#endif + +/* + VX_MAX_MIN_IMUM_VX_SUPPORT is used to declare that the driver supports the maximum and minimum layers. + [value] + 0: not support + 1: support +*/ +#ifndef VX_MAX_MIN_IMUM_VX_SUPPORT +#define VX_MAX_MIN_IMUM_VX_SUPPORT 1 +#endif + +/* + VX_TENSOR_SELECT_VX_SUPPORT is used to declare that the driver supports the tensor select layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_TENSOR_SELECT_VX_SUPPORT +#define VX_TENSOR_SELECT_VX_SUPPORT 1 +#endif + +/* + VX_GRU_CELL_VX_SUPPORT is used to declare that the driver supports the GRU cell layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_GRU_CELL_VX_SUPPORT +#define VX_GRU_CELL_VX_SUPPORT 1 +#endif + +/* + VX_LSTM_ACTIVATION_SUPPORT is used to declare that the driver supports the LSTM activation layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_LSTM_ACTIVATION_SUPPORT +#define VX_LSTM_ACTIVATION_SUPPORT 1 +#endif #endif /* __VX_KHR_COMPATIBLE_H__ */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h index a43a37ec2..49472870d 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h @@ -395,6 +395,17 @@ enum vx_tensor_lifetime_type_e VX_TENSOR_LIFE_TIME_DYNAMIC, }; +/*! \brief Specifies the depth-to-space mode + * \ingroup group_cnn + */ +enum vx_nn_depth_to_space_mode_e +{ + /*! \brief DCR(default) for depth-column-row order re-arrangement */ + VX_NN_DEPTH_TO_SPACE_DCR = 0x0, + /*! \brief CRD for column-row-depth order re-arrangement */ + VX_NN_DEPTH_TO_SPACE_CRD, +}; + typedef struct _vx_nn_convolution_3d_params_t { vx_int32 padding_w_left; /*!< \brief Number of elements added at each side in the left of w dimension of the input. 
*/ @@ -972,6 +983,16 @@ typedef struct _vx_nn_mean_params_t vx_int32 keep_dims; /*!< \brief Keep dims, if positive, retains reduced dims with length 1 */ } vx_nn_mean_params_t; +/*! \brief Input parameter for reducesum layer +* \ingroup group_cnn +*\version 0.5 +*/ +typedef struct _vx_nn_sum_params_t +{ + vx_tensor axis; /*!< \brief 1D axis tensor of reduce dims */ + vx_int32 keep_dims; /*!< \brief Keep dims, if positive, retains reduced dims with length 1 */ +} vx_nn_sum_params_t; + /*! \brief Input parameter for tensor squeeze layer * \ingroup group_cnn *\version 0.5 @@ -1254,6 +1275,12 @@ typedef struct _vx_nn_reorg_params_ext2_t vx_int32 *axis; } vx_nn_reorg_params_ext2_t; +typedef struct _vx_nn_reorg_params_ext3_t +{ + vx_nn_reorg_params_ext2_t base; /*!< \brief vx_nn_reorg_params \ref vx_nn_reorg_params_t */ + vx_enum mode; /*!< \brief [Optional] Only for DEPH2SPACE */ +} vx_nn_reorg_params_ext3_t; + /*! \brief [Graph] Creates a Reorgnization Layer Node, Enhancement of vxReorgLayer, Support both DEPTH to SPACE and SPACE to DEPTH. * \param [in] graph The reference to the parent graph. * \param [in] input The input tensor data to reorg. @@ -1911,6 +1938,21 @@ VX_API_ENTRY vx_node VX_API_CALL vxRPNLayer( vx_tensor score_output ); +/*! \brief Input parameters for a lstm activation operation. + * \ingroup group_cnn + * \version 0.3 + */ +typedef struct _vx_nn_lstm_activation_params_t +{ + vx_int32 is_ln; + vx_int32 is_cifg; + vx_int32 is_proj; + vx_int32 is_hybrid; + vx_int32 is_peephole; + vx_int32 recurrent_activation; + vx_float32 forget_bias; +} vx_nn_lstm_activation_params_t; + /*! \brief Input parameters for a lstm operation. * \ingroup group_cnn * \version 0.3 @@ -2115,6 +2157,28 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorMeanNode( vx_size size_of_mean_param, vx_tensor outputs); +/*! \brief [Graph] Creates sum layer node. +* \details +* Computes the sum of elements across dimensions of a tensor. +* +* \param [in] graph The handle to the graph. +* \param [in] input A n-D tensor, specifying the input. +* \param [in] sum_params paraments \ref vx_nn_sum_params_t . +* \param [in] size_of_sum_param [static] The size of the vx_nn_mean_params_t. +* \param [out] output A n-D tensor of the same type as input. +* \return vx_node. +* \returns A node reference \ref vx_node. Any possible errors preventing a +* successful creation should be checked using \ref vxGetStatus. +* \ingroup group_tensor +* \version 0.5 +*/ +VX_API_ENTRY vx_node VX_API_CALL vxReduceSumNode( + vx_graph graph, + vx_tensor inputs, + const vx_nn_sum_params_t *sum_params, + vx_size size_of_sum_param, + vx_tensor outputs); + /*! \brief [Graph] Creates squeeze layer node. * \details * Remove dimensions of size 1 from the input tensor. @@ -2287,6 +2351,282 @@ VX_API_ENTRY vx_node VX_API_CALL vxConv3dLayer(vx_graph graph, vx_tensor inputs, */ VX_API_ENTRY vx_node VX_API_CALL vxDeconv3dLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_deconvolution_3d_params_t *convolution_params, vx_size size_of_deconv_params, vx_tensor outputs); +/*! \brief [Graph] Creates a layer Normalization Node. + * \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1. + * \param [in] graph The handle to the graph. + * \param [in] eps [static] Float 32. 
Small value to add to the variance estimate so that we don't divide by zero.(default is 1e-5) + * \param [in] axis [static] The axis on which we need do normalize. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxLayerNormalizationLayer( + vx_graph graph, + vx_float32 eps, + vx_int32 axis, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer instance normalization Node. + * \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1. + * \param [in] graph The handle to the graph. + * \param [in] eps [static] Float 32. Small value to add to the variance estimate so that we don't divide by zero.(default is 1e-5) + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxInstanceNormalizationLayer( + vx_graph graph, + vx_float32 eps, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer instance normalization Node. + * \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1. + * \param [in] graph The handle to the graph. + * \param [in] eps [static] Float 32. Small value to add to the variance estimate so that we don't divide by zero.(default is 1e-5) + * \param [in] group_num [static] Int 32. Number of groups for GN + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxGroupNormalizationLayer( + vx_graph graph, + vx_float32 eps, + vx_int32 group_num, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer logical ops Node. + * \details Return the truth value of x AND, XOR,OR y element-wise. + * \param [in] graph The handle to the graph. + * \param [in] ops_type [static] Int 32. Operation Type + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. 
+ * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxLogicalOpsLayer( + vx_graph graph, + vx_int32 ops_type, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer logical not Node. + * \details Return the truth value of not x element-wise. + * \param [in] graph The handle to the graph. + * \param [in] input [static] The input tensor data. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxLogicalNotLayer( + vx_graph graph, + vx_tensor input, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer relational Node. + * \param [in] graph The handle to the graph. + * \param [in] ops_type [static] Int 32. Operation Type + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxRelationalLayer( + vx_graph graph, + vx_int32 ops_type, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Computes the max of elements across dimensions of input tensor. +* \param [in] graph The handle to the graph. +* \param [in] in input tensor data, +* \param [in] axis [static] used to determine max across which dimension(dimension 0 means width, etc). If not given, compute the sum across all dimensions. +* \param [in] keep_dim [static] means if keep the dimesion count. +* \param [out] out output tensor data. +* \ingroup group_tensor +* \return vx_node. +* \retval 0 Node could not be created. +* \retval * Node handle. +* \version 0.3 +*/ +VX_API_ENTRY vx_node VX_API_CALL vxTensorReduceMaxNode( + vx_graph graph, + vx_tensor inputs, + vx_tensor axis, + vx_bool keep_dims, + vx_tensor outputs); + +/*! \brief [Graph] Creates a layer minumum Node. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxMinimumLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer maximum Node. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxMaximumLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer select Node. + * \param [in] graph The handle to the graph. 
+ * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorSelectLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer gru cell activation z h Node. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [in] recurrent_activation [static] recurrent activation type. + * \param [in] activation [static] activation type. + * \param [out] output_list [static] The output tensor data. + * \param [out] output_count [static] The output tensor number. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxGruCellActivationZHLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_int32 recurrent_activation, + vx_int32 activation, + vx_tensor* output_list, + vx_uint32 output_count + ); + +/*! \brief [Graph] Creates a layer gru cell h times activation r Node. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [in] recurrent_activation [static] recurrent activation type. + * \param [out] output_list [static] The output tensor data. + * \param [out] output_count [static] The output tensor number. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxGruCellHTimeActivationRLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_int32 recurrent_activation, + vx_tensor* output_list, + vx_uint32 output_count + ); + +/*! \brief [Graph] Creates a layer gru cell reset after activationNode. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [in] recurrent_activation [static] recurrent activation type. + * \param [in] activation [static] activation type. + * \param [out] output_list [static] The output tensor data. + * \param [out] output_count [static] The output tensor number. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxGruCellResetAfterActivationLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_int32 recurrent_activation, + vx_int32 activation, + vx_tensor* output_list, + vx_uint32 output_count + ); + +/*! \brief [Graph] Creates a layer lstm activation Node. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [in] lstm_activation_param \ref vx_nn_lstm_activation_params_t . 
+ * \param [out] output_list [static] The output tensor data. + * \param [out] output_count [static] The output tensor number. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxLSTMActivationLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + const vx_nn_lstm_activation_params_t * lstm_activation_param, + vx_tensor* output_list, + vx_uint32 output_count + ); #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h index 6570e1d81..e824d55a7 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h @@ -242,6 +242,48 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext7_t vx_bool isSub; } vx_nn_convolution_relu_pooling_params_ext7_t, * vx_nn_convolution_relu_pooling_params_ext7; +typedef struct _vx_nn_fused_sp_params_t +{ + vx_enum multi_sp_kernel_type; + /*!*/ + vx_scalar mul_scale; + /*!*/ + union + { + struct + { + vx_scalar linear_a, linear_b; + } linear; + struct + { + vx_scalar tanh_a, tanh_b; + float a_v, b_v; + } tanh_linear; + struct + { + vx_scalar hsigmoid_a, hsigmoid_b; + } hsigmoid; + struct + { + vx_scalar clip_a, clip_b; + } clip; + struct + { + vx_scalar scalar_a, scalar_b, scalar_c, scalar_d; + } params; + } scalar_params; + /*!*/ +} vx_nn_fused_sp_params_t, * vx_nn_fused_sp_params; + +typedef struct _vx_nn_convolution_relu_pooling_params_sp_ext_t +{ + vx_nn_convolution_relu_pooling_params_ext4_t ext4; /*!< \brief convolution relu pooling params \ref vx_nn_convolution_relu_pooling_params_ext_t */ + vx_object_array inputs_list; + vx_object_array outputs_list; + vx_nn_fused_sp_params_t sp_param; + +} vx_nn_convolution_relu_pooling_params_sp_ext_t, * vx_nn_convolution_relu_pooling_params_sp_ext; + /*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and Pooling Layer Node, this fucntion match kronos NN Extension 1.2 verion. * \details This function implement Convolutional Network Convolution and Activation(Relu) and Pooling layer. * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined, @@ -1129,6 +1171,48 @@ VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmReluPoolingLayer(vx_graph graph, const vx_nn_gemm_relu_pooling_params merge_param, vx_tensor output); +/*! \brief Create a fuse stream process node. + * \param [in] graph The handle to the graph. + * \param [in] input_list input tensor list. + * \param [in] input_count input tensor number. + * \param [in] output_list output tensor list. + * \param [in] output_count output tensor number. + * \param [in] params the parameters for multi streamprocessor merging. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation + * should be checked using \ref vxGetStatus + * \ingroup group_vision_function_sp + */ +VX_API_ENTRY vx_node VX_API_CALL vxFusedSpNode( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor* output_list, + vx_uint32 output_count, + const vx_nn_fused_sp_params_t * params + ); + +/*! \brief Create a conv fuse stream process node. + * \param [in] graph The handle to the graph. 
+ * \param [in] inputs input tensor. + * \param [in] weights_biases [static] Point to WeightBiasesParameter data, vx_weights_biases_parameter is an opaque reference. + * \param [in] convolution_relu_pooling_params [static] Pointer to parameters of type \ref vx_nn_convolution_relu_pooling_params_t + * \param [in] size_of_convolution_relu_pooling_params [static] Size in bytes of convolution_relu_pooling_params. + * \param [in] outputs output tensor. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation + * should be checked using \ref vxGetStatus + * \ingroup group_vision_function_sp + */ +VX_API_ENTRY vx_node VX_API_CALL vxConvSpNode( + vx_graph graph, + vx_tensor inputs, + vx_weights_biases_parameter weights_biases, + const vx_nn_convolution_relu_pooling_params_t * convolution_relu_pooling_params, + vx_size size_of_convolution_relu_pooling_params, + vx_tensor outputs +); + #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h index 36df37487..38d2223a4 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h @@ -345,16 +345,6 @@ VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINST( vx_context context ); -/*! \brief Creates an internal reference to a spinst data. - * \param [in] context The reference to the implementation context. - * \return A spinst data reference. - * \Any possible errors preventing a successful creation should be checked using \ref vxGetStatus. - * \ingroup group_object_spinst - */ -VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINSTInternal( - vx_context context - ); - /*! \brief Releases a reference to a external spinst object. * The object may not be garbage collected until its total reference count is zero. * \param [in] spinst_obj The pointer to the spinst data to release. @@ -368,19 +358,6 @@ VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINST( vx_spinst *spinst_obj ); -/*! \brief Releases a reference to a internal spinst object. - * The object may not be garbage collected until its total reference count is zero. - * \param [in] spinst_obj The pointer to the spinst data to release. - * \post After returning from this function the reference is zeroed. - * \return A \ref vx_status_e enumeration. - * \retval VX_SUCCESS No errors; all other values indicate failure - * \retval * An error occurred. See \ref vx_status_e. - * \ingroup group_object_spinst - */ -VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINSTInternal( - vx_spinst *spinst_obj - ); - /*! \brief Add a instruction to spinst object. * \param [in] spinst_obj The reference to the spinst object. * \param [in] inst_unit_array The units of one instruction. Use a \ref vx_spinst_unit_param. diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h index 6f75ea9db..eefa39ce5 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h @@ -477,6 +477,8 @@ enum vx_type_e { VX_TYPE_SPINST = 0x81B,/*!< \brief A \ref vx_spinst. */ VX_TYPE_INT4 = 0x81C,/*!< \brief A \ref signed 4bits tensor.. */ VX_TYPE_UINT4 = 0x81D,/*!< \brief A \ref unsigned 4bits tensor.. */ + VX_TYPE_FLOAT8_E4M3 = 0x81E,/*!< \brief A \ref vx_float8_e4m3. */ + VX_TYPE_FLOAT8_E5M2 = 0x81F,/*!< \brief A \ref vx_float8_e5m2. */ }; /*! \brief The enumeration of all status codes. 
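Both FP8 additions, VX_TYPE_FLOAT8_E4M3 and VX_TYPE_FLOAT8_E5M2, are byte-sized element types (the ovxlib dtype helpers later in this patch report 1 byte / 8 bits for them). The sketch below shows how host code might account for the new enums when sizing buffers; element_size_bytes is a hypothetical helper, not part of the SDK headers.

```c
#include <stddef.h>
#include <VX/vx_types.h>

/* Hypothetical helper: host-side element size for a vx_type_e value, with the
 * new FP8 formats (1 sign + 4/5 exponent + 3/2 mantissa bits) handled as one byte. */
static size_t element_size_bytes(vx_enum type)
{
    switch (type)
    {
        case VX_TYPE_FLOAT8_E4M3:
        case VX_TYPE_FLOAT8_E5M2:
        case VX_TYPE_INT8:
        case VX_TYPE_UINT8:
            return 1;
        case VX_TYPE_INT16:
        case VX_TYPE_UINT16:
            return 2;
        case VX_TYPE_FLOAT32:
        case VX_TYPE_INT32:
        case VX_TYPE_UINT32:
            return 4;
        default:
            return 0; /* types not covered by this sketch */
    }
}
```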
@@ -803,6 +805,8 @@ enum vx_convert_policy_e { VX_CONVERT_POLICY_WRAP = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x0, /*! \brief Results are saturated to the bit depth of the output operand. */ VX_CONVERT_POLICY_SATURATE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x1, + /*! \brief Results preserve infinity and nan value. */ + VX_CONVERT_POLICY_INF = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_CONVERT_POLICY) + 0x0, }; /*! \brief Based on the VX_DF_IMAGE definition. diff --git a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so index 0e2036813..40b91d016 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so and b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libCLC.so b/prebuilt-sdk/x86_64_linux/lib/libCLC.so index 9c8839038..a50839e36 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libCLC.so and b/prebuilt-sdk/x86_64_linux/lib/libCLC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so index 96a5ab43d..201f51c15 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so and b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libGAL.so b/prebuilt-sdk/x86_64_linux/lib/libGAL.so index 06525dac1..fa303327d 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libGAL.so and b/prebuilt-sdk/x86_64_linux/lib/libGAL.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so index 1566bab34..fee4a57db 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so and b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 index 71f33843a..b8a0d961d 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so index 9b7e0caf8..cfa02ae3a 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libVSC.so b/prebuilt-sdk/x86_64_linux/lib/libVSC.so index 1bafe16b3..e482f3097 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libVSC.so and b/prebuilt-sdk/x86_64_linux/lib/libVSC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so b/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so index 628f663a4..0deaff134 100644 Binary files a/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so and b/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so differ diff --git a/src/tim/vx/internal/include/custom/custom_node_type.def b/src/tim/vx/internal/include/custom/custom_node_type.def index 90d772799..c5ef3e04a 100644 --- a/src/tim/vx/internal/include/custom/custom_node_type.def +++ b/src/tim/vx/internal/include/custom/custom_node_type.def @@ -6,3 +6,6 @@ DEF_NODE_TYPE(custom_ainr_denoise_postprocess) DEF_NODE_TYPE(custom_warp_affine) DEF_NODE_TYPE(custom_warp_perspective) DEF_NODE_TYPE(custom_sample) +DEF_NODE_TYPE(custom_tiny_yolov4_postprocess) +DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence) +DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box) diff --git a/src/tim/vx/internal/include/custom/custom_ops.def b/src/tim/vx/internal/include/custom/custom_ops.def index 00504392c..2074b8f30 100644 --- 
a/src/tim/vx/internal/include/custom/custom_ops.def +++ b/src/tim/vx/internal/include/custom/custom_ops.def @@ -6,3 +6,6 @@ DEF_OP(CUSTOM_AINR_DENOISE_POSTPROCESS) DEF_OP(CUSTOM_WARP_AFFINE) DEF_OP(CUSTOM_WARP_PERSPECTIVE) DEF_OP(CUSTOM_SAMPLE) +DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS) +DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE) +DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX) diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h new file mode 100644 index 000000000..5234d56d6 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_H +#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_param +{ + struct _custom_tiny_yolov4_postprocess_local_data_t* local; + // Add parameters here +} vsi_nn_custom_tiny_yolov4_postprocess_param; +_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_param, local) == 0, \ + vsi_nn_custom_tiny_yolov4_postprocess_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h new file mode 100644 index 000000000..854c3a9e1 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX_H +#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_box_param +{ + struct _custom_tiny_yolov4_postprocess_box_local_data_t* local; + // Add parameters here + float bias_0; + float bias_1; +} vsi_nn_custom_tiny_yolov4_postprocess_box_param; +_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_box_param, local) == 0, \ + vsi_nn_custom_tiny_yolov4_postprocess_box_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h new file mode 100644 index 000000000..181595289 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_H +#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_confidence_param +{ + struct _custom_tiny_yolov4_postprocess_confidence_local_data_t* local; + // Add parameters here +} vsi_nn_custom_tiny_yolov4_postprocess_confidence_param; +_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_confidence_param, local) == 0, \ + vsi_nn_custom_tiny_yolov4_postprocess_confidence_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h index 815a064fc..adf769f7f 100644 --- a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h @@ -38,6 +38,7 @@ typedef struct _vsi_nn_custom_warp_affine_param const float *matrix; vsi_enum type; int32_t size[2]; + vsi_enum rgb_type; } vsi_nn_custom_warp_affine_param; _compiler_assert(offsetof(vsi_nn_custom_warp_affine_param, local) == 0, \ vsi_nn_custom_warp_affine_h ); diff --git a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h index 8976be307..eb23a2055 100644 --- a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h +++ b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h @@ -31,5 +31,8 @@ #include "custom/ops/vsi_nn_op_custom_warp_affine.h" #include "custom/ops/vsi_nn_op_custom_warp_perspective.h" #include "custom/ops/vsi_nn_op_custom_sample.h" +#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h" +#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h" +#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h" #endif diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 82d843fc5..0753df06d 100755 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -193,3 +193,4 @@ DEF_OP(REVERSESEQUENCE) DEF_OP(INVERSE_SIGMOID) DEF_OP(GRID_SAMPLE) DEF_OP(LPNORM) +DEF_OP(RESIZE_3D) diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def old mode 100755 new mode 100644 index de3332709..a47559a3a --- a/src/tim/vx/internal/include/internal/internal_ops.def +++ b/src/tim/vx/internal/include/internal/internal_ops.def @@ -20,4 +20,3 @@ DEF_OP(SPACE2DEPTH_INTERNAL) DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R) DEF_OP(GRUCELL_ACTIVATION_Z_H) DEF_OP(REDUCE_MEAN_INTERNAL) -DEF_OP(BILINEAR_GRID_SAMPLE) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index c118e137f..5150b0e4a 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -79,6 +79,8 @@ typedef enum BOOL8, I4, U4, + FP8_E4M3, + FP8_E5M2, } VSI_PUBLIC_TYPE vsi_nn_kernel_dtype_e; typedef enum @@ -89,6 +91,8 @@ typedef enum VSI_NN_KERNEL_QUANT_ASYMM_PERCHANNEL, VSI_NN_KERNEL_QUANT_SYMM, VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL, + VSI_NN_KERNEL_QUANT_FLOAT8, + VSI_NN_KERNEL_QUANT_FLOAT8_PERCHANNEL, VSI_NN_KERNEL_QUANT_TYPE_NUM } vsi_nn_kernel_quant_type_e; @@ -522,6 +526,10 
@@ static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype return BF16; case VSI_NN_TYPE_FLOAT32: return F32; + case VSI_NN_TYPE_FLOAT8_E4M3: + return FP8_E4M3; + case VSI_NN_TYPE_FLOAT8_E5M2: + return FP8_E5M2; default: VSILOGE("error data type %d", dtype); break; @@ -579,6 +587,8 @@ static VSI_INLINE_API size_t vsi_nn_kernel_dtype_get_bytes case I8: case U8: case BOOL8: + case FP8_E4M3: + case FP8_E5M2: return sizeof(int8_t); case I16: case U16: @@ -611,6 +621,8 @@ static VSI_INLINE_API vsi_size_t vsi_nn_kernel_dtype_get_bits case I8: case U8: case BOOL8: + case FP8_E4M3: + case FP8_E5M2: return 8; case I16: case U16: @@ -879,7 +891,7 @@ static VSI_INLINE_API void vsi_nn_kernel_tensor_attr_get_stride shape = attr->shape->data; type_bits = vsi_nn_kernel_dtype_get_bits( attr->dtype ); - if ( type_bits < BITS_PER_BYTE ) + if ( type_bits < BITS_PER_BYTE && type_bits != 0) { vsi_size_t i; diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h index cfecfd1fe..c834d040e 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h @@ -91,4 +91,21 @@ vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, vsi_size_t max_size ); +vsi_bool vsi_nn_kernel_optimize_matrixmul_broadcast_shape + ( + const vsi_size_t * shape_x, + const vsi_size_t * shape_y, + const vsi_size_t * shape_output, + vsi_size_t rank_x, + vsi_size_t rank_y, + vsi_size_t rank_out, + vsi_size_t* out_shape_x, + vsi_size_t* out_shape_y, + vsi_size_t* out_shape_output, + uint32_t* new_rank, + uint32_t* cross_flg, + uint32_t* size_axis_inner_outer, + uint32_t* strides_axis_inner_outer + ); + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h index 3f614139a..749a432e7 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h @@ -82,6 +82,12 @@ typedef struct _vsi_nn_pre_process_param vsi_nn_pre_process_type_e type; + struct + { + float mean[3]; + float scale[3]; + } norm2; + vsi_nn_pre_process_lcl_data *local; } vsi_nn_pre_process_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h index d01fba846..d2772b5c1 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h @@ -65,6 +65,10 @@ typedef struct _vsi_nn_pre_process_bgra_param vsi_bool reverse_channel; + float r_scale; + float g_scale; + float b_scale; + /* pre process rgb layer local data structure */ vsi_nn_pre_process_bgra_lcl_data local; } vsi_nn_pre_process_bgra_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h index aa8fc820f..34c5a6de6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h @@ -70,6 +70,10 @@ typedef struct _vsi_nn_pre_process_nv12_param vsi_nn_pre_process_nv12_lcl_data* local; vsi_nn_nv_type nv_type; + + float r_scale; + float g_scale; + float b_scale; } vsi_nn_pre_process_nv12_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h 
b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h index da52fa0d2..9e05a5966 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h @@ -76,6 +76,9 @@ typedef struct _vsi_nn_pre_process_rgb_param vsi_bool reverse_channel; + float r_scale; + float g_scale; + float b_scale; /* pre process rgb layer local data structure */ vsi_nn_pre_process_rgb_lcl_data local; } vsi_nn_pre_process_rgb_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h index f384e4fb3..171df70c3 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h @@ -53,6 +53,15 @@ typedef struct _vsi_nn_pre_process_rgb888_planar_param float g_mean; float b_mean; float scale; + + + vsi_bool reverse_channel; + vsi_bool enable_rgb88_planar_nhwc; + + float r_scale; + float g_scale; + float b_scale; + } vsi_nn_pre_process_rgb888_planar_param; _compiler_assert(offsetof(vsi_nn_pre_process_rgb888_planar_param, local) == 0, \ vsi_nn_pre_process_rgb888_planar_h ); diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h index 998de5ee2..2ceabcb75 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h @@ -66,6 +66,11 @@ typedef struct _vsi_nn_pre_process_yuv420_param float rgb_scale; vsi_bool reverse_channel; + + float r_scale; + float g_scale; + float b_scale; + /* local data must be the first. */ vsi_nn_pre_process_yuv420_lcl_data local; } vsi_nn_pre_process_yuv420_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h index b516e6016..1ca45170c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h @@ -71,6 +71,10 @@ typedef struct _vsi_nn_pre_process_yuv422_param float rgb_scale; vsi_bool reverse_channel; + + float r_scale; + float g_scale; + float b_scale; } vsi_nn_pre_process_yuv422_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h index c4391773e..7b2658968 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h @@ -66,6 +66,10 @@ typedef struct _vsi_nn_pre_process_yuv444_param float rgb_scale; vsi_bool reverse_channel; + + float r_scale; + float g_scale; + float b_scale; /* local data must be the first. 
*/ vsi_nn_pre_process_yuv444_lcl_data* local; } vsi_nn_pre_process_yuv444_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_3d.h similarity index 76% rename from src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h rename to src/tim/vx/internal/include/ops/vsi_nn_op_resize_3d.h index d04c589a9..0771a71f0 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_3d.h @@ -22,8 +22,8 @@ * *****************************************************************************/ -#ifndef _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H -#define _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H +#ifndef _VSI_NN_OP_RESIZE_3D_H +#define _VSI_NN_OP_RESIZE_3D_H #include "vsi_nn_types.h" @@ -31,17 +31,19 @@ extern "C" { #endif +typedef struct _vsi_nn_resize_3d_local_data { + vsi_bool use_internal_node; +} vsi_nn_resize_3d_local_data; -typedef struct _vsi_nn_bilinear_grid_sample_param +typedef struct _vsi_nn_resize_3d_param { - struct _bilinear_grid_sample_local_data_t* local; - vsi_bool align_corners; - vsi_nn_pad_mode_e padding_mode; - int32_t const_val; -} vsi_nn_bilinear_grid_sample_param; - -_compiler_assert(offsetof(vsi_nn_bilinear_grid_sample_param, local) == 0, \ - vsi_nn_bilinear_grid_sample_h ); + vsi_nn_resize_3d_local_data* lcl_data; + vsi_enum type; + float factor; + int32_t size[3]; + vsi_bool align_corners; + vsi_bool half_pixel_centers; +} vsi_nn_resize_3d_param; #ifdef __cplusplus } diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h index 7ab6ff2dd..bccc0b5e5 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h @@ -33,6 +33,7 @@ extern "C" { typedef struct _vsi_nn_topk_param { uint32_t k; + int32_t axis; } vsi_nn_topk_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h index d7e598395..6446cd046 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h @@ -52,7 +52,9 @@ enum { D_BF16 = VSI_NN_TYPE_BFLOAT16, D_BOOL8 = VSI_NN_TYPE_BOOL8, D_I4 = VSI_NN_TYPE_INT4, - D_U4 = VSI_NN_TYPE_UINT4 + D_U4 = VSI_NN_TYPE_UINT4, + D_F8_E4M3 = VSI_NN_TYPE_FLOAT8_E4M3, + D_F8_E5M2 = VSI_NN_TYPE_FLOAT8_E5M2 }; /* short alias for qtype */ @@ -63,6 +65,8 @@ enum { Q_ASYM = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC << Q_SHIFT, Q_SYM_PC = VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC << Q_SHIFT, Q_SYM = VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC << Q_SHIFT, + Q_FP8 = VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 << Q_SHIFT, + Q_FP8_PC = VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 << Q_SHIFT, }; typedef struct { diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index ab63a3c70..367ff88fb 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -27,6 +27,7 @@ #include "vsi_nn_types.h" #include "vsi_nn_math.h" #include "vsi_nn_tensor.h" +#include "vsi_nn_log.h" #ifdef __cplusplus extern "C" { @@ -78,6 +79,8 @@ static VSI_INLINE_API vsi_bool type_is_signed case VSI_NN_TYPE_FLOAT32: case VSI_NN_TYPE_FLOAT64: case VSI_NN_TYPE_BFLOAT16: + case VSI_NN_TYPE_FLOAT8_E4M3: + case VSI_NN_TYPE_FLOAT8_E5M2: ret = TRUE; break; default: @@ -93,9 +96,14 @@ 
static VSI_INLINE_API uint32_t type_get_bytes { switch( type ) { + case VSI_NN_TYPE_INT4: + case VSI_NN_TYPE_UINT4: + return 0; case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_UINT8: case VSI_NN_TYPE_BOOL8: + case VSI_NN_TYPE_FLOAT8_E4M3: + case VSI_NN_TYPE_FLOAT8_E5M2: return 1; case VSI_NN_TYPE_INT16: case VSI_NN_TYPE_UINT16: @@ -111,7 +119,8 @@ static VSI_INLINE_API uint32_t type_get_bytes case VSI_NN_TYPE_FLOAT64: return 8; default: - return 0; + VSILOGE("unsupported type: %d", type); + return 1; } } /* type_get_bytes() */ @@ -128,6 +137,8 @@ static VSI_INLINE_API uint32_t type_get_bits case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_UINT8: case VSI_NN_TYPE_BOOL8: + case VSI_NN_TYPE_FLOAT8_E4M3: + case VSI_NN_TYPE_FLOAT8_E5M2: return 8; case VSI_NN_TYPE_INT16: case VSI_NN_TYPE_UINT16: @@ -143,7 +154,8 @@ static VSI_INLINE_API uint32_t type_get_bits case VSI_NN_TYPE_FLOAT64: return 64; default: - return 0; + VSILOGE("unsupported type: %d", type); + return 1; } } /* type_get_bits() */ @@ -236,6 +248,7 @@ static VSI_INLINE_API float affine_to_fp32 ) { float data; + VSI_UNREFERENCED(type); data = ( (float)val - zero_point ) * scale; return data; } /* affine_to_fp32() */ @@ -279,6 +292,7 @@ static VSI_INLINE_API float dfp_to_fp32 ) { float result; + VSI_UNREFERENCED(type); if( fl > 0 ) { result = (float)val * ( 1.0f / ( (float) ( (int64_t)1 << fl ) ) ); @@ -440,6 +454,139 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne return out; } /* fp32_to_bfp16_rtne */ +#define FLOAT_BIAS_EXPONENT 127 +#define FLOAT_EXPONENT_SIZE 8 +#define FLOAT_MANTISSA_SIZE 23 +#define FLOAT8_E4M3_BIAS_EXPONENT 7 +#define FLOAT8_E4M3_EXPONENT_SIZE 4 +#define FLOAT8_E4M3_MANTISSA_SIZE 3 +#define FLOAT8_E5M2_BIAS_EXPONENT 15 +#define FLOAT8_E5M2_EXPONENT_SIZE 5 +#define FLOAT8_E5M2_MANTISSA_SIZE 2 + +static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) { + float fp8_f32 = in / scale; + int32_t fp8_i32 = *((int32_t*)&fp8_f32); + //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 3) - 1)); + int32_t eps = 1 << (23 - 3 - 1); + fp8_i32 += eps; + //fp8_i32 &= mask; + { + int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; + int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff; + int expShiftValue = FLOAT8_E4M3_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT; + int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7; + + exp = (exp + expShiftValue) & 0xF; + + return (uint8_t)(sign << 7 | exp << 3 | mantissa); + } +} /* fp32_to_fp8_e4m3() */ + +static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) { + float fp8_f32 = in / scale; + int32_t fp8_i32 = *((int32_t*)&fp8_f32); + //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 2) - 1)); + int32_t eps = 1 << (23 - 2 - 1); + fp8_i32 += eps; + //fp8_i32 &= mask; + { + int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; + int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff; + int expShiftValue = FLOAT8_E5M2_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT; + int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x3; + + exp = (exp + expShiftValue) & 0x1F; + + return (uint8_t)(sign << 7 | exp << 2 | mantissa); + } +} /* fp32_to_fp8_e5m2() */ + +static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) { + float val_fp32; + + uint32_t signOut = 0; + uint32_t exponentOut = 0; + uint32_t mantissaOut = 0; + uint32_t out_u = 0; + + uint32_t signIn; + uint32_t exponentIn; + uint32_t mantissaIn; + int expShiftValue = FLOAT_BIAS_EXPONENT - 
FLOAT8_E4M3_BIAS_EXPONENT; + + signIn = (in >> (FLOAT8_E4M3_EXPONENT_SIZE + FLOAT8_E4M3_MANTISSA_SIZE)) & 0x1; + exponentIn = (in >> FLOAT8_E4M3_MANTISSA_SIZE) & 0xF; + mantissaIn = in & 0x7; + + signOut = signIn; + + if (exponentIn == 0 && mantissaIn == 0) + { + goto final; + } + + if (exponentIn == 0xf && mantissaIn == 0x7) + { + exponentOut = 0xff; + mantissaOut = 0x400000; + goto final; + } + + exponentOut = (exponentIn + expShiftValue) & 0xff; + mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff; + + +final: + out_u = signOut << 31 | exponentOut << 23 | mantissaOut; + val_fp32 = *((float*)&out_u); + + return val_fp32 * scale; +} /* fp8_e4m3_to_fp32() */ + +static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) { + float val_fp32; + + uint32_t signOut = 0; + uint32_t exponentOut = 0; + uint32_t mantissaOut = 0; + uint32_t out_u = 0; + + uint32_t signIn; + uint32_t exponentIn; + uint32_t mantissaIn; + int expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E5M2_BIAS_EXPONENT; + + signIn = (in >> 7) & 0x1; + exponentIn = (in >> 2) & 0x1F; + mantissaIn = in & 0x3; + + signOut = signIn; + + if (exponentIn == 0 && mantissaIn == 0) + { + goto final; + } + + if (exponentIn == 0x1f && mantissaIn == 0x3) + { + exponentOut = 0xff; + mantissaOut = 0x400000; + goto final; + } + + + exponentOut = (exponentIn + expShiftValue) & 0xff; + mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff; + + + final: + out_u = signOut << 31 | exponentOut << 23 | mantissaOut; + val_fp32 = *((float*)&out_u); + + return val_fp32 * scale; +} /* fp8_e5m2_to_fp32() */ + static VSI_INLINE_API vsi_status dtype_to_float32 ( uint8_t *src, @@ -458,6 +605,12 @@ static VSI_INLINE_API vsi_status dtype_to_float32 case VSI_NN_TYPE_BFLOAT16: *dst = bfp16_to_fp32( *(int16_t *)src ); break; + case VSI_NN_TYPE_FLOAT8_E4M3: + *dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale); + break; + case VSI_NN_TYPE_FLOAT8_E5M2: + *dst = fp8_e5m2_to_fp32(*(int8_t *)src, src_dtype->scale); + break; case VSI_NN_TYPE_INT4: case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_INT8: @@ -511,6 +664,12 @@ static VSI_INLINE_API vsi_status float32_to_dtype case VSI_NN_TYPE_BFLOAT16: *(int16_t *)dst = fp32_to_bfp16_rtne( src ); break; + case VSI_NN_TYPE_FLOAT8_E4M3: + *(int8_t *)dst = fp32_to_fp8_e4m3(src, dst_dtype->scale); + break; + case VSI_NN_TYPE_FLOAT8_E5M2: + *(int8_t *)dst = fp32_to_fp8_e5m2(src, dst_dtype->scale); + break; case VSI_NN_TYPE_INT4: case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_INT8: diff --git a/src/tim/vx/internal/include/utils/vsi_nn_link_list.h b/src/tim/vx/internal/include/utils/vsi_nn_link_list.h index 7e6afb2ea..2c800a152 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_link_list.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_link_list.h @@ -30,7 +30,7 @@ extern "C"{ #endif -#define vsi_nn_LinkListInitRoot(n) do{n = NULL;} while (0); +#define vsi_nn_LinkListInitRoot(n) {n = NULL;} typedef struct _vsi_nn_link_list { diff --git a/src/tim/vx/internal/include/utils/vsi_nn_math.h b/src/tim/vx/internal/include/utils/vsi_nn_math.h index b8a6d2a9a..924ddf004 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_math.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_math.h @@ -53,12 +53,13 @@ extern "C" { #define DEFINE_ARRAY_TYPE( NAME, TYPE ) \ typedef struct { \ size_t size; \ - TYPE data[0]; \ + TYPE *data; \ } vsi_##NAME##_array_t; \ static VSI_INLINE_API vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \ - vsi_##NAME##_array_t * 
array = (vsi_##NAME##_array_t *)malloc( \ - sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \ + vsi_##NAME##_array_t * array = NULL; \ + array = (vsi_##NAME##_array_t *)malloc( sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \ if (array == NULL) return NULL; \ + array->data = (TYPE *)(((TYPE**)(&(array->data))) + 1); \ array->size = size; \ return array; \ } \ diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index f939592b0..128e7d0c5 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -50,14 +50,23 @@ extern "C" { free( _PTR ); _PTR = NULL; } #define vsi_safe_release_tensor(_t) if(_t){vsi_nn_ReleaseTensor(&(_t)); _t = NULL;} - -#define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe) +#if (defined(_WIN32) || defined(__WIN32__) || defined(WIN32)) + #if defined(_WIN64) + #define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe) + #else + #define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffe) + #endif +#else + #define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe) +#endif #define FOREACH_ARGS(_args, _next, _arg_type) \ while(((_arg_type)((size_t)END_OF_VARIADIC_ARGUMENTS)) != (_next = va_arg(_args, _arg_type))) #define BITS_PER_BYTE 8 +#define VSI_UNREFERENCED( param ) ( ( void ) ( param ) ) + #define VSI_NN_STRINGIZE(X) VSI_NN_DO_STRINGIZE(X) #define VSI_NN_DO_STRINGIZE(X) #X diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 75e5ab7e1..777cf5c04 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -78,6 +78,7 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_asymi8_to_u8; int32_t enable_dataconvert_optimize; int32_t enable_stream_processor; + int32_t enable_rgb88_planar_nhwc; } vsi_nn_runtime_option_t; /** diff --git a/src/tim/vx/internal/include/vsi_nn_error.h b/src/tim/vx/internal/include/vsi_nn_error.h index 7b55aa507..bc9eca8b6 100644 --- a/src/tim/vx/internal/include/vsi_nn_error.h +++ b/src/tim/vx/internal/include/vsi_nn_error.h @@ -31,33 +31,42 @@ #define VSI_ASSERT( cond ) assert(cond) #define VSI_CHECK_PTR( pointer, msg, retval ) \ - do { \ + { \ if( pointer == NULL ) { \ VSILOGD("%s",msg); \ VSI_ASSERT(FALSE); \ } \ - } while(0) + } -#define CHECK_STATUS_FAIL_GOTO( stat, lbl ) do {\ +#define CHECK_STATUS_FAIL_GOTO( stat, lbl ) {\ if( VSI_SUCCESS != stat ) {\ VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\ goto lbl;\ }\ -} while(0) +} -#define CHECK_STATUS( stat ) do {\ +#define CHECK_STATUS( stat ) {\ if( VSI_SUCCESS != stat ) {\ VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\ }\ -} while(0) +} #define CHECK_PTR_FAIL_GOTO( pointer, msg, lbl ) \ - do { \ + { \ if( pointer == NULL ) { \ VSILOGD("CHECK POINTER %s", msg); \ goto lbl; \ } \ - } while(0) + } + +#define CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( pointer, node, msg, lbl ) \ + { \ + if( pointer == NULL ) { \ + vsi_nn_internal_release_node(&node);\ + VSILOGD("CHECK POINTER %s", msg); \ + goto lbl; \ + } \ + } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index 01ec04c29..e93d1af19 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -1,26 +1,3 @@ -/**************************************************************************** -* -* Copyright (c) 2019 
Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the Software), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ /*****Auto generated header file, Please DO NOT modify manually!*****/ #ifndef _VSI_NN_FEATURE_CONFIG_H #define _VSI_NN_FEATURE_CONFIG_H @@ -42,5 +19,6 @@ #if defined(VX_TENSORVIEW_ON_ANY_DIM) && VX_TENSORVIEW_ON_ANY_DIM #define VSI_CONCAT_ENHANCE_SUPPORT #endif +#define VSI_CREATE_TENSOR_FROM_VIEW_SUPPORT #endif diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index 175687096..8504791f8 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -361,6 +361,27 @@ OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle uint8_t * data ); +/** + * Add a new tensor from a view + * Create a new tensor from a view of an existing tensor and add it to the graph. + * + * @param[in] graph Graph handle. + * @param[in] id Required, the id of the parent tensor on which to create the view. + * @param[in] start The start coordinates for each dim, 0-based non-negative integers. + * NULL means the view starts at index 0 of each dim. + * @param[in] end The end coordinates for each dim, 0-based non-negative integers. + * NULL means the view extends to the end of each dim. For a given idx, end[idx] + * should be greater than start[idx]. + * @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise. + */ +OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromView + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t id, + vsi_size_t* start, + vsi_size_t* end + ); + /** * Attach tensor to graph * Attach an exist tensor to graph.
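A minimal usage sketch of the vsi_nn_AddTensorFromView API declared above. The graph handle, the parent_id, and the 26x26x255 parent shape are hypothetical placeholders (the shape mirrors the tiny yolov4 feature maps handled later in this patch), and the end coordinates are assumed to be exclusive, following the requirement that end[idx] be greater than start[idx]:

/* Assumes a 3-D parent tensor of shape {26, 26, 255} (W, H, C) already added to the graph. */
vsi_size_t start[3] = { 0, 0, 4 };     /* begin the view at channel 4 */
vsi_size_t end[3]   = { 26, 26, 85 };  /* assumed exclusive upper bounds, i.e. channels 4..84 */
vsi_nn_tensor_id_t view_id = vsi_nn_AddTensorFromView( graph, parent_id, start, end );
if ( VSI_NN_TENSOR_ID_NA == view_id )
{
    VSILOGE( "Create tensor from view fail" );
}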
diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 37032f473..5cadddb3e 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -206,8 +206,8 @@ #include "ops/vsi_nn_op_maxunpool.h" #include "ops/vsi_nn_op_reversesequence.h" #include "ops/vsi_nn_op_grid_sample.h" -#include "ops/vsi_nn_op_bilinear_grid_sample.h" #include "ops/vsi_nn_op_lpnorm.h" +#include "ops/vsi_nn_op_resize_3d.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" #include "ops/vsi_nn_op_inverse_sigmoid.h" @@ -402,8 +402,8 @@ typedef union _vsi_nn_nn_param vsi_nn_reversesequence_param reversesequence; vsi_nn_inverse_sigmoid_param inverse_sigmoid; vsi_nn_grid_sample_param gridsample; - vsi_nn_bilinear_grid_sample_param bilinear_grid_sample; vsi_nn_lpnorm_param lpnorm; + vsi_nn_resize_3d_param resize_3d; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 227b17f3a..59292cd0d 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -48,6 +48,7 @@ typedef enum VSI_NN_PREPROCESS_IMAGE_RESIZE_BILINEAR, VSI_NN_PREPROCESS_IMAGE_RESIZE_NEAREST, VSI_NN_PREPROCESS_DTYPE_CONVERT, + VSI_NN_PREPROCESS_MEANS_AND_SCALES, } vsi_nn_preprocess_type_e; /** @@ -150,8 +151,25 @@ typedef struct float scale; }vsi_nn_process_mean_and_scale_t; +/** + * Process per-channel means and scales parameter structure + */ +typedef struct +{ + /** Mean value for each channel */ + float* channel_mean; + /** Channel length */ + int32_t channel_len; + /** Scale value */ + float* scale; + /** Scale length */ + int32_t scale_len; +}vsi_nn_process_means_and_scales_t; + typedef vsi_nn_process_mean_and_scale_t vsi_nn_preprocess_mean_and_scale_t; +typedef vsi_nn_process_means_and_scales_t vsi_nn_preprocess_means_and_scales_t; typedef vsi_nn_process_mean_and_scale_t vsi_nn_postprocess_mean_and_scale_t; +typedef vsi_nn_process_means_and_scales_t vsi_nn_postprocess_means_and_scales_t; /** * Process permute parameter structure diff --git a/src/tim/vx/internal/include/vsi_nn_rnn_helper.h b/src/tim/vx/internal/include/vsi_nn_rnn_helper.h index 4bef7b942..14f359338 100644 --- a/src/tim/vx/internal/include/vsi_nn_rnn_helper.h +++ b/src/tim/vx/internal/include/vsi_nn_rnn_helper.h @@ -154,7 +154,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major vsi_bool use_virtual_tensor ); -void vsi_nn_rnn_split_input_tensor +vsi_status vsi_nn_rnn_split_input_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t * input, @@ -163,7 +163,7 @@ void vsi_nn_rnn_split_input_tensor vsi_bool use_virtual_tensor ); -void vsi_nn_rnn_data_check_aligned +vsi_status vsi_nn_rnn_data_check_aligned ( vsi_nn_node_t * self, vsi_nn_tensor_t ** input, diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h index 5b7bdb940..d6ed09045 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor.h @@ -82,6 +82,10 @@ typedef enum VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC = 0x4, /** affine perchannel asymmetric */ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC = 0x5, + /** float8 */ + VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6, + /** perchannel float8 */ + VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7, /** undefined type */ VSI_NN_QNT_TYPE_NA = 0xff, } vsi_nn_qnt_type_e; diff --git
a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h index 4b997f319..14bb0d62b 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -734,13 +734,15 @@ vsi_status vsi_nn_copy_tensor_veiw_patch /** * OVXLIB internal tensor util api * A wrapper api for OpenVX vxCopyTensorPatch - * Allows the application to copy whole tensor patch from/into an tensor object. + * Allows the application to copy a partial or whole tensor patch from/into a tensor object. * * @param[in] tensor OpenVX Tensor handle. * @param[in] attr OVXLIB Tensor attr. * @param[in] user_ptr The address of the memory location where to store the requested data. * @param[in] usage This declares the effect of the copy with regard to the tensor object * support VX_READ_ONLY or VX_WRITE_ONLY + * @param[in] start The start coordinates for each dim. NULL means copy from index 0 of each dim. + * @param[in] end The end coordinates for each dim. NULL means copy to the end of each dim. * @return VSI_SUCCESS on success, or error core otherwise. */ vsi_status vsi_nn_copy_tensor_patch @@ -748,7 +750,9 @@ vsi_status vsi_nn_copy_tensor_patch vx_tensor tensor, vsi_nn_tensor_attr_t *attr, void * user_ptr, - vsi_enum usage + vsi_enum usage, + vsi_size_t* start, + vsi_size_t* end ); /** diff --git a/src/tim/vx/internal/include/vsi_nn_test.h b/src/tim/vx/internal/include/vsi_nn_test.h index 8f5df6e6a..59bafe198 100644 --- a/src/tim/vx/internal/include/vsi_nn_test.h +++ b/src/tim/vx/internal/include/vsi_nn_test.h @@ -31,26 +31,26 @@ extern "C"{ #endif -#define TEST_CHECK_TENSOR_ID( id, lbl ) do {\ +#define TEST_CHECK_TENSOR_ID( id, lbl ) {\ if( VSI_NN_TENSOR_ID_NA == id ) {\ VSILOGE("CHECK TENSOR ID %d", __LINE__);\ goto lbl;\ }\ - } while(0) + } -#define TEST_CHECK_PTR( ptr, lbl ) do {\ +#define TEST_CHECK_PTR( ptr, lbl ) {\ if( NULL == ptr ) {\ VSILOGE("CHECK PTR %d", __LINE__);\ goto lbl;\ }\ -} while(0) +} -#define TEST_CHECK_STATUS( stat, lbl ) do {\ +#define TEST_CHECK_STATUS( stat, lbl ) {\ if( VSI_SUCCESS != stat ) {\ VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\ goto lbl;\ }\ -} while(0) +} #if defined(__cplusplus) } diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h index 6238e4f2d..380057b94 100644 --- a/src/tim/vx/internal/include/vsi_nn_types.h +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -191,6 +191,16 @@ typedef enum VSI_NN_TYPE_BFLOAT16 = VX_TYPE_BFLOAT16, #else VSI_NN_TYPE_BFLOAT16 = 0x81A, +#endif +#ifdef VSI_NN_TYPE_FLOAT8_E4M3_SUPPORT + VSI_NN_TYPE_FLOAT8_E4M3 = VX_TYPE_FLOAT8_E4M3, +#else + VSI_NN_TYPE_FLOAT8_E4M3 = 0X81E, +#endif +#ifdef VSI_NN_TYPE_FLOAT8_E5M2_SUPPORT + VSI_NN_TYPE_FLOAT8_E5M2 = VX_TYPE_FLOAT8_E5M2, +#else + VSI_NN_TYPE_FLOAT8_E5M2 = 0X81F, #endif VSI_NN_TYPE_VDATA = VX_TYPE_USER_STRUCT_START + 0x1, @@ -268,6 +278,11 @@ typedef enum _vsi_nn_roi_align_type_e VSI_NN_ROI_ALIGN } vsi_nn_roi_align_type_e; +typedef enum _vsi_nn_custom_warp_affine_type_e { + VSI_NN_WARP_AFFINE_TYPE_NONE = 0, + VSI_NN_WARP_AFFINE_TYPE_RGB +} vsi_nn_custom_warp_affine_type_e; + /** Deprecated */ typedef uint32_t vsi_nn_size_t; diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 280f0cc4c..399e72e01 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define
VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 74 +#define VSI_NN_VERSION_PATCH 84 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess.c b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess.c new file mode 100644 index 000000000..6d6ceb98c --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess.c @@ -0,0 +1,578 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _custom_tiny_yolov4_postprocess_local_data_t { + vx_int32 begin_dims[6][VSI_NN_MAX_DIM_NUM]; + vx_int32 end_dims[6][VSI_NN_MAX_DIM_NUM]; + vx_int32 stride_dims[VSI_NN_MAX_DIM_NUM]; +} custom_tiny_yolov4_postprocess_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (4) +#define _OUTPUT_NUM (2) + +static vsi_nn_internal_tensor_t *_create_internal_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t * tensor = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + memcpy( &attr.dtype, &input->attr.dtype, sizeof( attr.dtype ) ); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + return tensor; +} /* _create_internal_tensor() */ + +static vsi_nn_internal_tensor_t *_create_sigmoid_internal_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t * tensor = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + memcpy( &attr.dtype, &input->attr.dtype, sizeof( attr.dtype ) ); + if (attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC || + attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC) + { + attr.dtype.scale = 0.00390625; + attr.dtype.zero_point = 0; + } + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + return tensor; +} /* _create_sigmoid_internal_tensor() */ + +static vsi_nn_internal_tensor_t *_create_output_internal_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * output + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t * tensor = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + memcpy( &attr.dtype, &output->attr.dtype, sizeof( attr.dtype ) ); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + return tensor; +} /* _create_output_internal_tensor() */ + +static vsi_nn_internal_tensor_t *_create_strided_slice_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + int32_t begin_mask, + int32_t end_mask, + int32_t index + ) +{ + vsi_nn_custom_tiny_yolov4_postprocess_param * p = NULL; + vsi_nn_internal_tensor_t * tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + p = (vsi_nn_custom_tiny_yolov4_postprocess_param *)&(self->nn_param.custom_tiny_yolov4_postprocess); + + tensor = _create_internal_tensor(self, input); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->node->nn_param.strided_slice.begin_dims = p->local->begin_dims[index]; + curr->node->nn_param.strided_slice.begin_dims_num = input->attr.dim_num; + curr->node->nn_param.strided_slice.end_dims = p->local->end_dims[index]; + curr->node->nn_param.strided_slice.end_dims_num = input->attr.dim_num; + curr->node->nn_param.strided_slice.stride_dims = p->local->stride_dims; + curr->node->nn_param.strided_slice.stride_dims_num = input->attr.dim_num; + curr->node->nn_param.strided_slice.begin_mask = begin_mask; + curr->node->nn_param.strided_slice.end_mask = end_mask; + curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->node->nn_param.strided_slice.new_axis_mask = 0; + curr->inputs[0] = input; + curr->outputs[0] = tensor->t; + vsi_nn_internal_setup_node( self, curr ); + +final: + return tensor; +} /* _create_strided_slice() */ + +static vsi_nn_internal_tensor_t *_create_sigmoid_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input + ) +{ + vsi_nn_internal_tensor_t * tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + + tensor = 
_create_sigmoid_internal_tensor(self, input); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SIGMOID, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = input; + curr->outputs[0] = tensor->t; + vsi_nn_internal_setup_node( self, curr ); + +final: + return tensor; +} /* _create_sigmoid_op() */ + +static vsi_nn_internal_tensor_t *_create_confidence_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output + ) +{ + vsi_nn_internal_tensor_t * tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + + tensor = _create_output_internal_tensor(self, output); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = input; + curr->outputs[0] = tensor->t; + vsi_nn_internal_setup_node( self, curr ); + +final: + return tensor; +} /* _create_confidence_op() */ + +static vsi_nn_internal_tensor_t *_create_box_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * output, + float bias0, + float bias1 + ) +{ + vsi_nn_internal_tensor_t * tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + + tensor = _create_output_internal_tensor(self, output); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = input0; + curr->inputs[1] = input1; + curr->outputs[0] = tensor->t; + curr->node->nn_param.custom_tiny_yolov4_postprocess_box.bias_0 = bias0; + curr->node->nn_param.custom_tiny_yolov4_postprocess_box.bias_1 = bias1; + vsi_nn_internal_setup_node( self, curr ); + +final: + return tensor; +} /* _create_box_op() */ + +static vsi_nn_internal_tensor_t *_create_reshape_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + vsi_size_t width + ) +{ + vsi_nn_internal_tensor_t * tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + vsi_size_t shape_1[] = { 1, (vsi_size_t)-1, 1 }; + + shape_1[0] = width; + + tensor = _create_output_internal_tensor(self, output); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = input; + curr->outputs[0] = tensor->t; + curr->node->nn_param.reshape2.size = shape_1; + curr->node->nn_param.reshape2.dim_num = 3; + vsi_nn_internal_setup_node( self, curr ); + +final: + return tensor; +} /* _create_reshape_op() */ + +static vsi_bool _create_concat_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * input2, + vsi_nn_tensor_t * input3, + vsi_nn_tensor_t * input4, + vsi_nn_tensor_t * input5, + vsi_nn_tensor_t * output + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 6, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = input0; + curr->inputs[1] = input1; + curr->inputs[2] = input2; + curr->inputs[3] = input3; + curr->inputs[4] = input4; + curr->inputs[5] = input5; + curr->outputs[0] = output; + 
curr->node->nn_param.concat.axis = 1; + ret = vsi_nn_internal_setup_node( self, curr ); + +final: + return ret; +} /* _create_concat_op() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(CUSTOM_TINY_YOLOV4_POSTPROCESS, 4, 2) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + END_IO_TYPE_DECL(CUSTOM_TINY_YOLOV4_POSTPROCESS) + if (!VALIDATE_OP_IO_TYPES(CUSTOM_TINY_YOLOV4_POSTPROCESS, self, inputs, + self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return vsi_nn_internal_optimize_node( self, direction ); +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + vsi_nn_internal_tensor_t * tensor0[12] = {NULL}; + vsi_nn_internal_tensor_t * tensor1[12] = {NULL}; + int32_t index_0 = 1; + int32_t index_1 = 0; + int32_t index_2 = 3; + int32_t index_3 = 2; + + vsi_nn_internal_init_node_wksp( self ); + + /**confidence**/ + /**input 0 chunk 0**/ + /* + sub0:26x26x255 --> 26x26x81, begin: [0, 0, 4, 0] end: [0, 0, 85, 0] stride: [1, 1, 1, 1] + sub1[26, 26, 80] = sigmoid(sub0)[26, 26, 0:0] * sigmoid(sub0)[26, 26, 1:81] + sub2[80, 26, 26] = transpose(sub1) + sub3[80, 676] = reshape(sub2) + */ + tensor0[0] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 0); + CHECK_PTR_FAIL_GOTO( tensor0[0], "Create internal tensor fail.", final ); + tensor0[1] = _create_sigmoid_op(self, tensor0[0]->t); + CHECK_PTR_FAIL_GOTO( tensor0[1], "Create internal tensor fail.", final ); + tensor0[2] = _create_confidence_op(self, tensor0[1]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor0[2], "Create internal tensor fail.", final ); + tensor0[3] = _create_reshape_op(self, tensor0[2]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor0[3], "Create internal tensor fail.", final ); + /**chunk 1**/ + /* + 26x26x255 --> 26x26x81, begin: [0, 0, 89, 0] end: [0, 0, 170, 0] stride: [1, 1, 1, 1] + */ + tensor0[4] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 1); + CHECK_PTR_FAIL_GOTO( tensor0[4], "Create internal tensor fail.", final ); + tensor0[5] = _create_sigmoid_op(self, tensor0[4]->t); + CHECK_PTR_FAIL_GOTO( tensor0[5], "Create internal tensor fail.", final ); + tensor0[6] = _create_confidence_op(self, tensor0[5]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor0[6], "Create internal tensor fail.", final ); + tensor0[7] = _create_reshape_op(self, tensor0[6]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor0[7], "Create internal tensor fail.", final ); + /**chunk 2**/ + /* + 26x26x255 --> 26x26x81, begin: [0, 0, 174, 0] end: [0, 0, 255, 0] stride: [1, 1, 1, 1] + */ + tensor0[8] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 2); + CHECK_PTR_FAIL_GOTO( tensor0[8], "Create internal 
tensor fail.", final ); + tensor0[9] = _create_sigmoid_op(self, tensor0[8]->t); + CHECK_PTR_FAIL_GOTO( tensor0[9], "Create internal tensor fail.", final ); + tensor0[10] = _create_confidence_op(self, tensor0[9]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor0[10], "Create internal tensor fail.", final ); + tensor0[11] = _create_reshape_op(self, tensor0[10]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor0[11], "Create internal tensor fail.", final ); + + /**input 1 chunk 0**/ + /* + sub0:13x13x255 --> 13x13x81, begin: [0, 0, 4, 0] end: [0, 0, 85, 0] stride: [1, 1, 1, 1] + sub1[13, 13, 80] = sigmoid(sub0)[13, 13, 0:0] * sigmoid(sub0)[13, 13, 1:81] + sub2[80, 13, 13] = transpose(sub1) + sub3[80, 169] = reshape(sub2) + */ + tensor1[0] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 0); + CHECK_PTR_FAIL_GOTO( tensor1[0], "Create internal tensor fail.", final ); + tensor1[1] = _create_sigmoid_op(self, tensor1[0]->t); + CHECK_PTR_FAIL_GOTO( tensor1[1], "Create internal tensor fail.", final ); + tensor1[2] = _create_confidence_op(self, tensor1[1]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor1[2], "Create internal tensor fail.", final ); + tensor1[3] = _create_reshape_op(self, tensor1[2]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor1[3], "Create internal tensor fail.", final ); + /**chunk 1**/ + /* + 13x13x255 --> 13x13x81, begin: [0, 0, 89, 0] end: [0, 0, 170, 0] stride: [1, 1, 1, 1] + */ + tensor1[4] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 1); + CHECK_PTR_FAIL_GOTO( tensor1[4], "Create internal tensor fail.", final ); + tensor1[5] = _create_sigmoid_op(self, tensor1[4]->t); + CHECK_PTR_FAIL_GOTO( tensor1[5], "Create internal tensor fail.", final ); + tensor1[6] = _create_confidence_op(self, tensor1[5]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor1[6], "Create internal tensor fail.", final ); + tensor1[7] = _create_reshape_op(self, tensor1[6]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor1[7], "Create internal tensor fail.", final ); + /**chunk 2**/ + /* + 13x13x255 --> 13x13x81, begin: [0, 0, 174, 0] end: [0, 0, 255, 0] stride: [1, 1, 1, 1] + */ + tensor1[8] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 2); + CHECK_PTR_FAIL_GOTO( tensor1[8], "Create internal tensor fail.", final ); + tensor1[9] = _create_sigmoid_op(self, tensor1[8]->t); + CHECK_PTR_FAIL_GOTO( tensor1[9], "Create internal tensor fail.", final ); + tensor1[10] = _create_confidence_op(self, tensor1[9]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor1[10], "Create internal tensor fail.", final ); + tensor1[11] = _create_reshape_op(self, tensor1[10]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor1[11], "Create internal tensor fail.", final ); + + ret = _create_concat_op(self, tensor0[3]->t, tensor0[7]->t, tensor0[11]->t, + tensor1[3]->t, tensor1[7]->t, tensor1[11]->t, outputs[0]); + if (ret == FALSE) + { + VSILOGE("Create concat operation fail"); + goto final; + } + + ret = FALSE; + /**box**/ + /* + 26x26x255 --> 26x26x4, begin: [0, 0, 0, 0] end: [0, 0, 4, 0] stride: [1, 1, 1, 1] + */ + tensor0[0] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 3); + CHECK_PTR_FAIL_GOTO( tensor0[0], "Create internal tensor fail.", final ); + tensor0[1] = _create_box_op(self, tensor0[0]->t, inputs[index_2], outputs[1], 23, 27); + CHECK_PTR_FAIL_GOTO( tensor0[1], "Create internal tensor fail.", final ); + tensor0[2] = _create_reshape_op(self, tensor0[1]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor0[2], "Create internal tensor fail.", final ); + /* + 26x26x255 --> 26x26x4, begin: [0, 0,
85, 0] end: [0, 0, 89, 0] stride: [1, 1, 1, 1] + */ + tensor0[3] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 4); + CHECK_PTR_FAIL_GOTO( tensor0[3], "Create internal tensor fail.", final ); + tensor0[4] = _create_box_op(self, tensor0[3]->t, inputs[index_2], outputs[1], 37, 58); + CHECK_PTR_FAIL_GOTO( tensor0[4], "Create internal tensor fail.", final ); + tensor0[5] = _create_reshape_op(self, tensor0[4]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor0[5], "Create internal tensor fail.", final ); + /* + 26x26x255 --> 26x26x4, begin: [0, 0, 170, 0] end: [0, 0, 174, 0] stride: [1, 1, 1, 1] + */ + tensor0[6] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 5); + CHECK_PTR_FAIL_GOTO( tensor0[6], "Create internal tensor fail.", final ); + tensor0[7] = _create_box_op(self, tensor0[6]->t, inputs[index_2], outputs[1], 81, 82); + CHECK_PTR_FAIL_GOTO( tensor0[7], "Create internal tensor fail.", final ); + tensor0[8] = _create_reshape_op(self, tensor0[7]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor0[8], "Create internal tensor fail.", final ); + + /* + 13x13x255 --> 13x13x4, begin: [0, 0, 0, 0] end: [0, 0, 4, 0] stride: [1, 1, 1, 1] + */ + tensor1[0] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 3); + CHECK_PTR_FAIL_GOTO( tensor1[0], "Create internal tensor fail.", final ); + tensor1[1] = _create_box_op(self, tensor1[0]->t, inputs[index_3], outputs[1], 81, 82); + CHECK_PTR_FAIL_GOTO( tensor1[1], "Create internal tensor fail.", final ); + tensor1[2] = _create_reshape_op(self, tensor1[1]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor1[2], "Create internal tensor fail.", final ); + /* + 13x13x255 --> 13x13x4, begin: [0, 0, 85, 0] end: [0, 0, 89, 0] stride: [1, 1, 1, 1] + */ + tensor1[3] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 4); + CHECK_PTR_FAIL_GOTO( tensor1[3], "Create internal tensor fail.", final ); + tensor1[4] = _create_box_op(self, tensor1[3]->t, inputs[index_3], outputs[1], 135, 169); + CHECK_PTR_FAIL_GOTO( tensor1[4], "Create internal tensor fail.", final ); + tensor1[5] = _create_reshape_op(self, tensor1[4]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor1[5], "Create internal tensor fail.", final ); + /* + 13x13x255 --> 13x13x4, begin: [0, 0, 170, 0] end: [0, 0, 174, 0] stride: [1, 1, 1, 1] + */ + tensor1[6] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 5); + CHECK_PTR_FAIL_GOTO( tensor1[6], "Create internal tensor fail.", final ); + tensor1[7] = _create_box_op(self, tensor1[6]->t, inputs[index_3], outputs[1], 344, 319); + CHECK_PTR_FAIL_GOTO( tensor1[7], "Create internal tensor fail.", final ); + tensor1[8] = _create_reshape_op(self, tensor1[7]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor1[8], "Create internal tensor fail.", final ); + + ret = _create_concat_op(self, tensor0[2]->t, tensor0[5]->t, tensor0[8]->t, + tensor1[2]->t, tensor1[5]->t, tensor1[8]->t, outputs[1]); + if (ret == FALSE) + { + VSILOGE("Create concat operation fail"); + goto final; + } + +final: + return ret; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + int32_t i = 0; + vsi_nn_custom_tiny_yolov4_postprocess_param *p = &self->nn_param.custom_tiny_yolov4_postprocess; + p->local = \ + (custom_tiny_yolov4_postprocess_local_data_t*)malloc(sizeof(custom_tiny_yolov4_postprocess_local_data_t)); + CHECK_PTR_FAIL_GOTO(p->local, "create buffer fail", final); + memset(p->local, 0, sizeof(custom_tiny_yolov4_postprocess_local_data_t)); + for ( i = 0; i < VSI_NN_MAX_DIM_NUM; i++ ) + { + p->local->stride_dims[i] = 1; + } +
p->local->begin_dims[0][2] = 4; + p->local->end_dims[0][2] = 85; + + p->local->begin_dims[1][2] = 89; + p->local->end_dims[1][2] = 170; + + p->local->begin_dims[2][2] = 174; + p->local->end_dims[2][2] = 255; + + p->local->begin_dims[3][2] = 0; + p->local->end_dims[3][2] = 4; + + p->local->begin_dims[4][2] = 85; + p->local->end_dims[4][2] = 89; + + p->local->begin_dims[5][2] = 170; + p->local->end_dims[5][2] = 174; +final: + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + vsi_nn_safe_free(self->nn_param.custom_tiny_yolov4_postprocess.local); + vsi_nn_internal_deinit_node_wksp( self ); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_TINY_YOLOV4_POSTPROCESS, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_box.c similarity index 59% rename from src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c rename to src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_box.c index c664a3c16..a05ca3f42 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c +++ b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_box.c @@ -35,9 +35,9 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -typedef struct _bilinear_grid_sample_local_data_t { +typedef struct _custom_tiny_yolov4_postprocess_box_local_data_t { int32_t placeholder; -} bilinear_grid_sample_local_data_t; +} custom_tiny_yolov4_postprocess_box_local_data_t; /* Declare number of input and output. @@ -53,27 +53,25 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - - vsi_nn_kernel_param_t* param = NULL; - int32_t align_corners = self->nn_param.bilinear_grid_sample.align_corners; - vsi_nn_kernel_node_t n; + vsi_nn_kernel_param_t * param = NULL; + float bias_0 = self->nn_param.custom_tiny_yolov4_postprocess_box.bias_0; + float bias_1 = self->nn_param.custom_tiny_yolov4_postprocess_box.bias_1; param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners); - n = vsi_nn_kernel_selector( - self->graph, "bilinear_grid_sample", inputs, 2, outputs, 1, param); - if (n == NULL) { - vsi_nn_kernel_param_release(¶m); - status = VSI_FAILURE; - return status; - } - self->n = (vx_node)n; - vsi_nn_kernel_param_release(¶m); - if (self->n) { + vsi_nn_kernel_param_add_float32( param, "bias_0", bias_0 ); + vsi_nn_kernel_param_add_float32( param, "bias_1", bias_1 ); + + self->n = vsi_nn_kernel_selector( self->graph, "tiny_yolov4_postprocess_box", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + + if ( self->n ) + { status = VSI_SUCCESS; } + vsi_nn_kernel_param_release( ¶m ); + return status; } /* op_compute() */ @@ -85,6 +83,9 @@ static vsi_bool op_check ) { /*TODO: Check tensor shapes. 
*/ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -95,61 +96,36 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - if (NULL == self) { - return FALSE; - } - - if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - outputs[0]->attr.size[0] = inputs[1]->attr.size[1]; - outputs[0]->attr.size[1] = inputs[1]->attr.size[2]; - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - if (4 == inputs[0]->attr.dim_num) { - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + uint32_t rank = inputs[0]->attr.dim_num; + vsi_bool ret = TRUE; + + VSI_UNREFERENCED(self); + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = rank; + outputs[0]->attr.size[0] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[1]; + if (rank > 3) + { + memcpy( &outputs[0]->attr.size[3], &inputs[0]->attr.size[3], (rank - 3) * sizeof(vsi_size_t) ); } } - return TRUE; + return ret; } /* op_setup() */ -static vsi_status op_init - ( - vsi_nn_node_t* self - ) -{ - /* TODO - //self->nn_param.bilinear_grid_sample.local = \ - // (bilinear_grid_sample_local_data_t*)malloc(sizeof(bilinear_grid_sample_local_data_t)); - */ - - return VSI_SUCCESS; -} /* op_init() */ - -static vsi_status op_deinit - ( - vsi_nn_node_t* self - ) -{ - vsi_status status = VSI_SUCCESS; - - status = vsi_nn_op_common_deinit(self); - - /* TODO - //vsi_nn_safe_free(self->nn_param.bilinear_grid_sample.local); - */ - - return status; -} /* op_deinit() */ __BEGIN_DECLS /* Registrar */ DEF_OP_REG ( - /* op_name */ BILINEAR_GRID_SAMPLE, - /* init */ op_init, + /* op_name */ CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX, + /* init */ NULL, /* compute */ op_compute, - /* deinit */ op_deinit, + /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, /* setup */ op_setup, /* optimize */ NULL, diff --git a/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_confidence.c b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_confidence.c new file mode 100644 index 000000000..a9cf8b4a6 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_confidence.c @@ -0,0 +1,127 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _tiny_yolov4_postprocess_confidence_local_data_t { + int32_t placeholder; +} tiny_yolov4_postprocess_confidence_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + self->n = vsi_nn_kernel_selector( self->graph, "tiny_yolov4_postprocess_confidence", + inputs, 1, outputs, 1, NULL ); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t rank = inputs[0]->attr.dim_num; + vsi_bool ret = TRUE; + + VSI_UNREFERENCED(self); + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = rank; + outputs[0]->attr.size[0] = inputs[0]->attr.size[2] - 1; + outputs[0]->attr.size[1] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[1]; + if (rank > 3) + { + memcpy( &outputs[0]->attr.size[3], &inputs[0]->attr.size[3], (rank - 3) * sizeof(vsi_size_t) ); + } + } + + return ret; +} /* op_setup() */ + + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c index a1e50a481..8fc6d6ce0 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c @@ -54,20 +54,26 @@ DEF_KERNEL_EXECUTOR(_softmax_compute) size_t param_size ) { - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; float *buffer[_CPU_IO_NUM] = {NULL}; vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *attr[_CPU_IO_NUM] = {NULL}; uint32_t i = 0, out_elements = 0; int32_t axis; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // input0 tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // input1 tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // output attr[0] = vsi_nn_kernel_tensor_attr_create(tensors[0]); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create(tensors[1]); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); attr[2] = vsi_nn_kernel_tensor_attr_create(tensors[2]); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); status = 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); CHECK_STATUS_FAIL_GOTO(status, final ); @@ -133,6 +139,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); return VSI_SUCCESS; } @@ -153,6 +161,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); status = _query_kernel(inputs, outputs, kernel); if(status != VSI_SUCCESS) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c index ed1e14932..3fb62eb74 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c @@ -54,7 +54,7 @@ DEF_KERNEL_EXECUTOR(_softmax_exec) size_t param_size ) { - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; float* buffer[_CPU_IO_NUM] = { NULL }; vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL }; @@ -64,11 +64,16 @@ DEF_KERNEL_EXECUTOR(_softmax_exec) float fMax = 0.0; float fProbSum = 0.0f; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &sf_axis); CHECK_STATUS_FAIL_GOTO(status, final ); @@ -141,6 +146,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); return VSI_SUCCESS; } @@ -161,6 +168,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); status = _query_kernel( inputs, outputs, kernel ); diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c index f2cb0315c..b9e77c299 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c @@ -62,6 +62,7 @@ static vx_param_description_t _custom_warp_affine_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def ) @@ -97,7 +98,7 @@ static vsi_bool _read_pixel if (out_of_bounds) { - *pixel = 205.0f; + *pixel = 0.0f; return TRUE; } @@ -125,6 +126,7 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL }; int32_t type = 0; + int32_t rgb_type = 0; float 
matrix[6] = {0}; vsi_size_t i = 0; vsi_size_t b = 0; @@ -135,11 +137,16 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_size_t height = 0; vsi_size_t outer_size = 1; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); @@ -153,6 +160,7 @@ DEF_KERNEL_EXECUTOR(_compute) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE], &type); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rgb_type); CHECK_STATUS_FAIL_GOTO(status, final ); for (i = 0; i < 6; i++) { @@ -172,34 +180,95 @@ DEF_KERNEL_EXECUTOR(_compute) { float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1]; float *dst_base = buffer[1] + b * width * height; - for (y = 0; y < height; y++) + + if ( rgb_type == VSI_NN_WARP_AFFINE_TYPE_RGB ) { - for (x = 0; x < width; x++) + width = width / 3; + for (y = 0; y < height; y++) { - float xf = 0; - float yf = 0; - float dst = 0; - - _transform_affine(x, y, matrix, &xf, &yf); - if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR) + for (x = 0; x < width; x++) { - _read_pixel(src_base, attr[0], xf, yf, &dst); - dst_base[y * width + x] = dst; + float xf = 0; + float yf = 0; + float dst = 0; + + _transform_affine(x, y, matrix, &xf, &yf); + + if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR) + { + _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf), &dst); + dst_base[y * 3 * width + 3 * x] = dst; + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf), &dst); + dst_base[y * 3 * width + 3 * x + 1] = dst; + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf), &dst); + dst_base[y * 3 * width + 3 * x + 2] = dst; + } + else + { + float tl = 0, tr = 0, bl = 0, br = 0; + float ar = xf - floorf(xf); + float ab = yf - floorf(yf); + float al = 1.0f - ar; + float at = 1.0f - ab; + + _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf), &tl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1), floorf(yf), &tr); + _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1), floorf(yf) + 1, &br); + + dst_base[y * 3 * width + 3 * x] = + tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf), &tl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 1, floorf(yf), &tr); + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 1, floorf(yf) + 1, &br); + + dst_base[y * 3 * width + 3 * x + 1] = + tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf), &tl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 2, floorf(yf), &tr); + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 2, floorf(yf) + 1, &br); + + dst_base[y * 3 * width + 3 * x + 2] = + tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + } } - else + } + } + else + { + for (y = 0; y < height; y++) + { + for (x = 0; x < width; x++) { - float tl = 0, tr = 0, 
bl = 0, br = 0; - float ar = xf - floorf(xf); - float ab = yf - floorf(yf); - float al = 1.0f - ar; - float at = 1.0f - ab; - - _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl); - _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr); - _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl); - _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br); - - dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + float xf = 0; + float yf = 0; + float dst = 0; + + _transform_affine(x, y, matrix, &xf, &yf); + if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR) + { + _read_pixel(src_base, attr[0], xf, yf, &dst); + dst_base[y * width + x] = dst; + } + else + { + float tl = 0, tr = 0, bl = 0, br = 0; + float ar = xf - floorf(xf); + float ab = yf - floorf(yf); + float al = 1.0f - ar; + float at = 1.0f - ab; + + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr); + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br); + + dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + } } } } @@ -233,6 +302,8 @@ static vsi_status _query_kernel ) { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); kernel->info.function = _compute; kernel->info.parameters = _custom_warp_affine_kernel_param_def; @@ -260,6 +331,7 @@ static vsi_nn_kernel_node_t _setup size_t i = 0; size_t buffer_size = 0; int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + int32_t rgb_type = vsi_nn_kernel_param_get_int32( params, "rgb_type"); float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); @@ -278,6 +350,8 @@ static vsi_nn_kernel_node_t _setup node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( graph, F32, &buffer[i] ); } + node_params[9] = vsi_nn_kernel_scalar_create( + graph, I32, &rgb_type ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM ); @@ -286,6 +360,7 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); } + vsi_nn_kernel_scalar_release( &node_params[9] ); } } return node; diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c index 397f02291..98ae55858 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c @@ -95,7 +95,7 @@ static vsi_bool _read_pixel ) { vsi_size_t width = attr->shape->data[0]; - vsi_size_t height = attr->shape->data[1]; + vsi_size_t height = attr->shape->size > 1 ? 
attr->shape->data[1] : 1; vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height); vsi_size_t bx = 0, by = 0; @@ -139,11 +139,16 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_size_t height = 0; vsi_size_t outer_size = 1; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); @@ -237,6 +242,8 @@ static vsi_status _query_kernel ) { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); kernel->info.function = _compute; kernel->info.parameters = _custom_warp_perspective_kernel_param_def; diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c index 0ec7145e4..6dc60cea4 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c @@ -73,6 +73,8 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer) {0, 0, 0}, // local_size: local group size in thread {0, 0, 0}}; // global_size: image size in thread + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); if (!attr) { @@ -144,6 +146,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, @@ -170,6 +174,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); status = _query_kernel( inputs, outputs, kernel ); diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_box_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_box_evis.c new file mode 100644 index 000000000..c56c80937 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_box_evis.c @@ -0,0 +1,357 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_TINY_YOLOV4_POSTPROCESS_BOX, +} _internal_kernel_e; + +#define _SOURCE "tiny_yolov4_postprocess_box" +#define _KERNEL_NAME CVIVANTE_NAMESPACE("evis.tiny_yolov4_postprocess_box_U8_U8toU8") + +// Add kernel hashtable here +#define TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + (( IN0_DTYPE ) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE << 16 )) +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), \ + _KERNEL_NAME, _SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _tiny_yolov4_postprocess_box_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8, U8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _tiny_yolov4_postprocess_box_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kernel parameters here +}; +#define _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_param_def ) +#define SCALAR_BIAS_0_VALUE (3) +#define SCALAR_BIAS_1_VALUE (4) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_tiny_yolov4_postprocess_box_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + float CONST2 = 16.0f; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + // Add initializer + gpu_param.dim = 2; + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (attr[0]->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 8); + gpu_param.global_size[1] = 1; + + if (attr[0]->shape->data[0] == 13 * 13) + { + CONST2 = 32.0f; + } + + if (attr[0]->dtype == U8 && attr[1]->dtype == U8 && attr[2]->dtype == U8) + { + float input0_scale = attr[0]->scale; + float input0_tail = 0 -
(float)attr[0]->zero_point * input0_scale; + float input1_scale = attr[1]->scale; + float input1_tail = 0 - (float)attr[1]->zero_point * input1_scale; + float output_scale = 1.0f / attr[2]->scale; + float output_zp = (float)attr[2]->zero_point; + gpu_dp_inst_t uniExtract8Data_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDatatoFloat32_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDatatoFloat32_1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataTranspose_0_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0c080400, 0x0d090501, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataTranspose_1_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0e0a0602, 0x0f0b0703, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniDatatoFloat32_0_4x4", &uniDatatoFloat32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniDatatoFloat32_1_4x4", &uniDatatoFloat32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniDataTranspose_0_2x8", &uniDataTranspose_0_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniDataTranspose_1_2x8", &uniDataTranspose_1_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_scale", &input0_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_tail", &input0_tail); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_scale", &input1_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_tail", &input1_tail); + status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param( node, "CONST2", &CONST2); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + } + + return status; +} /* _tiny_yolov4_postprocess_box_initializer() */ + + +/* + * Query kernel + */ +static 
vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _tiny_yolov4_postprocess_box_kernel_map; + size_t kernel_map_size = _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_map ); + vx_param_description_t * param_def = _tiny_yolov4_postprocess_box_kernel_param_def; + vx_kernel_initialize_f initializer = _tiny_yolov4_postprocess_box_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + float bias_0 = vsi_nn_kernel_param_get_float32( params, "bias_0" ); + float bias_1 = vsi_nn_kernel_param_get_float32( params, "bias_1" ); + + VSI_UNREFERENCED(params); + + memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + shape[0][0] = shape[0][0] * shape[0][1]; + shape[0][1] = shape[0][2]; + shape[0][2] = 1; + + memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + shape[1][0] = shape[1][0] * shape[1][1]; + shape[1][1] = shape[1][2]; + shape[1][2] = 1; + + memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + shape[2][0] = shape[2][0]; + shape[2][1] = shape[2][2] * shape[2][1]; + shape[2][2] = 1; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shape[0], inputs[0]->attr.dim_num ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shape[1], inputs[1]->attr.dim_num ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shape[2], outputs[0]->attr.dim_num ); + + if ( !vsi_nn_kernel_gpu_check_shape( + reshape_tensors[0]->attr.size, reshape_tensors[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel
); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[2], output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_BIAS_0_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &bias_0 ); + node_params[SCALAR_BIAS_1_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &bias_1 ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_BIAS_0_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_BIAS_1_VALUE] ); + } + } + + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[2] ); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( tiny_yolov4_postprocess_box, _setup ) + diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c new file mode 100644 index 000000000..b36ec6b14 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c @@ -0,0 +1,320 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta.
+ */ +typedef enum +{ + INTERNAL_KERNEL_TINY_YOLOV4_POSTPROCESS_CONFIDENCE, +} _internal_kernel_e; + +#define _SOURCE "tiny_yolov4_postprocess_confidence" +#define _KERNEL_NAME CVIVANTE_NAMESPACE("evis.tiny_yolov4_postprocess_conf_U8toU8") + +// Add kernel hashtable here +#define _CONFIDENCE_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { _CONFIDENCE_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + _KERNEL_NAME, _SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _tiny_yolov4_postprocess_confidence_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _tiny_yolov4_postprocess_confidence_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM \ + _cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_tiny_yolov4_postprocess_confidence_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + gpu_param.dim = 2; + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 4; + gpu_param.global_size[0] = gpu_align_p2( + (attr[0]->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (attr[1]->shape->data[0] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + + if (attr[0]->dtype == U8 && attr[1]->dtype == U8) + { + float output_scale = attr[0]->scale * attr[0]->scale / attr[1]->scale; + int output_zp = attr[1]->zero_point; + uint16_t M0 = 0; + int32_t postShift = 0; + int32_t i = 0; + + gpu_dp_inst_t uniU8TimesU8_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU16TimesMultiplier_PostShift_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8PlusU8_trans_0_2x8 = {{ + 0xffffffff, // TCfg + 0x44444444, // ASelt + 0x0c080400, 0x0d090501, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 
0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8PlusU8_trans_1_2x8 = {{ + 0xffffffff, // TCfg + 0x44444444, // ASelt + 0x0e0a0602, 0x0f0b0703, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit((double)output_scale, &M0, &postShift); + + uniU16TimesMultiplier_PostShift_2x8.data[7] |= (postShift & 0x1F); + for ( i = 8; i < 16; i++ ) + { + uniU16TimesMultiplier_PostShift_2x8.data[i] = M0; + } + + status = vsi_nn_kernel_gpu_add_param( node, "uniU8TimesU8_0_4x4", &uniU8TimesU8_0_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU16TimesMultiplier_PostShift_2x8", + &uniU16TimesMultiplier_PostShift_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8PlusU8_trans_0_2x8", &uniU8PlusU8_trans_0_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8PlusU8_trans_1_2x8", &uniU8PlusU8_trans_1_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + + return status; +} /* _tiny_yolov4_postprocess_confidence_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _tiny_yolov4_postprocess_confidence_kernel_map; + size_t kernel_map_size = _cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_map ); + vx_param_description_t * param_def = _tiny_yolov4_postprocess_confidence_kernel_param_def; + vx_kernel_initialize_f initializer = _tiny_yolov4_postprocess_confidence_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = _CONFIDENCE_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t 
shape[2][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + + VSI_UNREFERENCED(params); + + memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + shape[0][0] = shape[0][0] * shape[0][1]; + shape[0][1] = shape[0][2]; + shape[0][2] = 1; + + memcpy(shape[1], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + shape[1][0] = shape[1][0]; + shape[1][1] = shape[1][2] * shape[1][1]; + shape[1][2] = 1; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shape[0], inputs[0]->attr.dim_num ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shape[1], outputs[0]->attr.dim_num ); + + if ( !vsi_nn_kernel_gpu_check_shape( + reshape_tensors[0]->attr.size, reshape_tensors[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[1], output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, + _TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM ); + } + } + + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( tiny_yolov4_postprocess_confidence, _setup ) + diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c index 169825158..3272fd634 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c @@ -50,18 +50,27 @@ typedef enum _custom_warp_affine_type_e }custom_warp_affine_type_e; #define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine" +#define _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE "custom_warp_affine_rgb" // Add kernel hashtable here -#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \ - (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20)) +#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D, RGB_TYPE ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20) | (RGB_TYPE << 24)) #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ - { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0 ), \ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \ _CUSTOM_WARP_AFFINE_KERNEL_SOURCE } #define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ - { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0 ), \ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \ _CUSTOM_WARP_AFFINE_KERNEL_SOURCE } +#define PACK_RGB_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb"), \ + _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE } +#define PACK_RGB_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1 ), \ + 
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_2D"), \ + _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE } typedef struct { @@ -78,6 +87,12 @@ static const _kernel_map_type _custom_warp_affine_kernel_map[] = PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), PACK_2D_KERNEL_MAP( U8, U8, bilinear ), + + PACK_RGB_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_RGB_KERNEL_MAP( U8, U8, bilinear ), + + PACK_RGB_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_RGB_2D_KERNEL_MAP( U8, U8, bilinear ), }; /* @@ -124,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer) float matrix4[4] = {0}; int32_t i = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -178,7 +195,81 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer) return status; } /* _custom_warp_affine_initializer() */ +DEF_KERNEL_INITIALIZER(_custom_warp_affine_rgb_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_size_array_t * out_shape = NULL; + float m[6] = {0}; + float matrix0[4] = {0}; + float matrix1[4] = {0}; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + for (i = 0; i < 6; i++) + { + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + &m[i]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[2]; matrix0[3] = m[3]; + matrix1[0] = m[4]; matrix1[1] = m[5]; + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 2; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = ( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / (3 * gpu_param.global_scale[0])); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_add_param( node, + "matrix0", &matrix0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix1", &matrix1 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _custom_warp_affine_rgb_initializer() */ /* * Query kernel @@ -188,7 +279,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t type + int32_t type, + int32_t rgb_type ) { vsi_status status = VSI_FAILURE; @@ -205,8 +297,11 @@ static vsi_status _query_kernel in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img ); - + key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img, rgb_type ); + if (rgb_type == 1) + { + initializer = _custom_warp_affine_rgb_initializer; + } for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { if ( kernel_map[i].key == key ) @@ -251,6 +346,7 @@ static vsi_nn_kernel_node_t _setup size_t i = 0; size_t buffer_size = 0; int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + int32_t rgb_type = vsi_nn_kernel_param_get_int32( params, "rgb_type"); float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) @@ -258,7 +354,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _query_kernel( kernel, inputs, outputs, type ); + status = _query_kernel( kernel, inputs, outputs, type, rgb_type ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -282,7 +378,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); } // Set default border mode. 
- border.constant_value.U32 = 0xcdcdcdcd; + border.constant_value.U32 = 0x00000000; status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); } diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c index 69367599b..ab6d8437e 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c @@ -127,6 +127,8 @@ DEF_KERNEL_INITIALIZER(_custom_warp_perspective_initializer) float matrix4[4] = {0}; int32_t i = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c b/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c index 2e7415e62..606b7c80f 100644 --- a/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c +++ b/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c @@ -48,6 +48,9 @@ static vsi_status op_compute { vsi_status status = VSI_SUCCESS; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + #if defined(VX_DENOISE_POSTPROCESS_SUPPORT) && VX_DENOISE_POSTPROCESS_SUPPORT self->n = vxDenoisePostProcesslayer( self->graph->g, @@ -83,6 +86,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -93,6 +99,9 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_setup() */ @@ -101,6 +110,7 @@ static vsi_status op_init vsi_nn_node_t* self ) { + VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_sample.c b/src/tim/vx/internal/src/custom/ops/op_custom_sample.c index 145953922..ef28a2e64 100644 --- a/src/tim/vx/internal/src/custom/ops/op_custom_sample.c +++ b/src/tim/vx/internal/src/custom/ops/op_custom_sample.c @@ -63,6 +63,9 @@ static vsi_bool op_check ) { /*TODO: Check params. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -73,6 +76,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c index 3a37247a9..6da5e6136 100644 --- a/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c +++ b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c @@ -62,6 +62,9 @@ static vsi_bool op_check ) { /*TODO: Check params. 
*/ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -72,6 +75,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c index e076b7c7c..5ee37c58e 100644 --- a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c @@ -59,6 +59,7 @@ static vsi_status op_compute param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 ); vsi_nn_kernel_param_add_int32( param, "type", p->type); + vsi_nn_kernel_param_add_int32( param, "rgb_type", p->rgb_type); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "custom_warp_affine", @@ -78,6 +79,9 @@ static vsi_bool op_check ) { /*TODO: Check tensor shapes. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c index 7afbd8352..91f788c94 100644 --- a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c @@ -78,6 +78,9 @@ static vsi_bool op_check ) { /*TODO: Check tensor shapes. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c index 6a84a5e0b..b9a840ff3 100644 --- a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c @@ -100,7 +100,7 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) size_t param_size ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. 
gpu_param_t gpu_param = { 2, @@ -113,6 +113,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) vsi_nn_kernel_tensor_attr_t *input0_attr = NULL; vsi_size_array_t *input_shape = NULL; + VSI_UNREFERENCED(param_size); + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); input_shape = input0_attr->shape; diff --git a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c index 5741690d3..bc7d36efc 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c @@ -143,6 +143,8 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -183,7 +185,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int32_t i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -240,6 +242,9 @@ static vsi_nn_kernel_node_t _setup int32_t axis = 0; vsi_size_t axis_size = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c index b710fa11e..6fb6cd872 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c @@ -143,6 +143,8 @@ DEF_KERNEL_INITIALIZER(_argmin_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -183,7 +185,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int32_t i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -240,6 +242,9 @@ static vsi_nn_kernel_node_t _setup int32_t axis = 0; size_t axis_size = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c b/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c index c0ed53eee..24b266439 100644 --- a/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c @@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_avg_pool3d_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + vxReadScalarValue(depth_out, &depth_out_value); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, 
"vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c index c62f0b4c0..689603021 100644 --- a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c @@ -135,6 +135,8 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -170,7 +172,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -238,6 +240,9 @@ static vsi_nn_kernel_node_t _setup float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float eps = vsi_nn_kernel_param_get_float32(params, "eps"); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const) || ( inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 && inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 ) diff --git a/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c index bda96ffcb..84811fd82 100644 --- a/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c @@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) vsi_nn_kernel_tensor_attr_t* output_attr = NULL; vsi_size_array_t* out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final); @@ -140,9 +142,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) gpu_param.dim = 2; gpu_param.global_size[0] = - gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) / - gpu_param.global_scale[0], - 4); + (out_shape->data[0] + gpu_param.global_scale[0] - 1) / + gpu_param.global_scale[0]; gpu_param.global_size[1] = ((out_shape->data[1] + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]); diff --git a/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c b/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c index e20cb1be4..d3c4968a8 100644 --- a/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c @@ -134,6 +134,8 @@ DEF_KERNEL_INITIALIZER(_bucketize_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/cast_cl.c b/src/tim/vx/internal/src/kernel/cl/cast_cl.c index 33291a799..e379000ea 100644 --- a/src/tim/vx/internal/src/kernel/cl/cast_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cast_cl.c @@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_cast_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -251,6 +253,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(params); + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c index 4b518b2be..ec74f361b 100644 --- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c @@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c index 8fec39b3c..4b1369f96 100644 --- a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c @@ -229,6 +229,8 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -285,7 +287,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -347,6 +349,9 @@ static vsi_nn_kernel_node_t _setup float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + ret = vsi_nn_kernel_optimize_eltwise_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, inputs[1]->attr.size, inputs[1]->attr.dim_num, @@ -363,11 +368,11 @@ static vsi_nn_kernel_node_t _setup outputs[0], shapes[2], new_rank ); #define _swap_tensor(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } if (shapes[1][3] > shapes[0][3] && new_rank == 4) { diff --git a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c index 0aac099e6..8dca93180 100644 --- a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c @@ -135,6 +135,8 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) int32_t c = 1; uint32_t dim = 1; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -203,7 +205,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -288,11 +290,28 @@ static vsi_nn_kernel_node_t _setup int32_t width = 0; int32_t height = 0; int32_t 
channel = 1; - int32_t i = 0; + uint32_t i = 0; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); - vsi_nn_kernel_optimize_softmax_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, - shapes[0], &rs_dim, &axis_new); + if (axis < 0) + { + axis_new = 0; + shapes[0][0] = 1; + shapes[0][1] = 1; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + shapes[0][0] *= inputs[0]->attr.size[i]; + } + rs_dim = 2; + } + else + { + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rs_dim, &axis_new); + } if (rs_dim > 3) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c index e1bb5f9c4..94e79fe56 100644 --- a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c @@ -103,6 +103,8 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) int32_t output_height = 0; int32_t output_chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -145,7 +147,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -195,6 +197,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c index f34393ecf..596aab56e 100644 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c @@ -126,6 +126,9 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) vsi_nn_kernel_tensor_attr_t * input_attr = NULL; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + VSI_UNREFERENCED(node); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); in_shape = input_attr->shape; diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c index d54182d11..c278d0603 100644 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c @@ -181,6 +181,14 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index 5d29c6796..c44010a9c 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -211,6 +211,9 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { 
NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -253,7 +256,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -327,6 +330,9 @@ static vsi_nn_kernel_node_t _setup float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (unary_type == UNARY_SELU) { alpha = alpha * beta; diff --git a/src/tim/vx/internal/src/kernel/cl/erf_cl.c b/src/tim/vx/internal/src/kernel/cl/erf_cl.c index d6ef8d85b..e7aa1d3d2 100644 --- a/src/tim/vx/internal/src/kernel/cl/erf_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/erf_cl.c @@ -135,6 +135,9 @@ DEF_KERNEL_INITIALIZER(_erf_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -243,6 +246,10 @@ static vsi_nn_kernel_node_t _setup float outputScale = vsi_nn_get_tensor_scale(outputs[0]); float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c index af31ed15d..7341f3282 100644 --- a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c @@ -122,11 +122,14 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) {0, 0, 0}, {0, 0, 0} }; - vx_status status = VX_FAILURE; - vx_tensor output = (vx_tensor)param[2]; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[2]; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + VSI_UNREFERENCED(node); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -258,6 +261,8 @@ static vsi_nn_kernel_node_t _setup float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + VSI_UNREFERENCED(params); + outputScale = 1.0f / outputScale; input0Tail = -(input0Tail * input0Scale); input1Tail = -(input1Tail * input1Scale); diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index bafe86c15..a3fa2d61d 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -205,6 +205,9 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) size_t input_dims1 = 0; size_t i = 0; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + 
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -264,7 +267,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -334,6 +337,9 @@ static vsi_nn_kernel_node_t _setup int32_t is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0; int32_t i = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array); status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array); status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array); diff --git a/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c index a8d56a2bc..82838648c 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c @@ -51,18 +51,30 @@ typedef enum #define STR(a) #a // Add kernel hashtable here -#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D ) \ - (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 )) +#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D, BEYOND_MAXWIDTH ) \ + (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ) | \ + (BEYOND_MAXWIDTH << 28)) #define PACK_KERNEL_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ - { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 0), \ CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ _GATHER_ELEMENTS_KERNEL_SOURCE} #define PACK_KERNEL_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ - { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 0), \ CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ _GATHER_ELEMENTS_KERNEL_SOURCE} +#define PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 1), \ + CVIVANTE_NAMESPACE("cl.gather_elements_beyond_maxwidth_axis"STR(AXIS)"_"STR(IN0_DTYPE)\ + "_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + _GATHER_ELEMENTS_KERNEL_SOURCE} + +#define PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 1), \ + CVIVANTE_NAMESPACE("cl.gather_elements_beyond_maxwidth_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)\ + "to"STR(OUT_DTYPE)"_2D"), _GATHER_ELEMENTS_KERNEL_SOURCE} + typedef struct { uint32_t key; @@ -89,6 +101,44 @@ static const _kernel_map_type _gather_elements_kernel_map[] = PACK_KERNEL_2D_MAP( 1, F32, I32, F32 ), PACK_KERNEL_2D_MAP( 1, I32, I32, I32 ), PACK_KERNEL_2D_MAP( 1, U32, I32, U32 ), + + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, F32, I32, F32), + 
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, F32, I32, F32), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, F32, I32, F32), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, U8, I32, U8 ), + + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, F32, I32, F32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, F32, I32, F32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, F32, I32, F32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, U8, I32, U8 ), }; @@ -126,12 +176,38 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer) {0, 0, 0}, {0, 0, 0} }; + vsi_nn_kernel_tensor_attr_t * input_attr0 = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr1 = NULL; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + uint32_t width0 = 0; + uint32_t height0 = 0; + uint32_t width1 = 0; + uint32_t height1 = 0; + uint32_t width_out = 0; + uint32_t height_out = 0; + uint32_t depth0 = 0; + uint32_t depth1 = 0; + + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + input_attr0 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr0, "Create tensor attr buffer fail.", final ); + input_attr1 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( input_attr1, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + width0 = (uint32_t)input_attr0->shape->data[0]; + height0 = (uint32_t)input_attr0->shape->data[1]; + depth0 = input_attr0->shape->size > 2 ? (uint32_t)input_attr0->shape->data[2] : 1; + width1 = (uint32_t)input_attr1->shape->data[0]; + height1 = (uint32_t)input_attr1->shape->data[1]; + depth1 = input_attr1->shape->size > 2 ? 
(uint32_t)input_attr1->shape->data[2] : 1; + width_out = (uint32_t)output_attr->shape->data[0]; + height_out = (uint32_t)output_attr->shape->data[1]; + out_shape = output_attr->shape; gpu_param.global_scale[0] = 1; @@ -146,7 +222,25 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer) (out_shape->data[1] + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]); gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + if (width0 >= GPU_TENSOR_MAX_WIDTH || + width1 >= GPU_TENSOR_MAX_WIDTH || + height0 >= GPU_TENSOR_MAX_WIDTH || + height1 >= GPU_TENSOR_MAX_WIDTH || + depth0 >= GPU_TENSOR_MAX_WIDTH || + depth1 >= GPU_TENSOR_MAX_WIDTH) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_size[0] = out_shape->data[0]; + } + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + status |= vsi_nn_kernel_gpu_add_param( node, "width0", &width0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "height0", &height0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "width1", &width1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "height1", &height1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "width_out", &width_out ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_out", &height_out ); final: #define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } @@ -178,32 +272,52 @@ static vsi_status _query_kernel int32_t img_2d = (outputs[0]->attr.dim_num < 3 || outputs[0]->attr.size[2] == 1) ? 1 : 0; uint32_t key = 0; uint32_t i; + int32_t beyond_maxwidth = 0; + vsi_size_t depth0 = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + vsi_size_t depth1 = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH || + inputs[0]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH || + inputs[1]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH || + inputs[1]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH || + depth0 >= GPU_TENSOR_MAX_WIDTH || + depth1 >= GPU_TENSOR_MAX_WIDTH) + { + beyond_maxwidth = 1; + } + #define _PACK_SELECT_KEY( in0_type, out_type ) \ ( ( in0_type ) | ( out_type << 8 )) - switch (_PACK_SELECT_KEY(in0_dtype, out_dtype)) + if (beyond_maxwidth == 0) + { + switch (_PACK_SELECT_KEY(in0_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + key = GATHER_ELEMENTS_HASH_KEY( axis, F32, in1_dtype, F32, img_2d, 0 ); + break; + case _PACK_SELECT_KEY(U32, U32): + case _PACK_SELECT_KEY(U16, U16): + case _PACK_SELECT_KEY(U8, U8): + key = GATHER_ELEMENTS_HASH_KEY( axis, U32, in1_dtype, U32, img_2d, 0 ); + break; + case _PACK_SELECT_KEY(I32, I32): + case _PACK_SELECT_KEY(I16, I16): + case _PACK_SELECT_KEY(I8, I8): + key = GATHER_ELEMENTS_HASH_KEY( axis, I32, in1_dtype, I32, img_2d, 0 ); + break; + default: + break; + } + } + else { - case _PACK_SELECT_KEY(F32, F32): - case _PACK_SELECT_KEY(F16, F16): - key = GATHER_ELEMENTS_HASH_KEY( axis, F32, in1_dtype, F32, img_2d ); - break; - case _PACK_SELECT_KEY(U32, U32): - case _PACK_SELECT_KEY(U16, U16): - case _PACK_SELECT_KEY(U8, U8): - key = GATHER_ELEMENTS_HASH_KEY( axis, U32, in1_dtype, U32, img_2d ); - break; - case _PACK_SELECT_KEY(I32, I32): - case _PACK_SELECT_KEY(I16, I16): - case _PACK_SELECT_KEY(I8, I8): - key = GATHER_ELEMENTS_HASH_KEY( axis, I32, in1_dtype, I32, img_2d ); - break; - 
default: - break; + key = GATHER_ELEMENTS_HASH_KEY( axis, in0_dtype, in1_dtype, out_dtype, img_2d, 1 ); } #undef _PACK_SELECT_KEY @@ -221,7 +335,8 @@ static vsi_status _query_kernel kernel->info.numParams = _cnt_of_array( _gather_elements_kernel_param_def ); kernel->info.initialize = initializer; // Register code source - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", kernel_map[i].source_name ); // Register binary source vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, diff --git a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c index a41e7ace3..bfcb0df06 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c @@ -119,7 +119,7 @@ static vsi_status cal_gather_nd_tensor_reshape_size uint32_t block_size, uint32_t coordDim, int32_t* newDim, - int32_t batch_dims + uint32_t batch_dims ) { vsi_status status = VSI_FAILURE; @@ -146,17 +146,23 @@ static vsi_status cal_gather_nd_tensor_reshape_size if (batch_dims) { + int32_t rank = 1; for (i = 0; i < offset; i++) { sizes[0] *= input_size[i]; } - for (i = 0; i < coordDim; i++) + for (i = 0; i < coordDim - 1; i++) { - sizes[i + 1] = input_size[i + offset]; + sizes[rank++] = input_size[i + offset]; } - newDim[0] = coordDim == 1 ? 2 : 3; + for (i = 0; i < batch_dims; i++) + { + sizes[rank] *= input_size[dims_num - i - 1]; + } + + newDim[0] = rank + 1; } else { @@ -186,13 +192,27 @@ static vsi_status cal_gather_nd_tensor_reshape_size } else // indices&output reshape { - if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH && batch_dims == 0) { sizes[0] = block_size; sizes[1] = elementCnt / block_size; status = VSI_SUCCESS; newDim[0] = 2; } + else if (batch_dims > 0) + { + vsi_size_t batch_cnt = 1; + for (i = 0; i < batch_dims; ++i) + { + batch_cnt *= input_size[dims_num - i - 1]; + } + + sizes[0] = block_size; + sizes[1] = (elementCnt / block_size) / batch_cnt; + sizes[2] = batch_cnt; + status = VSI_SUCCESS; + newDim[0] = 3; + } } #undef VSI_NN_MAX_IMAGE_WIDTH @@ -220,7 +240,11 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; int32_t block_size = 0; - vsi_ssize_t indices_num = 1; + vsi_size_t indices_num = 1; + vsi_size_t batch_num = 1; + + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -229,6 +253,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); indices_num = attr[0]->shape->data[1]; + batch_num = (attr[0]->shape->size > 2 ? 
attr[0]->shape->data[2] : 1); gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; @@ -237,7 +262,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = indices_num; - gpu_param.global_size[2] = 1; + gpu_param.global_size[2] = batch_num; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, final); @@ -265,7 +290,8 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_nn_kernel_coord_type_e coord_type = _error; uint32_t key = 0; - int i = 0; + int32_t batch_flg = batch_dims > 0 ? 1 : 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -301,7 +327,7 @@ static vsi_status _query_kernel coord_type = _3D; } - key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type, batch_dims ); + key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type, batch_flg ); for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) { @@ -348,6 +374,9 @@ static vsi_nn_kernel_node_t _setup int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + status = cal_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims); status |= cal_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims); status |= cal_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims); diff --git a/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c b/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c index 1e51bd7b7..07eb2651f 100644 --- a/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c @@ -108,6 +108,9 @@ DEF_KERNEL_INITIALIZER(_globallppool_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); output_shape = output_attr->shape; diff --git a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c index 95a4bff5a..5e727fadb 100644 --- a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c @@ -220,6 +220,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) vsi_ssize_t width = 0; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -275,6 +278,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -325,6 +331,9 @@ 
DEF_KERNEL_INITIALIZER(_groupnorm_initializer) vsi_ssize_t chn = 0; int32_t is2D = 0; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); @@ -489,6 +498,9 @@ static vsi_nn_kernel_node_t _setup float rSpaceOrg = 1.0f / (width * height); float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c index 410fe5638..b6e0bf733 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c @@ -91,6 +91,9 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) ) { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param); + VSI_UNREFERENCED(param_size); // vsi_nn_kernel_tensor_attr * attr[2] = { NULL }; // attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); // attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -172,6 +175,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(params); + /* // Check if gpu can support the size if( !vsi_nn_kernel_gpu_check_shape( diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c index 1a849fe60..828a88a22 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c @@ -91,6 +91,10 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer) ) { vsi_status status = VSI_FAILURE; + + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param); + VSI_UNREFERENCED(param_size); // vsi_nn_kernel_tensor_attr * attr[2] = { NULL }; // attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); // attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -172,6 +176,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_SMA_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(params); + /* // Check if gpu can support the size if( !vsi_nn_kernel_gpu_check_shape( diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c index e2b6964a8..193f388d3 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c @@ -118,6 +118,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) vsi_nn_kernel_tensor_t input = NULL; vsi_nn_kernel_tensor_attr_t* input_attr = NULL; + VSI_UNREFERENCED(param_size); + input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_HSTATE]; input_attr = vsi_nn_kernel_tensor_attr_create( input ); diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c index 3912b95cb..0896c6a1c 100644 --- 
a/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c @@ -110,6 +110,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[3]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c index a18b1121e..a99f8b908 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c @@ -120,6 +120,8 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer) vsi_nn_kernel_tensor_t input = NULL; vsi_nn_kernel_tensor_attr_t* input_attr = NULL; + VSI_UNREFERENCED(param_size); + input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_H_STATE]; input_attr = vsi_nn_kernel_tensor_attr_create( input ); diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c index 892377b53..942585037 100644 --- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -188,6 +188,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) vsi_ssize_t height = 0; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -255,6 +257,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) vsi_ssize_t height = 0; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); @@ -405,6 +409,9 @@ static vsi_nn_kernel_node_t _setup float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float inv_multiplier = (float)1.0 / (float)(width * height); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c b/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c index 2626bfeaa..44186d138 100644 --- a/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c @@ -164,6 +164,8 @@ DEF_KERNEL_INITIALIZER(_l1norm_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c index 7b2f50aa5..83e598bb0 100644 --- a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c @@ -115,6 +115,8 @@ 
DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c index 20f3ab01c..a13ec2e19 100644 --- a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c @@ -123,6 +123,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) vsi_ssize_t height = 0; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); @@ -175,7 +177,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(reshape2D); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -243,6 +247,9 @@ static vsi_nn_kernel_node_t _setup float zp2ScaleE2 = 0.0f; float sumZpScaleE2 = 0.0f; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + scale_inOut = input_scale * output_scale; e2InScale = input_scale * input_scale; sumZpScale = width * input_zp * input_scale; diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c index 311de9729..3fc716cad 100644 --- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c @@ -148,6 +148,8 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) vsi_size_array_t * out_shape = NULL; int32_t axis = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -194,7 +196,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -244,6 +246,9 @@ static vsi_nn_kernel_node_t _setup float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float scaleValue = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); beta = vsi_nn_kernel_param_get_float32(params, "beta"); diff --git a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c index bcf4d7a7f..27b97ebb6 100644 --- a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c @@ -106,11 +106,13 @@ DEF_KERNEL_INITIALIZER(_logical_not_initializer) {0, 0, 0}, {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor output = (vx_tensor)param[1]; 
vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -218,6 +220,8 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret = FALSE; + VSI_UNREFERENCED(params); + ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); diff --git a/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c index 7121aa93b..4d0c23ab7 100644 --- a/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c @@ -111,11 +111,13 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer) {0, 0, 0}, {0, 0, 0} }; - vx_status status = VX_FAILURE; - vx_tensor output = (vx_tensor)param[2]; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[2]; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -250,11 +252,11 @@ static vsi_nn_kernel_node_t _setup outputs[0], shapes[2], new_rank ); #define _swap_tensor(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } if (shapes[1][3] > shapes[0][3] && new_rank == 4) { diff --git a/src/tim/vx/internal/src/kernel/cl/lppool_cl.c b/src/tim/vx/internal/src/kernel/cl/lppool_cl.c index 514bec0c7..a46c728d7 100644 --- a/src/tim/vx/internal/src/kernel/cl/lppool_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/lppool_cl.c @@ -121,6 +121,8 @@ DEF_KERNEL_INITIALIZER(_lppool_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c index a7bdb2c89..dec27e3f9 100644 --- a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c @@ -68,7 +68,8 @@ typedef enum _LSTMUNIT_nn_activation_e #define LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \ _input_type, _output_type, _cell_type, _rec_act) \ ((_is_ln << 31) | (_is_cifg << 30) | (_is_proj << 29) | (_is_hybrid << 28) | (_is_peephole << 27) \ -| (_input_type << 23) | (_output_type << 19) | (_cell_type << 15) | (_rec_act << 10)) +| (((uint32_t)_input_type) << 23) | (((uint32_t)_output_type) << 19) | (((uint32_t)_cell_type) << 15) \ +| (_rec_act << 10)) #define LSTMUNIT_ACTIVATION_SOURCE_NAME(_ln_cifg_proj_hybrid_, _input_type) \ "lstmunit_activation_"#_ln_cifg_proj_hybrid_"_"#_input_type @@ -941,6 +942,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_CL_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[CL_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); @@ -983,6 +986,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_CB_initializer) vsi_nn_kernel_tensor_t output = NULL; 
vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[CB_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -1027,6 +1032,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_CS_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[CS_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); @@ -1073,6 +1080,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_L_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[L_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); @@ -1118,6 +1127,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_B_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[B_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); @@ -1164,6 +1175,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_S_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[S_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index 5ff2a9308..de336c9ba 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -43,6 +43,7 @@ __BEGIN_DECLS */ #define KERNEL_SOURCE_1 "matrixmul" #define KERNEL_SOURCE_2 "matrixmul_transA" +#define KERNEL_SOURCE_3 "matrixmul_cross" typedef enum { @@ -50,8 +51,8 @@ __BEGIN_DECLS _3D } vsi_nn_kernel_image_dim_type_e; -#define HASH_MATRIXMUL_KEY(_input0_type, _input1_type, _output_type, _image_dim, _trans_a) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_dim << 4) | (_trans_a)) +#define HASH_MATRIXMUL_KEY(_type0, _type1, _type2, _image_dim, _trans_a, _cross) \ + ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_image_dim << 4) | (_trans_a << 2) | (_cross)) #define HASH_MATRIXMUL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) @@ -62,21 +63,29 @@ __BEGIN_DECLS #define HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ CVIVANTE_NAMESPACE("cl.gemm_transb_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) +#define HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_merge") + #define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ - { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0), \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0), \ HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, #define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ - { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1), \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 0), \ HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, #define 
TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ - { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2), \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2, 0), \ HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, +#define TENSOR_MATRIXMUL_MERGE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 2), \ + HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -109,6 +118,9 @@ static const struct { TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_2) TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1) TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_MERGE_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_3) + TENSOR_MATRIXMUL_MERGE_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_3) + TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3) }; /* @@ -132,7 +144,27 @@ static vx_param_description_t _matrixmul_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; +static vx_param_description_t _matrixmul_merge_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + #define _MATRIXMUL_PARAM_NUM _cnt_of_array(_matrixmul_kernel_param_def) +#define _MATRIXMUL_MERGE_PARAM_NUM _cnt_of_array(_matrixmul_merge_kernel_param_def) /* * Kernel initializer @@ -153,17 +185,40 @@ DEF_KERNEL_INITIALIZER(_matrixmul_initializer) {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_ssize_t width = 0; - vsi_ssize_t height = 0; - vsi_ssize_t chn = 0; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_size_t width = 0; + vsi_size_t height = 0; + vsi_size_t chn = 0; - attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - width = attr[0]->shape->data[0]; - height = attr[0]->shape->data[1]; - chn = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + width = attr[2]->shape->data[0]; + height = attr[2]->shape->data[1]; + chn = attr[2]->shape->size > 2 ? 
attr[2]->shape->data[2] : 1; + + if (((attr[0]->shape->size == 4 && attr[1]->shape->size == 3) || + (attr[0]->shape->size == 3 && attr[1]->shape->size == 4)) + && attr[0]->shape->data[2] > 1 && attr[1]->shape->data[2] > 1 + && chn == attr[0]->shape->data[2] * attr[1]->shape->data[2]) + { + if (attr[0]->shape->size == 4) + { + chn = attr[1]->shape->data[2]; + } + else + { + chn = attr[0]->shape->data[2]; + } + } gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; @@ -184,6 +239,16 @@ DEF_KERNEL_INITIALIZER(_matrixmul_initializer) vsi_nn_kernel_tensor_attr_release( &attr[0] ); attr[0] = NULL; } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } return status; } /* _matrixmul_initializer() */ @@ -193,7 +258,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, vsi_size_t depth, - int32_t transa + int32_t transa, + int32_t cross ) { vsi_status status = VSI_FAILURE; @@ -202,7 +268,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_nn_kernel_image_dim_type_e dim_type = _2D; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -251,7 +317,7 @@ static vsi_status _query_kernel output_dtype = U8; } - key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa ); + key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa, cross ); for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ ) { @@ -264,8 +330,16 @@ static vsi_status _query_kernel if ( i < _cnt_of_array(matrixmul_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrixmul_map[i].function_name ); - kernel->info.parameters = _matrixmul_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _matrixmul_kernel_param_def ); + if (cross == 0) + { + kernel->info.parameters = _matrixmul_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _matrixmul_kernel_param_def ); + } + else if (cross == 2) + { + kernel->info.parameters = _matrixmul_merge_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def ); + } kernel->info.initialize = _matrixmul_initializer; // Register code source vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, @@ -290,14 +364,17 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_MATRIXMUL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_param_t node_params[_MATRIXMUL_MERGE_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); + int32_t cross_flg = vsi_nn_kernel_param_get_int32( params, "cross_flg" ); int32_t transFlg = 0; vsi_size_t M = inputs[0]->attr.size[1]; vsi_size_t K = inputs[0]->attr.size[0]; vsi_size_t N = inputs[1]->attr.size[0]; + vsi_size_t a_depth = 0; + vsi_size_t b_depth = 0; vsi_size_t depth = outputs[0]->attr.dim_num > 2 ? 
outputs[0]->attr.size[2] : 1; uint32_t ac2zero = 0; uint32_t bc2zero = 0; @@ -307,6 +384,10 @@ static vsi_nn_kernel_node_t _setup float zp_b = (float)vsi_nn_get_tensor_zero_point(inputs[1]); float scale_out = vsi_nn_get_tensor_scale(outputs[0]); float zp_out = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + int32_t outer = 0; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); scale_out = 1 / scale_out; @@ -329,28 +410,43 @@ static vsi_nn_kernel_node_t _setup transFlg = 1; } - if ((inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) || - (inputs[0]->attr.size[2] > inputs[1]->attr.size[2] - && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2)) + a_depth = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + b_depth = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; + + if (b_depth == 1) { bc2zero = 1; } - else if ((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) || - (inputs[1]->attr.size[2] > inputs[0]->attr.size[2] - && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2)) + if (a_depth == 1) + { + ac2zero = 1; + } + + if (inputs[0]->attr.dim_num == 4 && inputs[1]->attr.dim_num == 3 + && a_depth > 1 && b_depth > 1 && cross_flg == 2) { ac2zero = 1; + bc2zero = 0; + outer = (int32_t)a_depth; + } + else if (inputs[1]->attr.dim_num == 4 && inputs[0]->attr.dim_num == 3 + && a_depth > 1 && b_depth > 1 && cross_flg == 2) + { + ac2zero = 0; + bc2zero = 1; + outer = (int32_t)b_depth; } - status = _query_kernel( kernel, inputs, outputs, depth, transFlg ); + status = _query_kernel( kernel, inputs, outputs, depth, transFlg, cross_flg ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { uint32_t index = 3; + size_t param_num = cross_flg == 2 ? _MATRIXMUL_MERGE_PARAM_NUM : _MATRIXMUL_PARAM_NUM; /* Pass parameters to node. */ - vsi_nn_kernel_node_pack_io( node_params, _MATRIXMUL_PARAM_NUM, + vsi_nn_kernel_node_pack_io( node_params, param_num, inputs, 2, outputs, 1 ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &M ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &K ); @@ -363,8 +459,12 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_b ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_out ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_out ); + if (cross_flg == 2) + { + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &outer ); + } /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _MATRIXMUL_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( node, node_params, param_num ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &node_params[3] ); vsi_nn_kernel_scalar_release( &node_params[4] ); @@ -377,6 +477,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[11] ); vsi_nn_kernel_scalar_release( &node_params[12] ); vsi_nn_kernel_scalar_release( &node_params[13] ); + if (cross_flg == 2) + { + vsi_nn_kernel_scalar_release( &node_params[14] ); + } } } return node; diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c index c81289ed6..3446fef8b 100644 --- a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c @@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -190,7 +192,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -252,6 +254,10 @@ static vsi_nn_kernel_node_t _setup float outputScale = vsi_nn_get_tensor_scale(outputs[0]); float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c index 2311810e9..b8ecf2ae9 100644 --- a/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c @@ -115,11 +115,13 @@ DEF_KERNEL_INITIALIZER(_maxpoolwithargmax_initializer) {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor output = (vx_tensor)param[1]; vsi_nn_kernel_tensor_attr_t * attr_out = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -159,7 +161,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output0_dtype = U8; vsi_nn_kernel_dtype_e output1_dtype = I32; uint32_t key = 0; - int32_t i = 0; + size_t i = 0; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output0_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); diff --git a/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c b/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c index 408164bfb..f4086a8e1 100644 --- a/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c @@ -120,6 +120,8 @@ DEF_KERNEL_INITIALIZER(_maxunpool_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c index 92a19a3e5..5d85656cb 100644 --- a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c @@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -190,7 +192,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -252,6 +254,11 @@ static vsi_nn_kernel_node_t _setup float outputScale = vsi_nn_get_tensor_scale(outputs[0]); float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + + outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/cl/mod_cl.c b/src/tim/vx/internal/src/kernel/cl/mod_cl.c index 1398823d9..b6c50164a 100644 --- a/src/tim/vx/internal/src/kernel/cl/mod_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/mod_cl.c @@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_mod_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c index e5bae713e..4afda3666 100644 --- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -224,6 +224,8 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) int32_t axis = 0; int32_t axis_num = 1; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -306,7 +308,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -381,6 +385,9 @@ static vsi_nn_kernel_node_t _setup float input_scale = vsi_nn_get_tensor_scale(inputs[0]); float dim_ratio = (float)1.0 / (float)(width * height); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis_num = (int32_t)axis_num_temp; if (axis_num == 1 && axis[0] == 0) diff --git a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c new file mode 100644 index 000000000..cc6d53800 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c @@ -0,0 +1,401 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_NEAREST_GRID_SAMPLE, +} _internal_kernel_e; + +#define _NEAREST_GRID_SAMPLE_KERNEL_SOURCE() "nearest_grid_sample" + +#define STR(a) #a + +// Add kernel hashtable here +#define NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE)) + +#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { \ + NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ + CVIVANTE_NAMESPACE("cl.nearest_grid_sample_" STR( \ + IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _NEAREST_GRID_SAMPLE_KERNEL_SOURCE() \ + } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _nearest_grid_sample_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP(F32, F32, F32), + PACK_KERNEL_MAP(U8, U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _nearest_grid_sample_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _NEAREST_GRID_SAMPLE_PARAM_NUM 8 +#define _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM \ + _cnt_of_array(_nearest_grid_sample_kernel_param_def) + +#define SCALAR_HALF_INPUT0_W (3) +#define SCALAR_HALF_INPUT0_H (4) +#define SCALAR_ADD_VALUE_W (5) +#define SCALAR_ADD_VALUE_H (6) +#define SCALAR_DEPTH (7) +#define SCALAR_INPUT0_SCALE (8) +#define SCALAR_INPUT0_TAIL (9) +#define SCALAR_INPUT1_SCALE (10) +#define SCALAR_INPUT1_TAIL (11) +#define SCALAR_OUTPUT_SCALE (12) +#define SCALAR_OUTPUT_TAIL (13) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_nearest_grid_sample_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}; + vsi_nn_kernel_tensor_attr_t* output_attr = NULL; + vsi_size_array_t* out_shape = NULL; + + VSI_UNREFERENCED(param_size); + + output_attr = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); + CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 2; + gpu_param.global_size[0] = +
(out_shape->data[0] + gpu_param.global_scale[0] - 1) / + gpu_param.global_scale[0]; + gpu_param.global_size[1] = + ((out_shape->data[1] + gpu_param.global_scale[1] - 1) / + gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + status = vsi_nn_kernel_gpu_config(node, &gpu_param); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) \ + if (_PTR) { \ + vsi_nn_kernel_tensor_attr_release(&_PTR); \ + _PTR = NULL; \ + } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _nearest_grid_sample_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool* is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype, in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _nearest_grid_sample_kernel_map; + size_t kernel_map_size = _cnt_of_array( _nearest_grid_sample_kernel_map ); + vx_param_description_t * param_def = _nearest_grid_sample_kernel_param_def; + size_t param_def_size = + _cnt_of_array(_nearest_grid_sample_kernel_param_def); + vx_kernel_initialize_f initializer = _nearest_grid_sample_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in0_dtype) { + in0_dtype = F32; + } + if (F16 == in1_dtype) { + in1_dtype = F32; + } + if (F16 == out_dtype) { + out_dtype = F32; + } + if ((U8 == in0_dtype) || (U8 == out_dtype)) { + param_def_size = _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM; + *is_use_u8_kernel = TRUE; + } else { + param_def_size = _NEAREST_GRID_SAMPLE_PARAM_NUM; + *is_use_u8_kernel = FALSE; + } + + key = NEAREST_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + uint32_t final_in1_rank = 0; + vsi_nn_tensor_t* rs_tensors = NULL; + vsi_nn_tensor_t* final_tensors[3] = {NULL}; + vsi_size_t in0_width = inputs[0]->attr.size[0]; + vsi_size_t in0_height = inputs[0]->attr.size[1]; + float input0_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input0_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0_tail = -(input0_zp * input0_scale); + float input1_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]); 
+ float input1_scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1_tail = -(input1_zp * input1_scale); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + vsi_bool is_use_u8_kernel = FALSE; + int32_t align_corners = + vsi_nn_kernel_param_get_int32(params, "align_corners"); + uint32_t pad_val = 0; + int32_t depth = 0; + vsi_nn_kernel_dtype_e in0_dtype; + float half_input0_w, half_input0_h, add_float_value_w, add_float_value_h; + + // Check if gpu can support the size + if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, + inputs[0]->attr.dim_num)) { + return NULL; + } + + if (!vsi_nn_kernel_gpu_check_shape(inputs[1]->attr.size, + inputs[1]->attr.dim_num)) { + return NULL; + } + + final_tensors[0] = inputs[0]; + if (inputs[1]->attr.dim_num >= 3) { + final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0]; + final_shape[1] = inputs[1]->attr.size[2]; + final_shape[2] = 1; + final_shape[3] = + inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1; + final_in1_rank = + inputs[1]->attr.dim_num == 3 ? 2 : inputs[1]->attr.dim_num; + if (!vsi_nn_kernel_gpu_check_shape(final_shape, final_in1_rank)) { + return NULL; + } + + rs_tensors = vsi_nn_reshape_tensor( + graph, inputs[1], final_shape, final_in1_rank); + final_tensors[1] = rs_tensors; + } else { + final_tensors[1] = inputs[1]; + } + final_tensors[2] = outputs[0]; + + if (align_corners) { + half_input0_w = ((float)in0_width - 1.0f) * 0.5f; + half_input0_h = ((float)in0_height - 1.0f) * 0.5f; + add_float_value_w = half_input0_w; + add_float_value_h = half_input0_h; + } else { + half_input0_w = (float)in0_width * 0.5f; + half_input0_h = (float)in0_height * 0.5f; + add_float_value_w = half_input0_w - 0.5f; + add_float_value_h = half_input0_h - 0.5f; + } + + add_float_value_w = add_float_value_w + 0.5f; + add_float_value_h = add_float_value_h + 0.5f; + + depth = (int32_t)inputs[0]->attr.size[2]; + in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + if (U8 == in0_dtype) { + pad_val = inputs[0]->attr.dtype.zero_point; + } + + status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node(graph, kernel); + if (node) { + size_t node_params_num = _NEAREST_GRID_SAMPLE_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io(node_params, + _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM, + final_tensors, + input_num, + &final_tensors[2], + output_num); + node_params[SCALAR_HALF_INPUT0_W] = + vsi_nn_kernel_scalar_create(graph, F32, &half_input0_w); + node_params[SCALAR_HALF_INPUT0_H] = + vsi_nn_kernel_scalar_create(graph, F32, &half_input0_h); + node_params[SCALAR_ADD_VALUE_W] = + vsi_nn_kernel_scalar_create(graph, F32, &add_float_value_w); + node_params[SCALAR_ADD_VALUE_H] = + vsi_nn_kernel_scalar_create(graph, F32, &add_float_value_h); + node_params[SCALAR_DEPTH] = + vsi_nn_kernel_scalar_create(graph, I32, &depth); + if (is_use_u8_kernel) { + node_params[SCALAR_INPUT0_SCALE] = + vsi_nn_kernel_scalar_create(graph, F32, &input0_scale); + node_params[SCALAR_INPUT0_TAIL] = + vsi_nn_kernel_scalar_create(graph, F32, &input0_tail); + node_params[SCALAR_INPUT1_SCALE] = + vsi_nn_kernel_scalar_create(graph, F32, &input1_scale); + node_params[SCALAR_INPUT1_TAIL] = + vsi_nn_kernel_scalar_create(graph, F32, &input1_tail); + node_params[SCALAR_OUTPUT_SCALE] = + vsi_nn_kernel_scalar_create(graph, F32, &output_scale); + node_params[SCALAR_OUTPUT_TAIL] = + 
vsi_nn_kernel_scalar_create(graph, F32, &output_zp); + node_params_num = _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM; + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( + node, node_params, node_params_num); + VSI_ASSERT(status == VSI_SUCCESS); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_HALF_INPUT0_W]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_HALF_INPUT0_H]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_W]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_H]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_DEPTH]); + if (is_use_u8_kernel) { + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_SCALE]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_TAIL]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_SCALE]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_TAIL]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_SCALE]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_TAIL]); + } + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + status = vxSetNodeAttribute( + (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); + CHECK_STATUS(status); + } + } + } + + vsi_safe_release_tensor(rs_tensors); + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( nearest_grid_sample, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c index 4369beaf6..a66b89b3e 100644 --- a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c @@ -121,6 +121,8 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -234,6 +236,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* rs_tensors[2] = { NULL }; vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; int32_t i = 0; + size_t j = 0; vsi_size_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); vsi_size_t prefix_dim_size = 1; vsi_size_t suffix_dim_size = 0; @@ -320,11 +323,11 @@ static vsi_nn_kernel_node_t _setup vsi_nn_ReleaseTensor( &rs_tensors[1] ); } - for (i = SCALAR_INPUT_DEPTH; i < _ONE_HOT_PARAM_NUM; i++) + for (j = SCALAR_INPUT_DEPTH; j < _ONE_HOT_PARAM_NUM; j++) { - if (node_params[i]) + if (node_params[j]) { - vsi_nn_kernel_scalar_release( &node_params[i] ); + vsi_nn_kernel_scalar_release( &node_params[j] ); } } diff --git a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c index 558a1e0d1..18468ae5c 100644 --- a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c @@ -111,12 +111,14 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor output = (vx_tensor)param[1]; vsi_nn_kernel_tensor_attr_t * attr_out = NULL; vsi_size_array_t * out_shape = NULL; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(param_size); + attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", 
final ); diff --git a/src/tim/vx/internal/src/kernel/cl/pow_cl.c b/src/tim/vx/internal/src/kernel/cl/pow_cl.c index 1d1020d7a..6a38b4e85 100644 --- a/src/tim/vx/internal/src/kernel/cl/pow_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/pow_cl.c @@ -126,6 +126,8 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -180,7 +182,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -248,6 +250,10 @@ static vsi_nn_kernel_node_t _setup float inputScale = vsi_nn_get_tensor_scale(inputs[0]); float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + outputScale = 1.0f / outputScale; inputTail = -(inputTail * inputScale); diff --git a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c index 609c90e18..87c8593a3 100644 --- a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c @@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -190,7 +192,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -248,6 +250,9 @@ static vsi_nn_kernel_node_t _setup float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); int32_t is_per_channel_alpha = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); if (is_per_channel_alpha) diff --git a/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c index 696303b21..7e4504008 100644 --- a/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -153,6 +152,8 @@ DEF_KERNEL_INITIALIZER(_multinomial_initializer) vsi_nn_kernel_tensor_attr_t * attr = NULL; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -196,6 +197,8 @@ DEF_KERNEL_INITIALIZER(_cdf_initializer) vsi_size_array_t * in_shape = NULL; vsi_size_t batch = 0; + VSI_UNREFERENCED(param_size); + attr = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -235,6 +238,9 @@ DEF_KERNEL_INITIALIZER(_seed_initializer) {0, 0, 0} }; + VSI_UNREFERENCED(param); + VSI_UNREFERENCED(param_size); + gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_size[0] = 1; @@ -351,6 +357,10 @@ static vsi_nn_kernel_node_t _setup float rand_max = (float)(pow(2.0,32)); float re_rand_max = 1 / rand_max; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + // Check if gpu can support the size if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -370,17 +380,20 @@ static vsi_nn_kernel_node_t _setup attr.is_const = FALSE; attr.vtl = TRUE; tensors[SEED_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(tensors[SEED_INDEX], "Create tensor failed", final); attr.size[0] = inputs[0]->attr.size[0]; attr.size[1] = inputs[0]->attr.size[1]; attr.dim_num = 2; tensors[CDF_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(tensors[CDF_INDEX], "Create tensor failed", final); memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) ); attr.size[1] = 1; attr.dim_num = 2; tensors[SEEDS_INDEX] = vsi_nn_reshape_tensor( graph, inputs[1], attr.size, attr.dim_num ); + CHECK_PTR_FAIL_GOTO(tensors[SEEDS_INDEX], "Create tensor failed", final); in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); diff --git a/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c index 9b92246fd..aa2a45c89 100644 --- a/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c @@ -105,6 +105,8 @@ DEF_KERNEL_INITIALIZER(_reduceall_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c index b347758c1..b5ff4e262 100644 --- a/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c @@ -105,6 +105,8 @@ DEF_KERNEL_INITIALIZER(_reduceany_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c index 05a867406..5ee818064 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c @@ -120,6 +120,8 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff 
--git a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c index 50a502565..ba31ed9fe 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c @@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c index 8d1b7c0dd..b04a246a5 100644 --- a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c @@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c index 8cfd331fa..1ea137bdc 100644 --- a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c @@ -126,6 +126,8 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/repeat_cl.c b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c index c2f28dda7..d40ae1f26 100644 --- a/src/tim/vx/internal/src/kernel/cl/repeat_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c @@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_repeat_initializer) int32_t is1d = 0; int32_t axis = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &axis); @@ -190,7 +192,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; int32_t is1d = inputs[0]->attr.dim_num == 1 ? 1 : 0; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -308,6 +310,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t height = inputs[0]->attr.dim_num > 1 ? inputs[0]->attr.size[1] : 1; vsi_size_t channel = inputs[0]->attr.dim_num > 2 ? 
inputs[0]->attr.size[2] : 1; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c index fda7acdc9..d9b18e718 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c @@ -116,6 +116,8 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c index eef5bec37..8868565f9 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c @@ -117,6 +117,8 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/resize_3d_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_3d_bilinear_cl.c new file mode 100644 index 000000000..77afbc1ca --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/resize_3d_bilinear_cl.c @@ -0,0 +1,329 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _RESIZE_3D_BILINEAR_KERNEL_SOURCE() "resize_3d_bilinear" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_3D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) ) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ { RESIZE_3D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ CVIVANTE_NAMESPACE("cl.resize_3d_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ _RESIZE_3D_BILINEAR_KERNEL_SOURCE() } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_3d_bilinear_kernel_map[] = +{ + PACK_KERNEL_MAP( F32, F32), + PACK_KERNEL_MAP( F32, U8), + PACK_KERNEL_MAP( U8, F32), + PACK_KERNEL_MAP( U8, U8), + PACK_KERNEL_MAP( I8, I8), + PACK_KERNEL_MAP( BF16,BF16), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_3d_bilinear_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +#define RESIZE_3D_BILINEAR_NUM _cnt_of_array( _resize_3d_bilinear_kernel_param_def ) + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_3d_bilinear_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + VSI_UNREFERENCED(param_size); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 3; + gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->data[2]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _resize_3d_bilinear_initializer() */ + + + +/* + * 
Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_3d_bilinear_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_3d_bilinear_kernel_map ); + vx_param_description_t * param_def = _resize_3d_bilinear_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_3d_bilinear_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_3d_bilinear_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if (I16 == in_dtype) + { + in_dtype = I8; + } + if (I16 == out_dtype) + { + out_dtype = I8; + } + + key = RESIZE_3D_BILINEAR_HASH_KEY( in_dtype, out_dtype ); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[RESIZE_3D_BILINEAR_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t in_height = inputs[0]->attr.size[1]; + vsi_size_t in_depth = inputs[0]->attr.size[2]; + vsi_size_t out_width = outputs[0]->attr.size[0]; + vsi_size_t out_height = outputs[0]->attr.size[1]; + vsi_size_t out_depth = outputs[0]->attr.size[2]; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_tail = -(input_zp * input_scale); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float half_pixel_value = 0.0f; + float scale_factor_x = 0.0f; + float scale_factor_y = 0.0f; + float scale_factor_z = 0.0f; + + if (align_corners && out_width > 1) + { + scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners && out_height > 1) + { + scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + } + else + { + scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + } + + if 
(align_corners && out_depth > 1) + { + scale_factor_z = ((vx_float32)(in_depth - 1) * 1.0f) / (vx_float32)(out_depth - 1); + } + else + { + scale_factor_z = ((vx_float32)in_depth * 1.0f) / (vx_float32)out_depth; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = RESIZE_3D_BILINEAR_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, RESIZE_3D_BILINEAR_NUM, + inputs, input_num, outputs, output_num ); + node_params[2] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_y ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_z ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, U32, &in_width ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, U32, &in_height ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, U32, &in_depth ); + node_params[9] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[10] = vsi_nn_kernel_scalar_create( graph, F32, &input_tail ); + node_params[11] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[12] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( resize_3d_bilinear, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/resize_3d_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_3d_nearest_cl.c new file mode 100644 index 000000000..b0e6138c7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/resize_3d_nearest_cl.c @@ -0,0 +1,332 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_RESIZE_3D_NEAREST, +} _internal_kernel_e; + +#define _RESIZE_3D_NEAREST_KERNEL_SOURCE "resize_3d_nearest" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_3D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ { RESIZE_3D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ CVIVANTE_NAMESPACE("cl.resize_3d_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ _RESIZE_3D_NEAREST_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_3d_nearest_kernel_map[] = +{ + PACK_KERNEL_MAP( F32, F32), + PACK_KERNEL_MAP( F32, U8), + PACK_KERNEL_MAP( U8, F32), + PACK_KERNEL_MAP( U8, U8), + PACK_KERNEL_MAP( I8, I8), + PACK_KERNEL_MAP( BF16,BF16), +}; + + + + +/* + * Kernel params + */ +static vx_param_description_t _resize_3d_nearest_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _RESIZE_3D_NEAREST_PARAM_NUM _cnt_of_array( _resize_3d_nearest_kernel_param_def ) + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_3d_nearest_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + VSI_UNREFERENCED(param_size); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->data[2]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { 
vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _resize_3d_nearest_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_3d_nearest_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_3d_nearest_kernel_map ); + vx_param_description_t * param_def = _resize_3d_nearest_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_3d_nearest_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_3d_nearest_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if (I16 == in_dtype) + { + in_dtype = I8; + } + if (I16 == out_dtype) + { + out_dtype = I8; + } + + key = RESIZE_3D_NEAREST_HASH_KEY( in_dtype, out_dtype ); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_3D_NEAREST_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t in_height = inputs[0]->attr.size[1]; + vsi_size_t in_depth = inputs[0]->attr.size[2]; + vsi_size_t out_width = outputs[0]->attr.size[0]; + vsi_size_t out_height = outputs[0]->attr.size[1]; + vsi_size_t out_depth = outputs[0]->attr.size[2]; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_scale = input_scale / vsi_nn_get_tensor_scale(outputs[0]); + float output_tail = (float)vsi_nn_get_tensor_zero_point(outputs[0]) - input_zp * output_scale; + float half_pixel_value = 0.0f; + float round_value = 0.0f; + float scale_factor_x = 0.0f; + float scale_factor_y = 0.0f; + float scale_factor_z = 0.0f; + + if (align_corners && out_width > 1) + { + scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners && out_height > 1) + 
{ + scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + } + else + { + scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + } + + if (align_corners && out_depth > 1) + { + scale_factor_z = ((vx_float32)(in_depth - 1) * 1.0f) / (vx_float32)(out_depth - 1); + } + else + { + scale_factor_z = ((vx_float32)in_depth * 1.0f) / (vx_float32)out_depth; + } + + if (align_corners) + { + round_value = 0.5f; + } + else + { + round_value = 0.0f; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = _RESIZE_3D_NEAREST_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_3D_NEAREST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[2] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_y ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_z ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &round_value ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[8] = vsi_nn_kernel_scalar_create(graph, F32, &output_tail ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( resize_3d_nearest, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c index a9c0285fb..60fbda3eb 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c @@ -115,6 +115,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c index d61abcf30..1ca6ba9f1 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c @@ -121,6 +121,8 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c b/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c index cb9cdcd19..10b3855d2 100644 --- 
a/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c @@ -124,6 +124,8 @@ DEF_KERNEL_INITIALIZER(_reversesequence_initializer) vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_size_array_t *input_shape = NULL; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input ); CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -161,16 +163,16 @@ static vsi_status _query_kernel int32_t batch_axis ) { - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_dtype_e in_dtype; - vsi_nn_kernel_dtype_e out_dtype; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype = 0; + vsi_nn_kernel_dtype_e out_dtype = 0; const _kernel_map_type * kernel_map = _reversesequence_kernel_map; size_t kernel_map_size = _cnt_of_array( _reversesequence_kernel_map ); vx_param_description_t * param_def = _reversesequence_kernel_param_def; vx_kernel_initialize_f initializer = _reversesequence_initializer; vsi_nn_kernel_batch_axis_type_e axis_type = _axis1; - uint32_t key; - uint32_t i; + uint32_t key = 0; + size_t i = 0; in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -223,7 +225,7 @@ static vsi_status _query_kernel break; } - for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + for ( i = 0; i < kernel_map_size; i ++ ) { if ( kernel_map[i].key == key ) { @@ -272,6 +274,13 @@ static vsi_nn_kernel_node_t _setup float inoutScale = inputScale / outputScale; float inoutTail = outputTail - inputTail * inoutScale; + vsi_nn_kernel_tensor_t reshape_tensor = NULL; + vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t new_rank = 2; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, @@ -280,6 +289,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } + shapes[0] = inputs[1]->attr.size[0]; + shapes[1] = 1; + + reshape_tensor = vsi_nn_kernel_tensor_reshape(inputs[1]->t, shapes, new_rank); + status = _query_kernel( kernel, inputs, outputs, batch_axis ); if ( VSI_SUCCESS == status) { @@ -287,9 +301,10 @@ static vsi_nn_kernel_node_t _setup if ( node ) { /* Set inputs and outputs */ - uint32_t index = 3; - vsi_nn_kernel_node_pack_io( node_params, _REVERSESEQUENCE_PARAM_NUM, - inputs, input_num, outputs, output_num ); + uint32_t index = 0; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + node_params[index++] = reshape_tensor; + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inoutScale ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inoutTail ); /* Pass parameters to node. 
*/ @@ -298,6 +313,11 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[4] ); } } + + if (reshape_tensor) + { + vsi_nn_kernel_tensor_release( &reshape_tensor ); + } return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c index e897d0f78..9cf2818a6 100644 --- a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c @@ -134,6 +134,8 @@ DEF_KERNEL_INITIALIZER(_roi_align_initializer) vsi_size_array_t * rois_shape = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + rois_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( rois_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c index 2be6a78da..fec2f3b69 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c @@ -155,6 +155,8 @@ DEF_KERNEL_INITIALIZER(_scatter_elements_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c index d409c4c45..e56d37dde 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c @@ -183,6 +183,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) vsi_ssize_t block_size = 0; vsi_ssize_t height = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -222,7 +224,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_nn_kernel_coord_type_e coord_type = _1D; uint32_t key = 0; - int i = 0; + size_t i = 0; input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -290,6 +292,9 @@ static vsi_nn_kernel_node_t _setup int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; vsi_size_t width = 0, area = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (coord_dim > 3) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c index d5f2867bd..94c4fa330 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c @@ -188,6 +188,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) vsi_ssize_t block_size = 0; vsi_ssize_t height = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -227,7 +229,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input2_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(coord_dim); input0_dtype = vsi_nn_kernel_map_dtype( 
inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); @@ -284,6 +288,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t *input_size = inputs[2]->attr.size; uint32_t dims_num = inputs[2]->attr.dim_num; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (coord_dim > 4 && input_size[dims_num - 1] > 1) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/select_cl.c b/src/tim/vx/internal/src/kernel/cl/select_cl.c index 53b1fcdd9..ab449010a 100644 --- a/src/tim/vx/internal/src/kernel/cl/select_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/select_cl.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_eltwise.h" __BEGIN_DECLS @@ -62,6 +63,10 @@ typedef enum _internal_img_dim_e CVIVANTE_NAMESPACE("cl.select_"STR(COND_DTYPE)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ _SELECT_KERNEL_SOURCE} +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + typedef struct { uint32_t key; @@ -111,7 +116,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer) size_t param_size ) { - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -125,6 +130,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -247,19 +254,73 @@ static vsi_nn_kernel_node_t _setup float input1Scale = vsi_nn_get_tensor_scale(inputs[2]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[2]); + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t* shapes_ptr[_IO_NUM]; + vsi_size_t* shapes_in[_INPUT_NUM]; + vsi_size_t rank_in[_INPUT_NUM]; + uint32_t new_rank = 0; + uint32_t i = 0; + vsi_bool ret = FALSE; + + VSI_UNREFERENCED(params); + input0Scale = input0Scale / outputScale; input1Scale = input1Scale / outputScale; input0Tail = outputZP - input0Tail * input0Scale; input1Tail = outputZP - input1Tail * input1Scale; - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + + for (i = 0; i < _IO_NUM; i++) + { + shapes_ptr[i] = shapes[i]; + } + + for (i = 0; i < _INPUT_NUM; i++) + { + shapes_in[i] = inputs[i]->attr.size; + rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num; + } + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const vsi_size_t**)shapes_in, rank_in, _INPUT_NUM, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes_ptr, shapes[_INPUT_NUM], &new_rank); + + if ( ret ) + { + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], shapes[i], new_rank ); + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[i], shapes[i + _INPUT_NUM], new_rank ); + } + } + else + { + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = inputs[i]; + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + reshape_tensors[i + _INPUT_NUM] = outputs[i]; + } + } + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[3]->attr.size, + reshape_tensors[3]->attr.dim_num ) ) { return NULL; } - image_2d = (outputs[0]->attr.dim_num == 2 || 
outputs[0]->attr.size[2] == 1); - status = _query_kernel( kernel, inputs, outputs, image_2d); + image_2d = (reshape_tensors[3]->attr.dim_num == 2); + status = _query_kernel( kernel, inputs, &reshape_tensors[3], image_2d); if( VSI_SUCCESS == status) { @@ -268,7 +329,7 @@ static vsi_nn_kernel_node_t _setup { /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM, - inputs, input_num, outputs, output_num ); + &reshape_tensors[0], input_num, &reshape_tensors[3], output_num ); node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail ); node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); @@ -283,6 +344,15 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); } } + + if (ret) + { + for (i = 0; i < _IO_NUM; i++) + { + vsi_safe_release_tensor( reshape_tensors[i] ); + } + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c index d65200d33..4c620f4ce 100644 --- a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c @@ -116,6 +116,8 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -155,7 +157,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -204,6 +206,8 @@ static int32_t _optimize_mask_shape vsi_size_t new_rank = 0; uint32_t i = 0; + VSI_UNREFERENCED(outputs); + for(i = 0; i < inputs[0]->attr.dim_num; i++) { in_shape[i] = inputs[0]->attr.size[i]; @@ -253,6 +257,9 @@ static vsi_nn_kernel_node_t _setup float input_zpScale = 0; float outputVal1 = 1.0f; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c b/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c index 7aee0e0af..7a2bef62f 100644 --- a/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c @@ -95,6 +95,8 @@ DEF_KERNEL_INITIALIZER(_signal_frame_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -199,6 +201,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* rs_tensors[2] = { NULL }; vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + for (i = 0; i < axis; i++) { inner *= inputs[0]->attr.size[i]; diff --git a/src/tim/vx/internal/src/kernel/cl/slice_cl.c b/src/tim/vx/internal/src/kernel/cl/slice_cl.c index 
4900bb129..d3379bbfe 100644 --- a/src/tim/vx/internal/src/kernel/cl/slice_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/slice_cl.c @@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -245,6 +247,8 @@ static vsi_nn_kernel_node_t _setup float outputScale = vsi_nn_get_tensor_scale(outputs[0]); float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; + VSI_UNREFERENCED(params); + outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, diff --git a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c index 7c7a59a2f..3bca54f63 100644 --- a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c @@ -114,6 +114,8 @@ DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer) vsi_ssize_t height = 0; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -155,7 +157,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -215,6 +217,9 @@ static vsi_nn_kernel_node_t _setup float scaleInOut = 1.0f; float zpInOut = 0.0f; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + scaleInOut = inputScale / outputScale; zpInOut = outputZp - inputZp * scaleInOut; diff --git a/src/tim/vx/internal/src/kernel/cl/swish_cl.c b/src/tim/vx/internal/src/kernel/cl/swish_cl.c index b616a84ac..97d0db96b 100644 --- a/src/tim/vx/internal/src/kernel/cl/swish_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/swish_cl.c @@ -167,11 +167,13 @@ DEF_KERNEL_INITIALIZER(_swish_initializer) {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor output = (vx_tensor)param[1]; vsi_nn_kernel_tensor_attr_t * attr_out = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -293,6 +295,9 @@ static vsi_nn_kernel_node_t _setup vx_float32 logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); vsi_bool ret = FALSE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + #if (VX_ACTIVATION_EXT_SUPPORT) if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) { diff --git a/src/tim/vx/internal/src/kernel/cl/tile_cl.c b/src/tim/vx/internal/src/kernel/cl/tile_cl.c index 63816947e..266b8ed6a 100644 --- a/src/tim/vx/internal/src/kernel/cl/tile_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/tile_cl.c @@ -106,6 +106,7 @@ static const struct { TENSOR_TILE_AXIS0_UINT32(U32, U32) TENSOR_TILE_AXIS0_FLOAT(F16, F16) TENSOR_TILE_AXIS0_FLOAT(F32, F32) + TENSOR_TILE_AXIS0_KERNELS(F32, U32) TENSOR_TILE_AXIS0_INT32_2D(I8, I8) TENSOR_TILE_AXIS0_INT32_2D(I16, I16) @@ -114,6 
+115,7 @@ static const struct { TENSOR_TILE_AXIS0_UINT32_2D(U32, U32) TENSOR_TILE_AXIS0_FLOAT_2D(F16, F16) TENSOR_TILE_AXIS0_FLOAT_2D(F32, F32) + TENSOR_TILE_AXIS0_KERNELS_2D(F32, U32) }; /* @@ -130,6 +132,8 @@ static vx_param_description_t kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) @@ -140,6 +144,8 @@ static vx_param_description_t kernel_param_def[] = #define SCALAR_INPUT_MULTIPLES_1 (6) #define SCALAR_INPUT_MULTIPLES_2 (7) #define SCALAR_INPUT_MULTIPLES_3 (8) +#define IN_OUT_SCALE (9) +#define IN_OUT_TAIL (10) /* * Kernel initializer @@ -163,6 +169,8 @@ DEF_KERNEL_INITIALIZER(_tile_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -205,10 +213,29 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input_dtype == F16) + { + input_dtype = F32; + } + else if (input_dtype == U8) + { + input_dtype = U32; + } + + if (output_dtype == F16) + { + output_dtype = F32; + } + else if (output_dtype == U8) + { + output_dtype = U32; + } + + key = HASH_TILE_AXIS0_KEY( input_dtype, output_dtype, image_2d ); for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) @@ -280,6 +307,16 @@ static vsi_nn_kernel_node_t _setup vsi_bool ret = FALSE; uint32_t dim = inputs[0]->attr.dim_num; vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float inoutScale = inputScale / outputScale; + float inoutTail = outputTail - inputTail * inoutScale; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); for ( i = 0; i < dim; i++) { @@ -299,10 +336,34 @@ static vsi_nn_kernel_node_t _setup return NULL; } - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], shapes[0], new_rank ); - reshape_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], shapes[2], new_rank ); + if ( new_rank == 4) + { + vsi_size_t newshapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + newshapes[0][0] = shapes[0][0]; + newshapes[2][0] = shapes[2][0]; + newshapes[0][1] = shapes[0][1]; + newshapes[2][1] = shapes[2][1]; + newshapes[0][2] = shapes[0][2] * shapes[0][3]; + newshapes[2][2] = shapes[2][2] * shapes[2][3]; + + if (newshapes[0][2] >= GPU_TENSOR_MAX_WIDTH || + newshapes[2][2] >= GPU_TENSOR_MAX_WIDTH) + { + return NULL; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], newshapes[0], 3 ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], newshapes[2], 3 ); + } + else + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank 
); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + } } else { @@ -315,7 +376,7 @@ static vsi_nn_kernel_node_t _setup goto final; } - image_2d = ((reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1)); + image_2d = reshape_tensors[0]->attr.dim_num == 2; status = _query_kernel( &reshape_tensors[0], &reshape_tensors[1], image_2d, kernel ); if( VSI_SUCCESS == status) { @@ -323,13 +384,16 @@ static vsi_nn_kernel_node_t _setup if( node ) { - uint32_t depthIn = (uint32_t)(new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1); - uint32_t depthOut = (uint32_t)(new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1); - uint32_t batchIn = (uint32_t)(new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1); + uint32_t depthIn = (uint32_t)(new_rank > 2 ? shapes[0][2] : 1); + uint32_t depthOut = (uint32_t)(new_rank > 2 ? shapes[2][2] : 1); + uint32_t batchIn = (uint32_t)(new_rank > 3 ? shapes[0][3] : 1); vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, &reshape_tensors[0], 1, &reshape_tensors[1], 1 ); + shapes[1][2] = shapes[1][2] == 0 ? 1 : shapes[1][2]; + shapes[1][3] = shapes[1][3] == 0 ? 1 : shapes[1][3]; + /* Pass parameters to node. */ node_params[SCALAR_INPUT_BATCH_IN] = vsi_nn_kernel_scalar_create( graph, I32, &batchIn ); @@ -338,14 +402,17 @@ static vsi_nn_kernel_node_t _setup node_params[SCALAR_INPUT_DEPTH_OUT] = vsi_nn_kernel_scalar_create( graph, I32, &depthOut ); node_params[SCALAR_INPUT_MULTIPLES_0] = vsi_nn_kernel_scalar_create( - graph, I32, &multiples[0] ); + graph, I32, &shapes[1][0] ); node_params[SCALAR_INPUT_MULTIPLES_1] = vsi_nn_kernel_scalar_create( - graph, I32, &multiples[1] ); + graph, I32, &shapes[1][1] ); node_params[SCALAR_INPUT_MULTIPLES_2] = vsi_nn_kernel_scalar_create( - graph, I32, &multiples[2] ); + graph, I32, &shapes[1][2] ); node_params[SCALAR_INPUT_MULTIPLES_3] = vsi_nn_kernel_scalar_create( - graph, I32, &multiples[3] ); - + graph, I32, &shapes[1][3] ); + node_params[IN_OUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &inoutScale ); + node_params[IN_OUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &inoutTail ); status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); VSI_ASSERT( status == VSI_SUCCESS ); @@ -356,6 +423,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_1] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_2] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_3] ); + vsi_nn_kernel_scalar_release( &node_params[IN_OUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[IN_OUT_TAIL] ); } } diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index 0354a1e3f..3d6884065 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -181,6 +181,8 @@ DEF_KERNEL_INITIALIZER(_topk_initializer) vsi_size_array_t * in_shape = NULL; int32_t num_stages = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -222,6 +224,8 @@ DEF_KERNEL_INITIALIZER(_topk_odd_even_sort_initializer) vsi_nn_kernel_tensor_attr_t * input_attr = NULL; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr 
buffer fail.", final ); @@ -424,7 +428,7 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_TOPK_ODD_EVEN_SORT_PARAM_NUM]; + vsi_nn_kernel_node_param_t node_params[_TOPK_ODD_EVEN_SORT_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_size_t block_size = inputs[0]->attr.size[0]; vsi_size_t block_num = 1; @@ -473,8 +477,10 @@ static vsi_nn_kernel_node_t _setup rs_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shape[1], 2 ); + CHECK_PTR_FAIL_GOTO(rs_tensors[1], "Create tensor failed", final); rs_tensors[2] = vsi_nn_reshape_tensor( graph, outputs[1], shape[1], 2 ); + CHECK_PTR_FAIL_GOTO(rs_tensors[2], "Create tensor failed", final); } else { @@ -484,14 +490,17 @@ static vsi_nn_kernel_node_t _setup memcpy( &attr, &(rs_tensors[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); rs_tensors[1] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(rs_tensors[1], "Create tensor failed", final); attr.dtype.vx_type = VSI_NN_TYPE_INT32; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; rs_tensors[2] = vsi_nn_CreateTensor( graph, &attr ); - + CHECK_PTR_FAIL_GOTO(rs_tensors[2], "Create tensor failed", final); rs_tensors[3] = vsi_nn_reshape_tensor( graph, outputs[0], shape[1], 2 ); + CHECK_PTR_FAIL_GOTO(rs_tensors[3], "Create tensor failed", final); rs_tensors[4] = vsi_nn_reshape_tensor( graph, outputs[1], shape[1], 2 ); + CHECK_PTR_FAIL_GOTO(rs_tensors[4], "Create tensor failed", final); input_num = 3; } @@ -505,10 +514,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( node_params, param_num, rs_tensors, input_num, &rs_tensors[input_num], output_num ); /* Pass parameters to node. */ - node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputScale ); - node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputTail ); node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &outputScale ); - node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &outputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &outputTail ); if (is_odd_even_sort) { node_params[SCALAR_INPUT_SIZE] = vsi_nn_kernel_scalar_create( diff --git a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c index 6f469883a..d2c33870a 100644 --- a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c @@ -123,12 +123,14 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer) {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor input = (vx_tensor)param[0]; vsi_nn_kernel_tensor_attr_t * attr_in = NULL; vsi_size_array_t * in_shape = NULL; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(param_size); + attr_in = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input ); CHECK_PTR_FAIL_GOTO( attr_in, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c index c241e1e16..e0b4517a2 100644 --- a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c @@ -109,7 +109,7 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer) { #define _PACK_A_TIMES_B_PLUS_C_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ (( IN2_TYPE << 24) | ( 
IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE)) - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer) vsi_size_array_t *output_shape = NULL; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( attr[0], "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -331,6 +333,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool ret = FALSE; vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + VSI_UNREFERENCED(params); + for (i = 0; i < _IO_NUM; i++) { shapes_ptr[i] = shapes[i]; diff --git a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c index 679a07d9a..e1861a262 100644 --- a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c @@ -90,7 +90,7 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) size_t param_size ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 2, @@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) float dimRatio = 0.0f; int32_t width = 0; + VSI_UNREFERENCED(param_size); + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1); diff --git a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c index 3fe4185ba..f5010111c 100644 --- a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c @@ -173,6 +173,8 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) vsi_size_array_t * output_shape = NULL; uint32_t packedArgIdx[4] = {0}; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -413,7 +415,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -469,6 +471,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/evis/argmin_evis.c b/src/tim/vx/internal/src/kernel/evis/argmin_evis.c index bce04ac52..90713e08b 100644 --- a/src/tim/vx/internal/src/kernel/evis/argmin_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/argmin_evis.c @@ -166,6 +166,8 @@ DEF_KERNEL_INITIALIZER(_argmin_initializer) vsi_size_array_t * output_shape = NULL; uint32_t packedArgIdx[4] = {0}; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)param[1] ); @@ -351,7 +353,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -397,6 +399,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c index a794ee542..80a1b21ea 100644 --- a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c @@ -188,7 +188,7 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer) #define _PACK_BATCH_NORM_KEY( IN_TYPE, OUT_TYPE ) \ ( ( IN_TYPE << 16) | ( OUT_TYPE ) ) - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -208,6 +208,8 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer) float output_zp = 0; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); diff --git a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c index 01ea2ab4d..553f8b739 100644 --- a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c @@ -58,8 +58,8 @@ typedef enum #define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ { \ BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ - CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ - _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ } typedef struct @@ -139,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) float output_scale = 1.0; int32_t outputZP = 0; + VSI_UNREFERENCED(param_size); + input_attr[0] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]); CHECK_PTR_FAIL_GOTO( @@ -418,14 +420,17 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP", &input0ZP); status |= vsi_nn_kernel_gpu_add_param(node, "uint8Scale", &uint8Scale); status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &uint8ZP_out); - status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_left_4x4", + &uniU8SubZPtoFp32_left_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); if (U8 == input1_dtype) { status |= vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP); status |= vsi_nn_kernel_gpu_add_param(node, "input1Scale", 
&input1_scale); - status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part0_4x4", &uniU8SubZPtoFp32_part0_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part1_4x4", &uniU8SubZPtoFp32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part0_4x4", + &uniU8SubZPtoFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part1_4x4", + &uniU8SubZPtoFp32_part1_4x4); } else if (F16 == input1_dtype) { status |= vsi_nn_kernel_gpu_add_param( @@ -552,9 +557,9 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) gpu_param.global_scale[2] = 1; gpu_param.dim = 2; - gpu_param.global_size[0] = gpu_align_p2( + gpu_param.global_size[0] = (out_width + gpu_param.global_scale[0] - 1) / - gpu_param.global_scale[0], 4); + gpu_param.global_scale[0]; gpu_param.global_size[1] = ((out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]); diff --git a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c index d7074c3db..75623dda3 100644 --- a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c @@ -109,6 +109,8 @@ DEF_KERNEL_INITIALIZER(_bucketize_initializer) vsi_size_array_t * input0_shape = NULL; vsi_size_array_t * input1_shape = NULL; + VSI_UNREFERENCED(param_size); + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input0_attr, "Create tensor attr buffer fail.", final ); input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/cast_evis.c b/src/tim/vx/internal/src/kernel/evis/cast_evis.c index f36e100b1..7908dd581 100644 --- a/src/tim/vx/internal/src/kernel/evis/cast_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cast_evis.c @@ -150,6 +150,8 @@ DEF_KERNEL_INITIALIZER(_cast_initializer) vsi_nn_kernel_tensor_attr_t * input_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -289,6 +291,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(params); + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/clip_evis.c b/src/tim/vx/internal/src/kernel/evis/clip_evis.c index 87784bf31..add96c2c0 100644 --- a/src/tim/vx/internal/src/kernel/evis/clip_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/clip_evis.c @@ -142,6 +142,8 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) int32_t srcFixPointPos = 0; int32_t dstFixPointPos = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c index 2fb8330de..4547dfb11 100644 --- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -308,6 +308,8 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) float input1Scale = 1.0f; float input1Tail = 0; + VSI_UNREFERENCED(param_size); + attr[0] = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -365,7 +367,6 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) / gpu_param.global_scale[1]); gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; - if (1) { gpu_dp_inst_t uniExtractInteger_2x8 = {{ 0x33333333, // TCfg @@ -475,7 +476,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -527,6 +528,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret = FALSE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + ret = vsi_nn_kernel_optimize_eltwise_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, inputs[1]->attr.size, inputs[1]->attr.dim_num, @@ -543,11 +547,11 @@ static vsi_nn_kernel_node_t _setup outputs[0], shapes[2], new_rank ); #define _swap_tensor(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } if (shapes[1][3] > shapes[0][3] && new_rank == 4) { diff --git a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c index 8e5d05e6c..e5669b0fd 100644 --- a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c @@ -134,6 +134,8 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer) int32_t input_width = 0; int32_t output_width = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c index cad8476a6..dbdd513ab 100644 --- a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c @@ -36,6 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" __BEGIN_DECLS @@ -47,21 +48,29 @@ __BEGIN_DECLS #define KERNEL_SOURCE_2 "cumsum_2d" #define KERNEL_SOURCE_3 "cumsum_bf16" #define KERNEL_SOURCE_4 "cumsum_f16_u8" +#define KERNEL_SOURCE_5 "cumsum_ex_rev_axis0" +#define KERNEL_SOURCE_6 "cumsum_ex_rev_axis1" +#define KERNEL_SOURCE_7 "cumsum_ex_rev_axis2" // Add kernel hashtable here -#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ - ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) +#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d) \ + ((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) #define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \ CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ SOURCE }, #define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \ CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ SOURCE }, +#define 
HASH_CUMSUM_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \ + CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -108,6 +117,24 @@ static const struct { HASH_CUMSUM_KERNELS_2D(1, F16, U8, KERNEL_SOURCE_4) HASH_CUMSUM_KERNELS_2D(1, F16, I8, KERNEL_SOURCE_4) HASH_CUMSUM_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(0, U8, U8, KERNEL_SOURCE_5) + HASH_CUMSUM_EX_REV_KERNELS(0, I8, I8, KERNEL_SOURCE_5) + HASH_CUMSUM_EX_REV_KERNELS(0, I16, I16, KERNEL_SOURCE_5) + HASH_CUMSUM_EX_REV_KERNELS(0, F16, F16, KERNEL_SOURCE_5) + HASH_CUMSUM_EX_REV_KERNELS(1, U8, U8, KERNEL_SOURCE_6) + HASH_CUMSUM_EX_REV_KERNELS(1, I8, I8, KERNEL_SOURCE_6) + HASH_CUMSUM_EX_REV_KERNELS(1, I16, I16, KERNEL_SOURCE_6) + HASH_CUMSUM_EX_REV_KERNELS(1, F16, F16, KERNEL_SOURCE_6) + HASH_CUMSUM_EX_REV_KERNELS(2, U8, U8, KERNEL_SOURCE_7) + HASH_CUMSUM_EX_REV_KERNELS(2, I8, I8, KERNEL_SOURCE_7) + HASH_CUMSUM_EX_REV_KERNELS(2, I16, I16, KERNEL_SOURCE_7) + HASH_CUMSUM_EX_REV_KERNELS(2, F16, F16, KERNEL_SOURCE_7) + HASH_CUMSUM_EX_REV_KERNELS(1, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(1, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(1, F16, I16, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(2, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(2, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(2, F16, I16, KERNEL_SOURCE_4) }; /* @@ -143,6 +170,8 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) {0, 0, 0}}; // globalWorkSize: image size in thread int32_t axis = 0; + int32_t exclusive = 0; + int32_t reverse = 0; int32_t width = 0; int32_t height = 0; int32_t channel = 0; @@ -161,6 +190,8 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -168,6 +199,10 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &exclusive); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse); + CHECK_STATUS_FAIL_GOTO(status, OnError ); if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { @@ -204,7 +239,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) } in_out_scale = input_scale * output_scale; - in_out_zp_scale = (float)in_out_scale * input_zp; + in_out_zp_scale = (float)in_out_scale * input_zp * (-1); input_shape = attr[0]->shape; dim = (uint32_t)input_shape->size; @@ -460,14 +495,121 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumHorzRevF16toF16A_4x4 = {{ + 0x01051555, // TCfg + 0x00000000, // ASelt + 0x05674567, 0x00070067, // ABin + 0x020a2aaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00003c00, + 0x3c003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumHorzRevF16toF16B_4x4 = {{ + 0x01051555, // TCfg + 0x00000000, // ASelt + 0x01230123, 
0x00030023, // ABin + 0x020a2aaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00003c00, + 0x3c003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumHorzRevF16toF16C_2x8 = {{ + 0x11115555, // TCfg + 0x00000000, // ASelt + 0x43424140, 0x07060504, // ABin + 0x2222aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniAccSumHorzRevF16toF16_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumHorzRevU8toI16A_4x4 = {{ + 0x01051555, // TCfg + 0x00000000, // ASelt + 0x05674567, 0x00070067, // ABin + 0x020a2aaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00000001, + 0x00010001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumHorzRevU8toI16B_8x4 = {{ + 0x15555555, 0x01550555, // TCfg + 0x443214c7, 0x3214c700, 0x14c70044, 0xc7000432, 0x00003214, // BinSelect + 0x00000700, // AccumType, ConstantType, and PostShift + 0x01010101, 0x01010101, 0x01010101, 0x00010101, + 0x01010101, 0x00000101, 0x01010101, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSubZpRevI16toI16_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00080001, 0x00070001, 0x00060001, 0x00050001, + 0x00040001, 0x00030001, 0x00020001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniAccSumHorzRevI16toI32A_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniAccSumHorzRevI16toI32B_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_quantize_multiplier_16bit( (double)input_scale * output_scale, &M0, &postShift); multAndoutZP0[0] = (uint32_t)(M0); multAndoutZP0[1] = (uint32_t)((attr[1]->asymm.zero_point << postShift) - input_zp * M0); gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); - status = vsi_nn_kernel_gpu_add_param(node, "width", &width); - status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); - CHECK_STATUS_FAIL_GOTO(status, OnError ); + if ((exclusive || reverse) && axis == 0) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzRevF16toF16A_4x4", &uniSumHorzRevF16toF16A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzRevF16toF16B_4x4", 
&uniSumHorzRevF16toF16B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzRevF16toF16C_2x8", &uniSumHorzRevF16toF16C_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzRevF16toF16_2x8", &uniAccSumHorzRevF16toF16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzRevU8toI16A_4x4", &uniSumHorzRevU8toI16A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzRevU8toI16B_8x4", &uniSumHorzRevU8toI16B_8x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSubZpRevI16toI16_2x8", &uniSubZpRevI16toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzRevI16toI32A_4x4", &uniAccSumHorzRevI16toI32A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzRevI16toI32B_4x4", &uniAccSumHorzRevI16toI32B_4x4 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } switch( pack_key ) { @@ -477,7 +619,6 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) case _PACK_SELECT_KEY( F16, F16, 2, 3): { status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); - status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale); status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale); @@ -493,47 +634,21 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) "uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, U8, 0, 2): - case _PACK_SELECT_KEY( U8, U8, 1, 2): - case _PACK_SELECT_KEY( U8, U8, 0, 3): case _PACK_SELECT_KEY( U8, U8, 1, 3): - case _PACK_SELECT_KEY( I8, I8, 0, 2): - case _PACK_SELECT_KEY( I8, I8, 1, 2): - case _PACK_SELECT_KEY( I8, I8, 0, 3): case _PACK_SELECT_KEY( I8, I8, 1, 3): - case _PACK_SELECT_KEY( I16, I16, 0, 2): - case _PACK_SELECT_KEY( I16, I16, 1, 2): - case _PACK_SELECT_KEY( I16, I16, 0, 3): case _PACK_SELECT_KEY( I16, I16, 1, 3): - case _PACK_SELECT_KEY( F16, F16, 0, 2): - case _PACK_SELECT_KEY( F16, F16, 1, 2): - case _PACK_SELECT_KEY( F16, F16, 0, 3): case _PACK_SELECT_KEY( F16, F16, 1, 3): + case _PACK_SELECT_KEY( U8, U8, 1, 2): + case _PACK_SELECT_KEY( I8, I8, 1, 2): + case _PACK_SELECT_KEY( I16, I16, 1, 2): + case _PACK_SELECT_KEY( F16, F16, 1, 2): { - status = vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status = 
vsi_nn_kernel_gpu_add_param(node, "height", &height); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale); status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale); @@ -547,6 +662,26 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) "uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, U8, 0, 2): + case _PACK_SELECT_KEY( U8, U8, 0, 3): + case _PACK_SELECT_KEY( I8, I8, 0, 2): + case _PACK_SELECT_KEY( I8, I8, 0, 3): + case _PACK_SELECT_KEY( I16, I16, 0, 2): + case _PACK_SELECT_KEY( I16, I16, 0, 3): + case _PACK_SELECT_KEY( F16, F16, 0, 2): + case _PACK_SELECT_KEY( F16, F16, 0, 3): + { + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, @@ -578,7 +713,9 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) case _PACK_SELECT_KEY( BF16, BF16, 1, 3): case _PACK_SELECT_KEY( BF16, BF16, 2, 3): { - status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &channel); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); status |= vsi_nn_kernel_gpu_add_param( @@ -604,7 +741,9 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) case _PACK_SELECT_KEY( F16, I16, 1, 3): case _PACK_SELECT_KEY( F16, I16, 2, 3): { - status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &channel); status |= vsi_nn_kernel_gpu_add_param( node, "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8); status |= vsi_nn_kernel_gpu_add_param( @@ -655,21 +794,24 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel, const vsi_nn_kernel_param_t * params, int32_t axis, - int32_t is_2d + int32_t is_2d, + int32_t is_ex_rev ) { vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d); + key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d); - for( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) + for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) { if ( cumsum_map[i].key == key ) { @@ -716,17 +858,35 @@ static vsi_nn_kernel_node_t _setup int32_t axis_new = 0; int32_t 
is_2d = 0; uint32_t rs_dim = 2; - int32_t i = 0; + uint32_t i = 0; + int32_t is_ex_or_rev = exclusive || reverse; - vsi_nn_kernel_optimize_softmax_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, - shapes[0], &rs_dim, &axis_new); - if (exclusive || reverse || rs_dim > 3) + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + if (axis < 0) + { + axis_new = 0; + shapes[0][0] = 1; + shapes[0][1] = 1; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + shapes[0][0] *= inputs[0]->attr.size[i]; + } + rs_dim = 2; + } + else + { + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rs_dim, &axis_new); + } + if (rs_dim > 3) { return NULL; } - if (rs_dim == 2) + if (rs_dim == 2 && is_ex_or_rev == 0) { is_2d = 1; } @@ -736,7 +896,7 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shapes[0], (vsi_size_t)rs_dim ); - status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d); + status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -754,6 +914,14 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[2] ); vsi_nn_kernel_scalar_release( &tmp_params[3] ); vsi_nn_kernel_scalar_release( &tmp_params[4] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &outputs[0]->attr.dtype); + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } } } diff --git a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c index de5aa8326..9d464623f 100644 --- a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c @@ -152,6 +152,8 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -363,7 +365,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -422,6 +426,9 @@ static vsi_nn_kernel_node_t _setup int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t blk_flg = block_size == 2 ? 
1 : 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index 45c4073fd..a2f10ce82 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -197,6 +197,8 @@ DEF_KERNEL_INITIALIZER(_depthwise_conv1d_initializer) vx_context ctx = vxGetContext((vx_reference)node); uint64_t pack_key = 0; + VSI_UNREFERENCED(param_size); + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); CHECK_STATUS_FAIL_GOTO(status, final); @@ -729,7 +731,9 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[0] = inputs[0]; - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { shape[0] = inputs[1]->attr.size[0]; shape[1] = 1; @@ -811,7 +815,9 @@ static vsi_nn_kernel_node_t _setup } final: - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { vsi_nn_ReleaseTensor( &reshape_tensors[1] ); } diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c index ee5faf1c3..aa781c8d8 100644 --- a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c @@ -122,6 +122,8 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) int32_t input1_ZP = 0; int32_t input0_ZP = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c index bc849b4da..5359233ba 100644 --- a/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c @@ -145,7 +145,13 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { - + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); return NULL; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index 23b1433a7..5d383a15e 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -223,6 +223,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) float beta = 0; uint32_t pack_key; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -467,7 +469,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e 
output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -518,6 +520,9 @@ static vsi_nn_kernel_node_t _setup float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); diff --git a/src/tim/vx/internal/src/kernel/evis/erf_evis.c b/src/tim/vx/internal/src/kernel/evis/erf_evis.c index a4203164a..ebc8ad8f2 100644 --- a/src/tim/vx/internal/src/kernel/evis/erf_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/erf_evis.c @@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_erf_initializer) float outputZP = 0; uint32_t pack_key; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -376,6 +378,10 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_bool ret = FALSE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); diff --git a/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c b/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c index 627e48b58..eec0f08e0 100644 --- a/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c @@ -97,7 +97,10 @@ DEF_KERNEL_INITIALIZER(_extra_ending_initializer) vsi_nn_kernel_tensor_attr_t * attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); out_shape = attr->shape; gpu_param.global_scale[0] = 8; @@ -136,6 +139,8 @@ static vsi_status _query_kernel uint32_t key = 0; uint32_t i = 0; + VSI_UNREFERENCED(inputs); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); key = EXTRA_ENDING_HASH_KEY( out_dtype ); @@ -186,6 +191,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; int32_t i = 0; + VSI_UNREFERENCED(params); + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, shapes[0], &rank[0]); vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, diff --git a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c index be1bd1714..86d4d585b 100644 --- a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c @@ -120,7 +120,7 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) {0, 0, 0}, {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor input0 = (vx_tensor)param[0]; vx_tensor input1 = (vx_tensor)param[1]; vx_tensor output = (vx_tensor)param[2]; @@ -139,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) float in1Tail = 0; float outZp = 0; + VSI_UNREFERENCED(param_size); + input0_attr = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)input0 ); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -402,6 +404,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(params); + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c index 0554d1124..07f159311 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c @@ -51,18 +51,31 @@ typedef enum #define STR(a) #a // Add kernel hashtable here -#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D ) \ - (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 )) +#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D , BEYOND_MAXWIDTH) \ + (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ) |\ + (BEYOND_MAXWIDTH << 28)) #define PACK_KERNEL_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ - { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 0), \ CVIVANTE_NAMESPACE("evis.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ _GATHER_ELEMENTS_KERNEL_SOURCE} #define PACK_KERNEL_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ - { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 0), \ CVIVANTE_NAMESPACE("evis.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ _GATHER_ELEMENTS_KERNEL_SOURCE} +#define PACK_KERNEL_BEYOND_MAXWIDTH_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 1), \ + CVIVANTE_NAMESPACE("evis.gather_elements_beyond_maxwidth_axis"STR(AXIS)\ + "_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + _GATHER_ELEMENTS_KERNEL_SOURCE} + +#define PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 1), \ + CVIVANTE_NAMESPACE("evis.gather_elements_beyond_maxwidth_axis"STR(AXIS)\ + "_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _GATHER_ELEMENTS_KERNEL_SOURCE} + typedef struct { uint32_t key; @@ -94,6 +107,32 @@ static const _kernel_map_type _gather_elements_kernel_map[] = PACK_KERNEL_2D_MAP( 1, I16, I32, I16 ), PACK_KERNEL_2D_MAP( 1, I8, I32, I8 ), PACK_KERNEL_2D_MAP( 1, U8, I32, U8 ), + + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, U8, I32, U8 ), + + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I16, I32, I16 
), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, U8, I32, U8 ), }; @@ -128,26 +167,48 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer) {0, 0, 0}, {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr0 = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr1 = NULL; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; int32_t axis = 0; int32_t axis_size = 0; - - input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); - CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + uint32_t width0 = 0; + uint32_t height0 = 0; + uint32_t width1 = 0; + uint32_t height1 = 0; + uint32_t width_out = 0; + uint32_t height_out = 0; + uint32_t depth0 = 0; + uint32_t depth1 = 0; + + VSI_UNREFERENCED(param_size); + + input_attr0 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr0, "Create tensor attr buffer fail.", final ); + input_attr1 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( input_attr1, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); out_shape = output_attr->shape; - axis_size = (int32_t)input_attr->shape->data[axis]; + axis_size = (int32_t)input_attr0->shape->data[axis]; if (axis == 0) { gpu_param.global_scale[0] = 4; } + width0 = (uint32_t)input_attr0->shape->data[0]; + height0 = (uint32_t)input_attr0->shape->data[1]; + depth0 = input_attr0->shape->size > 2 ? (uint32_t)input_attr0->shape->data[2] : 1; + width1 = (uint32_t)input_attr1->shape->data[0]; + height1 = (uint32_t)input_attr1->shape->data[1]; + depth1 = input_attr1->shape->size > 2 ? (uint32_t)input_attr1->shape->data[2] : 1; + width_out = (uint32_t)output_attr->shape->data[0]; + height_out = (uint32_t)output_attr->shape->data[1]; + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; gpu_param.global_size[0] = gpu_align_p2( (out_shape->data[0] + gpu_param.global_scale[0] - 1) @@ -157,13 +218,31 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer) / gpu_param.global_scale[1]); gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + if (width0 >= GPU_TENSOR_MAX_WIDTH || + width1 >= GPU_TENSOR_MAX_WIDTH || + height0 >= GPU_TENSOR_MAX_WIDTH || + height1 >= GPU_TENSOR_MAX_WIDTH || + depth0 >= GPU_TENSOR_MAX_WIDTH || + depth1 >= GPU_TENSOR_MAX_WIDTH) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_size[0] = out_shape->data[0]; + } + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); status |= vsi_nn_kernel_gpu_add_param( node, "axis_size", &axis_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "width0", &width0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "height0", &height0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "width1", &width1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "height1", &height1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "width_out", &width_out ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_out", &height_out ); CHECK_STATUS_FAIL_GOTO(status, final ); final: #define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } - SAFE_FREE_TENSOR_ATTR(input_attr); + SAFE_FREE_TENSOR_ATTR(input_attr0); + SAFE_FREE_TENSOR_ATTR(input_attr1); SAFE_FREE_TENSOR_ATTR(output_attr); return status; } /* _gather_elements_initializer() */ @@ -190,6 +269,9 @@ static vsi_status _query_kernel vx_param_description_t * param_def = _gather_elements_kernel_param_def; vx_kernel_initialize_f initializer = _gather_elements_initializer; int32_t img_2d = (outputs[0]->attr.dim_num < 3 || outputs[0]->attr.size[2] == 1) ? 1 : 0; + int32_t beyond_maxwidth = 0; + vsi_size_t depth0 = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + vsi_size_t depth1 = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; uint32_t key; uint32_t i; @@ -207,7 +289,17 @@ static vsi_status _query_kernel out_dtype = F16; } - key = GATHER_ELEMENTS_HASH_KEY( axis, in0_dtype, in1_dtype, out_dtype, img_2d ); + if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH || + inputs[0]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH || + inputs[1]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH || + inputs[1]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH || + depth0 >= GPU_TENSOR_MAX_WIDTH || + depth1 >= GPU_TENSOR_MAX_WIDTH) + { + beyond_maxwidth = 1; + } + + key = GATHER_ELEMENTS_HASH_KEY( axis, in0_dtype, in1_dtype, out_dtype, img_2d, beyond_maxwidth ); for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index 499bc5a28..ba7ad75f4 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -294,6 +294,8 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -491,6 +493,8 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -692,7 +696,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + 
VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -768,6 +774,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3; int32_t i = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (axis == 0) { status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, batch_dims, 0, &is_array); diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index 355e90857..91c8f1744 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -148,7 +148,7 @@ static vsi_status get_gather_nd_tensor_reshape_size vsi_size_t block_size, uint32_t coordDim, int32_t* newDim, - int32_t batch_dims + uint32_t batch_dims ) { vsi_status status = VSI_FAILURE; @@ -175,17 +175,23 @@ static vsi_status get_gather_nd_tensor_reshape_size if (batch_dims) { + int32_t rank = 1; for (i = 0; i < offset; i++) { sizes[0] *= input_size[i]; } - for (i = 0; i < coordDim; i++) + for (i = 0; i < coordDim - 1; i++) { - sizes[i + 1] = input_size[i + offset]; + sizes[rank++] = input_size[i + offset]; } - newDim[0] = coordDim == 1 ? 2 : 3; + for (i = 0; i < batch_dims; i++) + { + sizes[rank] *= input_size[dims_num - i - 1]; + } + + newDim[0] = rank + 1; } else { @@ -215,13 +221,27 @@ static vsi_status get_gather_nd_tensor_reshape_size } else // indices&output reshape { - if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH && batch_dims == 0) { sizes[0] = block_size; sizes[1] = elementCnt / block_size; status = VSI_SUCCESS; newDim[0] = 2; } + else if (batch_dims > 0) + { + vsi_size_t batch_cnt = 1; + for (i = 0; i < batch_dims; ++i) + { + batch_cnt *= input_size[dims_num - i - 1]; + } + + sizes[0] = block_size; + sizes[1] = (elementCnt / block_size) / batch_cnt; + sizes[2] = batch_cnt; + status = VSI_SUCCESS; + newDim[0] = 3; + } } #undef VSI_NN_MAX_IMAGE_WIDTH @@ -248,15 +268,18 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - int32_t block_size = 0; - int32_t indices_num = 1; - int32_t src0ZP = 0; - float src0Scale = 1; - int32_t dstZP = 0; - float dstScale = 1; + int32_t block_size = 0; + int32_t indices_num = 1; + int32_t batch_num = 1; + int32_t src0ZP = 0; + float src0Scale = 1; + int32_t dstZP = 0; + float dstScale = 1; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -302,6 +325,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) } indices_num = (int32_t)(attr[1]->shape->data[1]); + batch_num = (int32_t)(attr[1]->shape->size > 2 ? 
attr[1]->shape->data[2] : 1); gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; @@ -310,7 +334,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = indices_num; - gpu_param.global_size[2] = 1; + gpu_param.global_size[2] = batch_num; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); @@ -422,7 +446,8 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_nn_kernel_coord_type_e coord_type = _error; uint32_t key = 0; - int i = 0; + int32_t batch_flg = batch_dims > 0 ? 1 : 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -448,7 +473,7 @@ static vsi_status _query_kernel coord_type = _3D; } - key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_dims ); + key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg ); for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) { @@ -495,6 +520,9 @@ static vsi_nn_kernel_node_t _setup int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims); status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims); status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims); diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c index 8a9971fc6..ce13b84f7 100644 --- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -246,6 +246,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) float sum_x2_tail1 = 1; float work_item_pixels = 1; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -381,6 +383,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_means_initializer) int32_t chn = 0; int32_t group_stride = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -450,6 +454,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) int32_t height = 0, width = 0, chn = 0; int32_t is2D = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); @@ -776,6 +782,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t group_size = inputs[0]->attr.size[2] / group_num; float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + // Check if gpu can support the size if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -898,11 
+907,11 @@ static vsi_nn_kernel_node_t _setup if (node) { uint32_t index = 0; - int32_t pStride = 0; + float pStride = 0; if (!is2D_flg) { - pStride = (int32_t)(inputs[1]->attr.size[0] / new_shape[1]); - rSpaceOrg = 1.0f / (new_shape[0] / pStride); + pStride = (float)inputs[1]->attr.size[0] / (float)new_shape[1]; + rSpaceOrg = pStride < 1.0f ? 0.0f : 1.0f / (new_shape[0] / pStride); } node_params[index++] = rs_input; node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; @@ -912,7 +921,7 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rSpaceOrg ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pStride ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &pStride ); status = vsi_nn_kernel_node_pass_param( node, node_params, _GROUPNORM_PARAM_NUM ); diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c index 9b5a2c1fb..1bfdb49fd 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c @@ -227,6 +227,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL }; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -635,7 +637,7 @@ static vsi_status _query_kernel int32_t input_category, int32_t input_layout, int32_t use_cudnn, - int32_t* param_count, + vsi_size_t* param_count, int32_t* input_count, int32_t* output_count /* Add extra params */ @@ -756,7 +758,7 @@ static vsi_nn_kernel_node_t _setup int32_t k = 0; vsi_size_t input_size = inputs[0]->attr.size[0]; vsi_size_t batch = inputs[0]->attr.size[1]; - int32_t param_count = 0; + vsi_size_t param_count = 0; int32_t input_count = 0; int32_t output_count = 0; int32_t gate_activation = 0; @@ -765,6 +767,8 @@ static vsi_nn_kernel_node_t _setup int32_t use_cudnn = vsi_nn_kernel_param_get_int32( params, "use_cudnn_implementation" ); int32_t input_layout = vsi_nn_kernel_param_get_int32( params, "input_layout" ); + VSI_UNREFERENCED(input_num); + gate_activation = vsi_nn_kernel_param_get_int32( params, "gate_activation" ); candidate_activation = vsi_nn_kernel_param_get_int32( params, "candidate_activation" ); @@ -783,7 +787,9 @@ static vsi_nn_kernel_node_t _setup if( VSI_SUCCESS == status) { _inputs = (vsi_nn_tensor_t**)malloc(input_count * sizeof(vsi_nn_tensor_t**)); + CHECK_PTR_FAIL_GOTO( _inputs, "Create buffer fail.", final ); node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count); + CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final ); if (use_cudnn) { @@ -896,6 +902,7 @@ static vsi_nn_kernel_node_t _setup } } +final: vsi_nn_safe_free(_inputs); vsi_nn_safe_free(node_params); diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c index 75b6136e1..9ad5852c3 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c +++ 
b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c @@ -110,7 +110,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer) { #define _PACK_A_GRUCELL_ACTIVATION_SMA_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ (( IN1_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE)) - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer) vsi_size_array_t *output_shape = NULL; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( attr[0], "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -302,6 +304,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool ret = FALSE; vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + VSI_UNREFERENCED(params); + for (i = 0; i < _IO_NUM; i++) { shapes_ptr[i] = shapes[i]; diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c index 40e22e981..7adf6bfb7 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -124,6 +124,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) #define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ (hstate_type | (fc_type << 8) | (output_type << 16)) + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_IN_CNT + GRUCELL_ACT_Z_H_OUT_OUTPUT]; hstate_out = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_IN_CNT + GRUCELL_ACT_Z_H_OUT_HSTATE]; diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c index 85220002f..afd872352 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c @@ -117,6 +117,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer) #define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ (hstate_type | (fc_type << 8) | (output_type << 16)) + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[3]; for (i = 0; i < 2; i++) diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c index 0c35aeaf9..60d932b80 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c @@ -46,17 +46,19 @@ typedef enum _grucell_nn_activation_type_e { SIGMOID = VSI_NN_ACT_SIGMOID, HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, + TANH = VSI_NN_ACT_TANH, }grucell_nn_activation_type_e; #define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation" // Add kernel hashtable here -#define GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ - (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) -#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ - { GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ -CVIVANTE_NAMESPACE("evis.grucell_reset_after_activation_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ 
-_GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE } +#define GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT, ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 ) | ( ACT << 24 )) +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT, ACT ) \ + { GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT, ACT ), \ + CVIVANTE_NAMESPACE("evis.grucell_reset_after_activation_"\ + #HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#ACT"_"#REC_ACT), \ + _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE } typedef struct { @@ -68,10 +70,14 @@ typedef struct static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] = { // Register kernel here - PACK_KERNEL_MAP( U8, F16, U8, SIGMOID ), - PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ), - PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ), - PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, TANH ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, TANH ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, TANH ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, TANH ), + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ), }; @@ -123,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer) #define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ (hstate_type | (fc_type << 8) | (output_type << 16)) + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_IN_CNT + GRUCELL_ACT_OUT_OUTPUT]; hstate_out = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_IN_CNT + GRUCELL_ACT_OUT_H_STATE]; @@ -297,7 +305,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t recurrent_activation + int32_t recurrent_activation, + int32_t activation ) { vsi_status status = VSI_FAILURE; @@ -309,14 +318,15 @@ static vsi_status _query_kernel vx_param_description_t * param_def = _grucell_reset_after_activation_kernel_param_def; vx_kernel_initialize_f initializer = _grucell_reset_after_activation_initializer; - uint32_t key; - uint32_t i; + uint32_t key = 0; + uint32_t i = 0; hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_H_STATE]->attr.dtype.vx_type ); fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_I_FC_Z]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dtype.vx_type ); - key = GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + key = GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, + recurrent_activation, activation ); for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { @@ -362,12 +372,7 @@ static vsi_nn_kernel_node_t _setup int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); - if( activation != VSI_NN_ACT_TANH ) - { - return NULL; - } - - status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + status = _query_kernel( kernel, inputs, outputs, recurrent_activation, activation ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c 
b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index 48af7f85a..7e5a84650 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -246,6 +246,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) float sum_x2_tail1 = 1; float work_item_pixels = 1; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -402,6 +404,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_means_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -452,6 +456,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) vsi_size_array_t * input_shape = NULL; vx_int32 width = 0, chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -808,6 +814,10 @@ static vsi_nn_kernel_node_t _setup vsi_size_t batch = 1; vsi_bool ret = FALSE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + memcpy(new_shape, inputs[0]->attr.size, sizeof(inputs[0]->attr.size)); if (new_shape[0] >= GPU_TENSOR_MAX_WIDTH || new_shape[1] >= GPU_TENSOR_MAX_WIDTH) diff --git a/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c b/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c index 00c31c319..ce097d624 100644 --- a/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c @@ -168,6 +168,8 @@ DEF_KERNEL_INITIALIZER(_l1norm_initializer_axis) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis); @@ -197,12 +199,12 @@ DEF_KERNEL_INITIALIZER(_l1norm_initializer_axis) } else if (axis == 1) { - gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];; + gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0]; gpu_param.global_size[1] = depth; } else if (axis == 2) { - gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];; + gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0]; gpu_param.global_size[1] = height; } diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c index be4a29953..068257c43 100644 --- a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c @@ -139,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) int32_t axis2Dflg = 0; int32_t inputWidth = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); 
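Editor's note: the VSI_UNREFERENCED(param_size) / VSI_UNREFERENCED(input_num) lines added throughout these initializers and _setup functions only silence unused-parameter warnings for arguments kept by the fixed kernel signatures. A minimal sketch of such a macro, assuming the conventional definition (the project's actual definition lives in its own headers and may differ):

    #ifndef VSI_UNREFERENCED
    #define VSI_UNREFERENCED(x)  ((void)(x))
    #endif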
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index 966a6cdd8..0a477c525 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -250,6 +250,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) float inv_multiplier = 0; int32_t height = 0, width = 0, chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); @@ -539,6 +541,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_axis01_sums_initializer) int32_t height = 0; int32_t chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -653,6 +657,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_axis01_initializer) vx_uint32 group_num = 0; vx_int32 height = 0, width = 0, chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); @@ -787,7 +793,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int32_t i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); @@ -832,7 +838,7 @@ static vsi_status _query_kernel_axis01 vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); @@ -917,6 +923,9 @@ static vsi_nn_kernel_node_t _setup_axis01 uint32_t axis_size = 0; uint32_t rank_in = 0, rank_para = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + status = vsi_nn_kernel_optimize_tensor_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, axis_num, new_shape[0], &rank_in, new_axis, &axis_size); @@ -942,6 +951,7 @@ static vsi_nn_kernel_node_t _setup_axis01 rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[0], rank_in); kernel_sums = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + CHECK_PTR_FAIL_GOTO( kernel_sums, "Create kernel fail.", final ); // Assign unique_id kernel_sums->unique_id = kernel->unique_id; @@ -961,6 +971,7 @@ static vsi_nn_kernel_node_t _setup_axis01 attr.size[3] = new_shape[0][3]; attr.dim_num = rank_in; tensor_sums = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( tensor_sums, "Create tensor fail.", final ); status = _query_kernel_axis01(inputs, outputs, kernel_sums, kernel); if ( VSI_SUCCESS != status ) @@ -972,6 +983,7 @@ static vsi_nn_kernel_node_t _setup_axis01 ** sum(x) and sumsq(x*x) */ sums_node = vsi_nn_kernel_create_node(graph, kernel_sums); + CHECK_PTR_FAIL_GOTO( sums_node, "Create kernel 
fail.", final ); if (sums_node) { sums_node_params[0] = rs_input; @@ -992,6 +1004,7 @@ static vsi_nn_kernel_node_t _setup_axis01 } node = vsi_nn_kernel_create_node( graph, kernel ); + CHECK_PTR_FAIL_GOTO( node, "Create kernel fail.", final ); if (node) { uint32_t index = 0; @@ -1065,6 +1078,9 @@ static vsi_nn_kernel_node_t _setup_axis0 uint32_t rank_in = 0; int32_t is_img2d_input = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + status = vsi_nn_kernel_optimize_tensor_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, axis_num, new_shape[0], &rank_in, new_axis, &axis_size); diff --git a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c index 3ee30282d..4e7b8a087 100644 --- a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c @@ -166,6 +166,8 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) float rlogE = (float)(log10(2.0f) / log10(exp(1.0f))); float scaleLogE = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -482,7 +484,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -529,6 +531,9 @@ static vsi_nn_kernel_node_t _setup int32_t axis = 0; float beta = 1.0f; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); beta = vsi_nn_kernel_param_get_float32(params, "beta"); diff --git a/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c index 890f7bc78..d59d851ed 100644 --- a/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c @@ -98,7 +98,7 @@ DEF_KERNEL_INITIALIZER(_logical_not_initializer) size_t param_size ) { - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -112,6 +112,8 @@ DEF_KERNEL_INITIALIZER(_logical_not_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -226,6 +228,8 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret = FALSE; + VSI_UNREFERENCED(params); + ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); diff --git a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c index 7e5476b74..54713cb08 100644 --- a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c @@ -109,7 +109,7 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer) size_t param_size ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. 
gpu_param_t gpu_param = { 3, @@ -125,6 +125,8 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer) vsi_nn_kernel_tensor_attr_t *input_attr = NULL, *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -308,11 +310,11 @@ static vsi_nn_kernel_node_t _setup outputs[0], shapes[2], new_rank ); #define _swap_tensor(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } if (shapes[1][3] > shapes[0][3] && new_rank == 4) { diff --git a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c index a99acc6cd..95232b9d1 100644 --- a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c @@ -65,7 +65,8 @@ typedef enum _LSTMUNIT_nn_activation_e #define LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \ _input_type, _output_type, _cell_type, _rec_act) \ ((_is_ln << 31) | (_is_cifg << 30) | (_is_proj << 29) | (_is_hybrid << 28) | (_is_peephole << 27) \ -| (_input_type << 23) | (_output_type << 19) | (_cell_type << 15) | (_rec_act << 10)) +| (((uint32_t)_input_type) << 23) | (((uint32_t)_output_type) << 19) | (((uint32_t)_cell_type) << 15) \ +| (_rec_act << 10)) #define LSTMUNIT_ACTIVATION_SOURCE_NAME(_ln_cifg_proj_hybrid_, _input_type) \ "lstmunit_activation_"#_ln_cifg_proj_hybrid_"_"#_input_type diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index 6e4ee41b1..f5dc60b1e 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -58,9 +58,12 @@ __BEGIN_DECLS #define KERNEL_SOURCE_14 "matrixmul_f16i16_i16" #define KERNEL_SOURCE_15 "matrixmul_bf16" #define KERNEL_SOURCE_16 "matrixmul_u8i16_i16" +#define KERNEL_SOURCE_17 "matrixmul_merge" +#define KERNEL_SOURCE_18 "matrixmul_cross" +#define KERNEL_SOURCE_19 "matrixmul_cross_i16" -#define HASH_MATRIX_MUL_KEY(_input0_type, _input1_type, _output_type, _trans_a, _trans_b) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_trans_a << 4) | (_trans_b)) +#define HASH_MATRIX_MUL_KEY(_type0, _type1, _type2, _trans_a, _trans_b, _cross) \ + ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_trans_a << 4) | (_trans_b << 2) | (_cross)) #define HASH_MATRIX_MUL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) @@ -71,21 +74,37 @@ __BEGIN_DECLS #define HASH_MATRIX_MUL_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.gemm_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) +#define HASH_MATRIX_MUL_SH_KERNEL_CROSS_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_cross") + +#define HASH_MATRIX_MUL_SH_KERNEL_MERGE_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_merge") + #define TENSOR_MATRIX_MUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ - { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0), \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0, 0), \ HASH_MATRIX_MUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_MATRIX_MUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, 
SOURCE) \ - { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 1), \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 1, 0), \ HASH_MATRIX_MUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_MATRIX_MUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ - { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1, 0), \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1, 0, 0), \ HASH_MATRIX_MUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ SOURCE }, +#define TENSOR_MATRIX_MUL_CROSS_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0, 1), \ + HASH_MATRIX_MUL_SH_KERNEL_CROSS_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MATRIX_MUL_MERGE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0, 2), \ + HASH_MATRIX_MUL_SH_KERNEL_MERGE_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + static const struct { uint32_t key; @@ -135,6 +154,14 @@ static const struct { TENSOR_MATRIX_MUL_TRANSA_KERNELS(F16, F16, F16, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, I16, I16, KERNEL_SOURCE_7) + TENSOR_MATRIX_MUL_MERGE_KERNELS(U8, U8, U8, KERNEL_SOURCE_17) + TENSOR_MATRIX_MUL_MERGE_KERNELS(I8, I8, I8, KERNEL_SOURCE_17) + TENSOR_MATRIX_MUL_MERGE_KERNELS(I16, I16, I16, KERNEL_SOURCE_19) + TENSOR_MATRIX_MUL_MERGE_KERNELS(F16, F16, F16, KERNEL_SOURCE_17) + TENSOR_MATRIX_MUL_CROSS_KERNELS(U8, U8, U8, KERNEL_SOURCE_18) + TENSOR_MATRIX_MUL_CROSS_KERNELS(I8, I8, I8, KERNEL_SOURCE_18) + TENSOR_MATRIX_MUL_CROSS_KERNELS(I16, I16, I16, KERNEL_SOURCE_19) + TENSOR_MATRIX_MUL_CROSS_KERNELS(F16, F16, F16, KERNEL_SOURCE_18) }; /* @@ -154,7 +181,35 @@ static vx_param_description_t _matrix_mul_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; + +static vx_param_description_t _matrix_mul_kernel_cross_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; #define _MATRIX_MUL_PARAM_NUM _cnt_of_array( _matrix_mul_kernel_param_def ) +#define _MATRIX_MUL_CROSS_PARAM_NUM _cnt_of_array( _matrix_mul_kernel_cross_param_def ) /* * Kernel initializer 
@@ -180,7 +235,10 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) int32_t transB = 0; int32_t width = 0; int32_t height = 0; - int32_t chn = 0; + vsi_size_t chn = 0; + int32_t a_depth = 0; + int32_t b_depth = 0; + vsi_size_t outer = 0; int32_t src0ZP = 0; float src0Scale = 0; @@ -204,6 +262,8 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) uint32_t evis2 = 0; vx_context ctx = vxGetContext((vx_reference)node); vx_hardware_caps_params_t hw_param; + + VSI_UNREFERENCED(param_size); memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -294,22 +354,59 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP); inOutScale = src0Scale * src1Scale / dstScale; - if ((attr[0]->shape->size > attr[1]->shape->size) || - (attr[0]->shape->data[2] > attr[1]->shape->data[2] - && attr[0]->shape->size > 2 && attr[1]->shape->size > 2)) + a_depth = (int32_t)(attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1); + b_depth = (int32_t)(attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1); + + if (b_depth == 1) { bc2zero = 1; } - else if ((attr[1]->shape->size > attr[0]->shape->size) || - (attr[1]->shape->data[2] > attr[0]->shape->data[2] - && attr[0]->shape->size > 2 && attr[1]->shape->size > 2)) + if (a_depth == 1) { ac2zero = 1; } width = (int32_t)(attr[2]->shape->data[0]); height = (int32_t)(attr[2]->shape->data[1]); - chn = (int32_t)(attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1); + chn = (attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1); + + if (((attr[0]->shape->size == 4 && attr[1]->shape->size == 3) || + (attr[0]->shape->size == 3 && attr[1]->shape->size == 4)) + && attr[0]->shape->data[2] > 1 && attr[1]->shape->data[2] > 1 + && chn != attr[0]->shape->data[2] * attr[1]->shape->data[2]) + { + vsi_size_t iter = attr[0]->shape->data[2] * attr[1]->shape->data[2] / chn; + if (attr[0]->shape->size == 4) + { + ac2zero = 1; + bc2zero = 0; + chn = attr[1]->shape->data[2]; + outer = attr[0]->shape->data[2] / iter; + } + else + { + ac2zero = 0; + bc2zero = 1; + chn = attr[0]->shape->data[2]; + outer = attr[1]->shape->data[2] / iter; + } + } + else if (attr[0]->shape->size == 4 && attr[1]->shape->size == 3 + && attr[0]->shape->data[2] != 1 && attr[1]->shape->data[2] != 1) + { + ac2zero = 1; + bc2zero = 0; + chn = attr[1]->shape->data[2]; + outer = attr[0]->shape->data[2]; + } + else if (attr[1]->shape->size == 4 && attr[0]->shape->size == 3 + && attr[0]->shape->data[2] != 1 && attr[1]->shape->data[2] != 1) + { + ac2zero = 0; + bc2zero = 1; + chn = attr[0]->shape->data[2]; + outer = attr[1]->shape->data[2]; + } gpu_param.global_scale[0] = 4; gpu_param.global_scale[1] = 4; @@ -319,7 +416,7 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = gpu_align_p2((height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1], 4); - gpu_param.global_size[2] = chn; + gpu_param.global_size[2] = (size_t)chn; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); @@ -683,6 +780,12 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) uniI16MulI16SumtoI32_16x1.data[i] = multiplierZpB; } + if (outer) + { + status = vsi_nn_kernel_gpu_add_param( node, "outer", &outer ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + switch( pack_key ) { case _PACK_SELECT_KEY( U8, U8, F16, 0, 1, 0 ): @@ -790,16 +893,19 @@ 
DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniI16MulI16SumtoI32_16x1", &uniI16MulI16SumtoI32_16x1 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniI16MulI16SumtoI32B_16x1", &uniI16MulI16SumtoI32B_16x1 ); status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); - status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inScaledivOut ); - status |= vsi_nn_kernel_gpu_add_param( node, "inout_beta", &inout_beta ); + if (outer == 0) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniI16MulI16SumtoI32_16x1", &uniI16MulI16SumtoI32_16x1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniI16MulI16SumtoI32B_16x1", &uniI16MulI16SumtoI32B_16x1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inScaledivOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "inout_beta", &inout_beta ); + } } break; case _PACK_SELECT_KEY( F16, U8, F16, 0, 0, 0 ): @@ -1093,6 +1199,308 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) return status; } /* _matrix_mul_initializer() */ +DEF_KERNEL_INITIALIZER(_matrix_mul_cross_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + int32_t transA = 0; + int32_t transB = 0; + int32_t width = 0; + int32_t height = 0; + int32_t axis_size = 0; + + int32_t src0ZP = 0; + float src0Scale = 0; + int32_t src1ZP = 0; + float src1Scale = 0; + float dstZP = 0; + float dstScale = 0; + + uint32_t pack_key = 0; + + float mulKIn0In1Zp = 0; + float inOutScale = 0; + int32_t K = 0; + + uint32_t evis2 = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_t hw_param; + + VSI_UNREFERENCED(param_size); + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); + status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + if (hw_param.evis2 == TRUE) + { + evis2 = 1; + } + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &transA); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &transB); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &K); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &axis_size); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + src0ZP = 
attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + src1ZP = attr[1]->asymm.zero_point; + src1Scale = attr[1]->asymm.scale; + dstZP = (float)attr[2]->asymm.zero_point; + dstScale = attr[2]->asymm.scale; + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + src0ZP = 0; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + src0Scale = 1; + src0ZP = 0; + } + + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[1]->dfp.fl > 0) + { + src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); + } + else + { + src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); + } + src1ZP = 0; + } + else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + src1Scale = 1; + src1ZP = 0; + } + + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[2]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + dstScale = 1.0f / dstScale; + dstZP = 0.0f; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + dstZP = 0.0f; + } + + mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP); + inOutScale = src0Scale * src1Scale / dstScale; + + width = (int32_t)(attr[2]->shape->data[0]); + height = (int32_t)(attr[2]->shape->data[1]); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 4; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = gpu_align_p2((height + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1], 4); + gpu_param.global_size[2] = (size_t)axis_size; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE, TRANSA, TRANSB, EVIS2) \ + ((IN0_TYPE << 24) | (IN1_TYPE << 16) | (OUT_TYPE << 8) | (TRANSA << 4) | (TRANSB << 2) | (EVIS2)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype, transA, transB, evis2); + { + uint16_t M0 = 0; + uint16_t M1 = 0; + int32_t postShift0 = 0; + int32_t postShift1 = 0; + uint32_t multiplierA = 0; + uint32_t multiplierB = 0; + gpu_dp_inst_t uniGemmU8U8MulZptoFp32_8x4 = {{ + 0xaaaaaaaa, 0xaaaaaaaa, // TCfg + 0xf02a0600, 0x2a8620e0, 0x0640e8f2, 0x60f0f42b, 0xf8f62b86, // BinSelect + 0x00000700, // AccumType, ConstantType, and PostShift + 0x03020302, 0x03020302, 0x03020302, 0x03020302, + 0x03020302, 0x03020302, 0x03020302, 0x03020302 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmU8U8toFp32Block4_4x4 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x32103210, 0x32103210, // ABin + 0x55555555, // BSelt + 0xd951c840, 0xfb73ea62, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmU8F16toF32Lo_4x4b = {{ + 
0x55555555, // TCfg + 0x50505050, // ASelt + 0x51514040, 0x73736262, // ABin + 0x00000000, // BSelt + 0x32103210, 0x32103210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUint8SubZpToFp32B_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + float reScaleOut = 1 / dstScale; + uint32_t multiplierU8ZpAB = (src0ZP << 24) | (src1ZP << 16) | (src0ZP << 8) | (src1ZP); + int32_t i = 8; + gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postShift0); + gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postShift1); + + multiplierA = (M0 << 16) | M0; + multiplierB = (M1 << 16) | M1; + + uniConvertUint8SubZpToFp32_4x4.data[7] |= (postShift0 & 0x1F); + uniConvertUint8SubZpToFp32B_4x4.data[7] |= (postShift1 & 0x1F); + for( i = 8; i < 16; i += 2) + { + uniConvertUint8SubZpToFp32_4x4.data[i] = multiplierA; + uniConvertUint8SubZpToFp32B_4x4.data[i] = multiplierB; + } + for( i = 8; i < 16; i++) + { + uniGemmU8U8MulZptoFp32_8x4.data[i] = multiplierU8ZpAB; + } + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, U8, U8, 0, 0, 1 ): + case _PACK_SELECT_KEY( I8, I8, I8, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8U8toFp32Block4_4x4", &uniGemmU8U8toFp32Block4_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8U8MulZptoFp32_8x4", &uniGemmU8U8MulZptoFp32_8x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "mulKIn0In1Zp", &mulKIn0In1Zp ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 0 ): + case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); + } + break; + case _PACK_SELECT_KEY( F16, F16, F16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8F16toF32Lo_4x4b", &uniGemmU8F16toF32Lo_4x4b ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } 
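Editor's note on the new _matrix_mul_cross_initializer above: mulKIn0In1Zp and inOutScale are the usual zero-point expansion terms of an asymmetric-quantized GEMM. The scalar reference below is a hypothetical illustration of the arithmetic the shader approximates (the function name, data layout, and the use of plain K are assumptions; the vector kernel pads K up to a multiple of 4, which is why the initializer uses (K + 3) / 4 * 4):

    /* One output element of a uint8 x uint8 GEMM, requantized with the same
     * terms the initializer programs into the shader. */
    static float ref_gemm_u8_elem(const uint8_t *a_row, const uint8_t *b_col,
                                  int32_t K, int32_t src0ZP, int32_t src1ZP,
                                  float inOutScale, float dstZP)
    {
        int32_t acc = 0, sum_a = 0, sum_b = 0;
        int32_t k = 0;
        float real = 0.0f;
        for (k = 0; k < K; k++)
        {
            acc   += (int32_t)a_row[k] * (int32_t)b_col[k];
            sum_a += a_row[k];
            sum_b += b_col[k];
        }
        /* sum((a - zp0) * (b - zp1)) expanded; K * zp0 * zp1 plays the role
         * of mulKIn0In1Zp in the initializer. */
        real = (float)acc
             - (float)src1ZP * (float)sum_a
             - (float)src0ZP * (float)sum_b
             + (float)K * (float)src0ZP * (float)src1ZP;
        return real * inOutScale + dstZP; /* inOutScale = s0 * s1 / s_out */
    }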
+#undef _PACK_SELECT_KEY + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + return status; +} /* _matrix_mul_cross_initializer() */ + /* * Query kernel */ @@ -1102,7 +1510,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, int32_t transa, - int32_t transb + int32_t transb, + int32_t cross ) { vsi_status status = VSI_FAILURE; @@ -1110,13 +1519,13 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input1_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_MATRIX_MUL_KEY( input0_dtype, input1_dtype, output_dtype, transa, transb ); + key = HASH_MATRIX_MUL_KEY( input0_dtype, input1_dtype, output_dtype, transa, transb, cross); for( i = 0; i < _cnt_of_array(matrix_mul_map); i ++ ) { @@ -1128,9 +1537,18 @@ static vsi_status _query_kernel if ( i < _cnt_of_array(matrix_mul_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrix_mul_map[i].function_name ); - kernel->info.parameters = _matrix_mul_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _matrix_mul_kernel_param_def ); - kernel->info.initialize = _matrix_mul_initializer; + if (cross == 1) + { + kernel->info.parameters = _matrix_mul_kernel_cross_param_def; + kernel->info.numParams = _cnt_of_array( _matrix_mul_kernel_cross_param_def ); + kernel->info.initialize = _matrix_mul_cross_initializer; + } + else + { + kernel->info.parameters = _matrix_mul_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _matrix_mul_kernel_param_def ); + kernel->info.initialize = _matrix_mul_initializer; + } vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", @@ -1155,18 +1573,28 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t tmp_params[_MATRIX_MUL_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t tmp_params[_MATRIX_MUL_CROSS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); int32_t adjointA = vsi_nn_kernel_param_get_int32( params, "adjointA" ); int32_t adjointB = vsi_nn_kernel_param_get_int32( params, "adjointB" ); + uint32_t cross_flg = vsi_nn_kernel_param_get_int32( params, "cross_flg" ); + size_t tmp_size = 0; + uint32_t* size_axis_in_out = NULL; + uint32_t* stride_axis_in_out = NULL; vsi_size_t M = inputs[0]->attr.size[1]; vsi_size_t K = inputs[0]->attr.size[0]; vsi_size_t N = inputs[1]->attr.size[0]; vsi_size_t depthA = 1, depthB = 1; + size_axis_in_out = (uint32_t *)vsi_nn_kernel_param_get_buffer( params, "size_axis_inner_outer", &tmp_size); + stride_axis_in_out = (uint32_t *)vsi_nn_kernel_param_get_buffer( params, "stride_axis_inner_outer", &tmp_size); + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ((inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && inputs[1]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && 
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) @@ -1209,13 +1637,14 @@ static vsi_nn_kernel_node_t _setup rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); } - status = _query_kernel( inputs, outputs, kernel, transposeA, transposeB ); + status = _query_kernel( inputs, outputs, kernel, transposeA, transposeB, cross_flg ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { uint32_t index = 3; + size_t param_num = cross_flg == 1 ? _MATRIX_MUL_CROSS_PARAM_NUM : _MATRIX_MUL_PARAM_NUM; /* Pass parameters to node. */ if (rs_input) { @@ -1225,7 +1654,7 @@ static vsi_nn_kernel_node_t _setup } else { - vsi_nn_kernel_node_pack_io( tmp_params, _MATRIX_MUL_PARAM_NUM, + vsi_nn_kernel_node_pack_io( tmp_params, param_num, inputs, 2, outputs, 1 ); } tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeA ); @@ -1235,7 +1664,22 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &M ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &K ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N ); - status = vsi_nn_kernel_node_pass_param( node, tmp_params, _MATRIX_MUL_PARAM_NUM ); + if (cross_flg == 1) + { + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &size_axis_in_out[0] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &size_axis_in_out[1] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &size_axis_in_out[2] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[0] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[1] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[2] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[3] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[4] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[5] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[6] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[7] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[8] ); + } + status = vsi_nn_kernel_node_pass_param( node, tmp_params, param_num ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[3] ); vsi_nn_kernel_scalar_release( &tmp_params[4] ); @@ -1244,6 +1688,21 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[7] ); vsi_nn_kernel_scalar_release( &tmp_params[8] ); vsi_nn_kernel_scalar_release( &tmp_params[9] ); + if (cross_flg == 1) + { + vsi_nn_kernel_scalar_release( &tmp_params[10] ); + vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); + vsi_nn_kernel_scalar_release( &tmp_params[15] ); + vsi_nn_kernel_scalar_release( &tmp_params[16] ); + vsi_nn_kernel_scalar_release( &tmp_params[17] ); + vsi_nn_kernel_scalar_release( &tmp_params[18] ); + vsi_nn_kernel_scalar_release( &tmp_params[19] ); + vsi_nn_kernel_scalar_release( &tmp_params[20] ); + vsi_nn_kernel_scalar_release( &tmp_params[21] ); + } { // Set default border mode. 
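Editor's note: the block that begins here (truncated in this hunk) sets the node's border attribute; as with the other evis kernels in this patch (see the moments kernel further below), this is the standard vxSetNodeAttribute(node, VX_NODE_BORDER, &border, sizeof(border)) call, presumably with VX_BORDER_CONSTANT so that out-of-range reads from the vectorized tiles return a fixed value.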
vx_border_t border; diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c index 460ad87f7..d862eb752 100644 --- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -153,6 +153,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) vsi_size_array_t * out_shape = NULL; uint32_t pack_key; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -404,7 +406,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -453,6 +455,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c index 11478f544..cb9fc3563 100644 --- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ -153,6 +153,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) vsi_size_array_t * out_shape = NULL; uint32_t pack_key; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -404,7 +406,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -453,6 +455,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/mod_evis.c b/src/tim/vx/internal/src/kernel/evis/mod_evis.c index fe7edd7cc..70188f6e7 100644 --- a/src/tim/vx/internal/src/kernel/evis/mod_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/mod_evis.c @@ -119,7 +119,7 @@ DEF_KERNEL_INITIALIZER(_mod_initializer) {0, 0, 0}, {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor input0 = (vx_tensor)param[0]; vx_tensor input1 = (vx_tensor)param[1]; vx_tensor output = (vx_tensor)param[2]; @@ -138,6 +138,8 @@ DEF_KERNEL_INITIALIZER(_mod_initializer) float in1Tail = 0; float outZp = 0; + VSI_UNREFERENCED(param_size); + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0 ); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c 
b/src/tim/vx/internal/src/kernel/evis/moments_evis.c index d79142617..9dc6eae47 100644 --- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -162,7 +162,7 @@ static vx_param_description_t _moments_kernel_param_def[] = }; #define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def ) -static int32_t set_constant_border +static int32_t _set_constant_border ( vsi_nn_kernel_node_t node, int32_t value @@ -172,9 +172,6 @@ static int32_t set_constant_border vx_border_t border; border.mode = VX_BORDER_CONSTANT; border.constant_value.S32 = value; - border.constant_value.U32 = (vx_uint32)value; - border.constant_value.S16 = (vx_int16)value; - border.constant_value.U8 = (vx_uint8)value; status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); return status; } @@ -226,6 +223,8 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -797,7 +796,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -866,6 +867,9 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_bool is_continue_axis = TRUE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis_num = (int32_t)axis_num_temp; for ( i = 1; i < axis_num; i++) @@ -901,7 +905,7 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[2] = vsi_nn_reshape_tensor( graph, outputs[1], shapes[1], rank_out ); - if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size, reshape_tensors[1]->attr.dim_num ) ) { return NULL; @@ -911,10 +915,10 @@ static vsi_nn_kernel_node_t _setup axis_first = new_axis[0]; status = _query_kernel( inputs, outputs, kernel, params, new_axis, axis_size, image_2d ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 3; /* Pass parameters to node. 
*/ @@ -926,17 +930,14 @@ static vsi_nn_kernel_node_t _setup CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &node_params[3] ); vsi_nn_kernel_scalar_release( &node_params[4] ); - status = set_constant_border(node, vsi_nn_get_tensor_zero_point(inputs[0])); + status = _set_constant_border(node, 0); CHECK_STATUS(status); } } - for(i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { - if(reshape_tensors[i]) - { - vsi_nn_ReleaseTensor(&reshape_tensors[i]); - } + vsi_safe_release_tensor(reshape_tensors[i]); } return node; diff --git a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c new file mode 100644 index 000000000..28ff2d1ae --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c @@ -0,0 +1,614 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + INTERNAL_KERNEL_NEAREST_GRID_SAMPLE, +} _internal_kernel_e; + +#define STR(a) #a + +#define _NEAREST_GRID_SAMPLE_KERNEL_SOURCE(_input_type, _output_type) \ + "nearest_grid_sample_" #_input_type "_to_" #_output_type + +// Add kernel hashtable here +#define NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE)) +#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { \ + NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ + CVIVANTE_NAMESPACE("evis.nearest_grid_sample_" STR( \ + IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _NEAREST_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ + } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _nearest_grid_sample_kernel_map[] = +{ + PACK_KERNEL_MAP(F16, F32, F16), + PACK_KERNEL_MAP(F16, U8, F16), + PACK_KERNEL_MAP(F16, F16, F16), + PACK_KERNEL_MAP(F16, F32, U8), + PACK_KERNEL_MAP(F16, F16, U8), + PACK_KERNEL_MAP(F16, U8, U8), + PACK_KERNEL_MAP(U8, U8, U8), + PACK_KERNEL_MAP(U8, F16, U8), + PACK_KERNEL_MAP(U8, F32, U8), + PACK_KERNEL_MAP(I16, I16, I16), + PACK_KERNEL_MAP(I8, I8, I8), + PACK_KERNEL_MAP(BF16, BF16, BF16), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _nearest_grid_sample_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _NEAREST_GRID_SAMPLE_PARAM_NUM _cnt_of_array( _nearest_grid_sample_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (3) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_nearest_grid_sample_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define MAX_POST_SHIFT_BITS (31) +#define MAX_MULTIPLIER_NUM (65535) + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}; + vsi_nn_kernel_tensor_attr_t* output_attr = NULL; + vsi_nn_kernel_tensor_attr_t* input_attr[2] = {NULL}; + vsi_size_array_t* out_shape = NULL; + vsi_size_array_t* in0_shape = NULL; + vsi_nn_kernel_dtype_e input0_dtype = F16; + vsi_nn_kernel_dtype_e input1_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + + uint32_t depth = 0; + float half_input0_wh[2]; + float add_float_value[2]; + uint32_t in0_width; + uint32_t in0_height; + uint32_t out_width; + uint32_t out_height; + int32_t align_corners; + + float input0_scale = 1.0; + int32_t input0ZP = 0; + float input1_scale = 1.0; + int32_t input1ZP = 0; + float output_scale = 1.0; + int32_t outputZP = 0; + + VSI_UNREFERENCED(param_size); + + input_attr[0] = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]); + CHECK_PTR_FAIL_GOTO( + input_attr[0], "Create tensor attr buffer fail.", final); + + input_attr[1] = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]); + CHECK_PTR_FAIL_GOTO( + input_attr[1], "Create tensor attr buffer fail.", final); + + output_attr = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); + CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final); + + status = vsi_nn_kernel_scalar_read_int32( + (vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); + CHECK_STATUS_FAIL_GOTO(status, final); + + out_shape = output_attr->shape; 
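[Editor's note] The constants the initializer prepares just below (half_input0_wh / add_float_value) encode the usual normalized-grid-to-pixel mapping of grid_sample. The stand-alone C sketch that follows is an illustration only, not driver code: it shows the equivalent scalar computation for one coordinate, and the final floor() is an assumption about how the EVIS shader rounds to the nearest source index.

#include <math.h>

/* Illustration only: maps one normalized grid coordinate in [-1, 1] to a
 * nearest-neighbor source index, mirroring the half_input0_wh and
 * add_float_value constants set up in the initializer below. */
static int nearest_grid_sample_index(float grid_coord, int in_size, int align_corners)
{
    float half_wh, add_val;

    if (align_corners)
    {
        half_wh = ((float)in_size - 1.0f) * 0.5f;
        add_val = half_wh + 0.5f;   /* +0.5 turns floor() into round-to-nearest */
    }
    else
    {
        half_wh = (float)in_size * 0.5f;
        add_val = half_wh;          /* half-pixel convention: offset already folded in */
    }

    return (int)floorf(grid_coord * half_wh + add_val);
}
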
+ in0_shape = input_attr[0]->shape; + input0_dtype = input_attr[0]->dtype; + input1_dtype = input_attr[1]->dtype; + output_dtype = output_attr->dtype; + + input0_scale = input_attr[0]->scale; + input0ZP = input_attr[0]->zero_point; + input1_scale = input_attr[1]->scale; + input1ZP = input_attr[1]->zero_point; + output_scale = output_attr->scale; + outputZP = output_attr->zero_point; + + + in0_width = (uint32_t)(in0_shape->data[0]); + in0_height = (uint32_t)(in0_shape->data[1]); + depth = (uint32_t)(in0_shape->data[2]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); + + if (align_corners) { + half_input0_wh[0] = ((float)in0_width - 1.0f) * 0.5f; + half_input0_wh[1] = ((float)in0_height - 1.0f) * 0.5f; + add_float_value[0] = half_input0_wh[0] + 0.5f; + add_float_value[1] = half_input0_wh[1] + 0.5f; + } else { + half_input0_wh[0] = (float)in0_width * 0.5f; + half_input0_wh[1] = (float)in0_height * 0.5f; + add_float_value[0] = half_input0_wh[0]; + add_float_value[1] = half_input0_wh[1]; + } + + status = vsi_nn_kernel_gpu_add_param(node, "half_input0_wh", half_input0_wh); + status |= vsi_nn_kernel_gpu_add_param(node, "add_float_value", add_float_value); + status |= vsi_nn_kernel_gpu_add_param(node, "depth", &depth); + + { + gpu_dp_inst_t uniFp16toFp32_part0_4x4 = { + { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, + GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32_part1_4x4 = { + { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, + GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_part0_4x4 = { + { + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_part1_4x4 = { + { + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtact8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + if (F16 == input0_dtype && + (F16 == input1_dtype || F32 == input1_dtype || + U8 == input1_dtype) && + F16 == output_dtype) { + if (F16 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + } else if (U8 == input1_dtype) { + 
status |= + vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP); + status |= vsi_nn_kernel_gpu_add_param( + node, "input1Scale", &input1_scale); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part0_4x4", + &uniU8SubZPtoFp32_part0_4x4); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part1_4x4", + &uniU8SubZPtoFp32_part1_4x4); + } + } else if (F16 == input0_dtype && + (F16 == input1_dtype || F32 == input1_dtype || + U8 == input1_dtype) && + U8 == output_dtype) { + float uint8Scale = 1.0f / output_scale; + float uint8ZP_out = (float)outputZP; + status |= vsi_nn_kernel_gpu_add_param(node, "uint8Scale", &uint8Scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &uint8ZP_out); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + if (U8 == input1_dtype) { + status |= + vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP); + status |= vsi_nn_kernel_gpu_add_param( + node, "input1Scale", &input1_scale); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part0_4x4", + &uniU8SubZPtoFp32_part0_4x4); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part1_4x4", + &uniU8SubZPtoFp32_part1_4x4); + } else if (F16 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + } + } + else if (U8 == input0_dtype && + (F16 == input1_dtype || F32 == input1_dtype || + U8 == input1_dtype) && + U8 == output_dtype) { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP[2] = {0}; + gpu_dp_inst_t uniMultiplyAndPostShift_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit( + (double)input0_scale / (double)output_scale, &M0, &postShift); + + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = + (uint32_t)((outputZP << postShift) - input0ZP * M0); + + uniMultiplyAndPostShift_2x8.data[7] |= (postShift & 0x1F); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP); + status |= vsi_nn_kernel_gpu_add_param( node, "uniMultiplyAndPostShift_2x8", + &uniMultiplyAndPostShift_2x8); + if (U8 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP); + status |= vsi_nn_kernel_gpu_add_param(node, "input1Scale", &input1_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part0_4x4", + &uniU8SubZPtoFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part1_4x4", + &uniU8SubZPtoFp32_part1_4x4); + } + else if (F16 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + } + } + else if (BF16 == input0_dtype && BF16 == input1_dtype && + BF16 == output_dtype) { + gpu_dp_inst_t uniBF16toFp32_part0_2x8 = { + { + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBF16toFp32_part1_2x8 = { + { + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status |= vsi_nn_kernel_gpu_add_param( + node, "uniBF16toFp32_part0_2x8", &uniBF16toFp32_part0_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniBF16toFp32_part1_2x8", &uniBF16toFp32_part1_2x8); + } + else if (((I16 == input0_dtype && I16 == input1_dtype && + I16 == output_dtype)) || + ((I8 == input0_dtype && I8 == input1_dtype && + I8 == output_dtype))) { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t i = 0; + gpu_dp_inst_t uniDFPtoFp32_part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDFPtoFp32_part1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertI8toI8_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_quantize_multiplier_16bit( + (double)input0_scale / (double)output_scale, &M0, &postShift); + uniConvertI8toI8_2x8.data[7] |= (postShift & 0x1F); + for (i = 0; i < 8; i++) { + uniConvertI8toI8_2x8.data[i + 8] = M0; + } + + status |= vsi_nn_kernel_gpu_add_param(node, "input1_scale", &input1_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDFPtoFp32_part0_4x4", &uniDFPtoFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDFPtoFp32_part1_4x4", &uniDFPtoFp32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertI8toI8_2x8", &uniConvertI8toI8_2x8); + } + else { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + } + } + CHECK_STATUS_FAIL_GOTO(status, final); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 2; + gpu_param.global_size[0] = + (out_width + gpu_param.global_scale[0] - 1) / + gpu_param.global_scale[0]; + gpu_param.global_size[1] = ((out_height + gpu_param.global_scale[1] - 1) / + gpu_param.global_scale[1]); + + status = vsi_nn_kernel_gpu_config(node, &gpu_param); + +#undef MAX_MULTIPLIER_NUM +#undef MAX_POST_SHIFT_BITS + + final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) \ + if (_PTR) { \ + vsi_nn_kernel_tensor_attr_release(&_PTR); \ + _PTR = NULL; \ + } + SAFE_FREE_TENSOR_ATTR(output_attr); + SAFE_FREE_TENSOR_ATTR(input_attr[0]); + SAFE_FREE_TENSOR_ATTR(input_attr[1]); + + return status; +} /* _nearest_grid_sample_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status 
_query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype, in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _nearest_grid_sample_kernel_map; + size_t kernel_map_size = _cnt_of_array( _nearest_grid_sample_kernel_map ); + vx_param_description_t * param_def = _nearest_grid_sample_kernel_param_def; + vx_kernel_initialize_f initializer = _nearest_grid_sample_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type); + out_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type); + + key = NEAREST_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _nearest_grid_sample_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_NEAREST_GRID_SAMPLE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + uint32_t final_in1_rank = 0; + vsi_nn_tensor_t* rs_tensors = NULL; + vsi_nn_tensor_t* final_tensors[3] = {NULL}; + vsi_nn_kernel_dtype_e in0_dtype; + uint32_t pad_val = 0; + int32_t align_corners = + vsi_nn_kernel_param_get_int32(params, "align_corners"); + + // Check if gpu can support the size + if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, + inputs[0]->attr.dim_num)) { + return NULL; + } + + if (!vsi_nn_kernel_gpu_check_shape(inputs[1]->attr.size, + inputs[1]->attr.dim_num)) { + return NULL; + } + + final_tensors[0] = inputs[0]; + + if (inputs[1]->attr.dim_num >= 3) { + final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0]; + final_shape[1] = inputs[1]->attr.size[2]; + final_shape[2] = 1; + final_shape[3] = + inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1; + final_in1_rank = + inputs[1]->attr.dim_num == 3 ? 
2 : inputs[1]->attr.dim_num; + if (!vsi_nn_kernel_gpu_check_shape(final_shape, final_in1_rank)) { + return NULL; + } + + rs_tensors = vsi_nn_reshape_tensor( + graph, inputs[1], final_shape, final_in1_rank); + final_tensors[1] = rs_tensors; + } else { + final_tensors[1] = inputs[1]; + } + final_tensors[2] = outputs[0]; + + in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + if (U8 == in0_dtype) { + pad_val = inputs[0]->attr.dtype.zero_point; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _NEAREST_GRID_SAMPLE_PARAM_NUM, + final_tensors, input_num, &final_tensors[2], output_num ); + node_params[SCALAR_ALIGN_CORNERS] = + vsi_nn_kernel_scalar_create(graph, I32, &align_corners); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _NEAREST_GRID_SAMPLE_PARAM_NUM ); + VSI_ASSERT(status == VSI_SUCCESS); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_ALIGN_CORNERS]); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + status = vxSetNodeAttribute( + (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); + CHECK_STATUS(status); + } + } + } + vsi_safe_release_tensor(rs_tensors); + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( nearest_grid_sample, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c index 5dc05023c..de2d35add 100644 --- a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c @@ -148,6 +148,8 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) int32_t srcFixPointPos = 0; vsi_nn_kernel_dtype_e input_dtype = F16; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -422,6 +424,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* rs_tensors[2] = { NULL }; vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; int32_t i = 0; + size_t j = 0; vsi_bool image_2d = FALSE; vsi_size_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); vsi_size_t prefix_dim_size = 1; @@ -505,11 +508,11 @@ static vsi_nn_kernel_node_t _setup vsi_nn_ReleaseTensor( &rs_tensors[1] ); } - for (i = SCALAR_INPUT_SUFFIX_SIZE; i < _ONE_HOT_PARAM_NUM; i++) + for (j = SCALAR_INPUT_SUFFIX_SIZE; j < _ONE_HOT_PARAM_NUM; j++) { - if (node_params[i]) + if (node_params[j]) { - vsi_nn_kernel_scalar_release( &node_params[i] ); + vsi_nn_kernel_scalar_release( &node_params[j] ); } } diff --git a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c index a625d97f8..e45704fe6 100644 --- a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c @@ -146,6 +146,8 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) int32_t output_ZP = 0; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c index b4d4f218c..679526e6a 100644 --- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c @@ -149,6 +149,8 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -377,7 +379,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -423,6 +425,10 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c index 498ee4528..52588a4d4 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c @@ -84,6 +84,8 @@ static vx_param_description_t vxPreProcessBgraKernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_BGRA_PARAM_NUM _cnt_of_array(vxPreProcessBgraKernel_param_def) @@ -115,6 +117,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -391,7 +395,7 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -449,6 +453,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -469,7 +476,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float bgra_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = 
vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. */ @@ -496,9 +505,11 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &bgra_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_BGRA_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[2] ); @@ -511,6 +522,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[9] ); vsi_nn_kernel_scalar_release( &tmp_params[10] ); vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); } } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c index 797c925b2..1973eb2a3 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c @@ -124,6 +124,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -224,6 +226,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -388,6 +392,8 @@ DEF_KERNEL_INITIALIZER(_resize_gray_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -516,7 +522,7 @@ static vsi_status _query_kernel vsi_nn_gray_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int32_t i = 0; + size_t i = 0; vsi_bool is_4_over_3 = FALSE; vsi_bool is_half_scale = FALSE; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); @@ -605,6 +611,9 @@ static vsi_nn_kernel_node_t _setup float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); vsi_bool is_no_range_change = FALSE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c 
b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c index fe39a5cfb..a0d76f4ba 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -112,6 +112,8 @@ static vx_param_description_t vxPreProcessNv12Kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_NV12_PARAM_NUM _cnt_of_array(vxPreProcessNv12Kernel_param_def) @@ -136,13 +138,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; - float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; - float outputScaleVar = 0.0f; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -152,10 +157,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &var); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; output_scale = 1.0f / attr[0]->scale; @@ -169,10 +178,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) order1 = 0; } - outputScaleVar = output_scale * var; - bMeanScaleVarZp = output_zp - bMean * outputScaleVar; - gMeanScaleVarZp = output_zp - gMean * outputScaleVar; - rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -255,7 +266,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", 
&outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); @@ -317,14 +330,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) uint32_t yrIntFloat_16 = 0; int32_t xRatio = 0; int32_t yRatio = 0; - float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; - float outputScaleVar = 0.0f; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; float resize = 0.0f; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -341,10 +357,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &var); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[1]->shape; output_scale = 1.0f / attr[1]->scale; @@ -364,10 +384,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1); yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1); - outputScaleVar = output_scale * var; - bMeanScaleVarZp = output_zp - bMean * outputScaleVar; - gMeanScaleVarZp = output_zp - gMean * outputScaleVar; - rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -472,7 +494,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16); status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= 
vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); @@ -537,7 +561,7 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); vsi_size_t dstWidth = outputs[0]->attr.size[0]; float scaleVal = (float)dstWidth / ((scale_x * dstWidth) >> 15); @@ -611,6 +635,9 @@ static vsi_nn_kernel_node_t _setup int32_t trans = 0; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -630,7 +657,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); int32_t nv_type = vsi_nn_kernel_param_get_int32( params, "nv_type" ); @@ -645,10 +674,12 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &nv_type ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[3] ); @@ -662,6 +693,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[11] ); vsi_nn_kernel_scalar_release( &tmp_params[12] ); vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); + vsi_nn_kernel_scalar_release( &tmp_params[15] ); } } vsi_safe_release_tensor(reshape_tensors[0]); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c index ddfc9b5a8..256f7e5ce 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c @@ -143,8 +143,10 @@ static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, 
VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, @@ -162,8 +164,10 @@ static vx_param_description_t _pre_process_rgb888_planar_sep_kernel_param_def[] {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, @@ -195,8 +199,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer) float output_zp = 0; float output_scale = 1; + int32_t reverse = 0; + int32_t rgb_order[4] = {0}; uint32_t width = 0; - uint32_t height = 0; + int32_t height = 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; @@ -210,30 +216,28 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer) attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); } CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 4], &reverse); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &height); CHECK_STATUS_FAIL_GOTO(status, OnError ); - out_shape = attr[0]->shape; - width = (uint32_t)(out_shape->data[0]); - height = (uint32_t)(out_shape->data[1]); - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if (reverse) { - if ( attr[0]->dfp.fl > 0 ) - { - output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } + rgb_order[0] = 2 * height; + rgb_order[1] = height; + rgb_order[2] = 0; } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + else { - output_zp = (float)attr[0]->asymm.zero_point; - output_scale /= attr[0]->asymm.scale; + rgb_order[0] = 0; + rgb_order[1] = height; + rgb_order[2] = 2 * height; } + out_shape = attr[0]->shape; + width = (uint32_t)(out_shape->data[0]); + output_scale /= attr[0]->scale; + output_zp = (float)attr[0]->zero_point; + shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; @@ -322,7 +326,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); - 
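[Editor's note] The rgb_order table computed above and passed to the shader on the next line appears to hold per-plane row offsets: with the output reshaped so the three colour planes are stacked along the row axis (see the shape[1] *= shape[2] reshape in _setup further down in this file's hunk), offsets of 0, height and 2*height select the destination plane, and "reverse" swaps red and blue to produce BGR. That reading is our interpretation of the patch, not something it states; the sketch below is a minimal, self-contained restatement for clarity only.

#include <stdint.h>

/* Illustration only (not driver code): per-plane row offsets into the
 * reshaped output, assuming planes are stacked along the row axis. */
static void fill_rgb_order(int32_t reverse, int32_t plane_height, int32_t rgb_order[3])
{
    if (reverse)
    {
        rgb_order[0] = 2 * plane_height;  /* R plane written last  */
        rgb_order[1] = plane_height;      /* G plane in the middle */
        rgb_order[2] = 0;                 /* B plane written first */
    }
    else
    {
        rgb_order[0] = 0;
        rgb_order[1] = plane_height;
        rgb_order[2] = 2 * plane_height;
    }
}
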
+ status |= vsi_nn_kernel_gpu_add_param(node, "rgb_order", &rgb_order); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); @@ -363,8 +367,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) float output_zp = 0; float output_scale = 1; - uint32_t width = 0; - uint32_t height = 0; + uint32_t width = 0; + int32_t height = 0; + int32_t reverse = 0; + int32_t rgb_order[4] = {0}; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; @@ -378,12 +384,25 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); } CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 4], &reverse); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &height); CHECK_STATUS_FAIL_GOTO(status, OnError ); + if (reverse) + { + rgb_order[0] = 2 * height; + rgb_order[1] = height; + rgb_order[2] = 0; + } + else + { + rgb_order[0] = 0; + rgb_order[1] = height; + rgb_order[2] = 2 * height; + } + out_shape = attr[0]->shape; width = (uint32_t)(out_shape->data[0]); - height = (uint32_t)(out_shape->data[1]); if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { @@ -435,6 +454,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) status = vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevLo_2x8", &uniDataMeanStddevLo_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevHi_2x8", &uniDataMeanStddevHi_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "rgb_order", &rgb_order); status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -464,11 +484,13 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - uint32_t width = 0; - uint32_t height = 0; - vsi_bool is_4_over_3 = 0; + uint32_t width = 0; + int32_t height = 0; + vsi_bool is_4_over_3 = 0; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + int32_t reverse = 0; + int32_t rgb_order[4] = {0}; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -482,12 +504,28 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer) } CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 4], &reverse); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + if (reverse) + { + rgb_order[0] = 2 * height; + rgb_order[1] = height; + rgb_order[2] = 0; + } + else + { + rgb_order[0] = 0; + rgb_order[1] = height; + rgb_order[2] = 2 * height; + } + out_shape = attr[1]->shape; width = (uint32_t)(out_shape->data[0]); - height = (uint32_t)(out_shape->data[1]); is_4_over_3 = (attr[0]->shape->data[0] * 3 == width * 4) && - (attr[0]->shape->data[1] * 3 == height * 4); + (attr[0]->shape->data[1] * 3 == 
(vsi_size_t)height * 4); if (is_4_over_3) { @@ -570,7 +608,7 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l01_4x4", &uniBilinear_4over3_l01_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l11_4x4", &uniBilinear_4over3_l11_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l21_4x4", &uniBilinear_4over3_l21_4x4); - + status |= vsi_nn_kernel_gpu_add_param(node, "rgb_order", &rgb_order); CHECK_STATUS_FAIL_GOTO(status, OnError ); } @@ -609,7 +647,7 @@ static vsi_status _query_kernel _internal_scale_e scale_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int32_t i = 0; + size_t i = 0; vsi_bool is_4_over_3 = FALSE; vsi_bool is_half_scale = FALSE; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); @@ -679,8 +717,7 @@ static vsi_status _query_kernel { kernel->info.initialize = _pre_process_rgb888_planar_initializer; } - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, - "vsi_nn_kernel_header", + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, pre_process_rgb888_planar_kernel_map[i].source_name ); vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, pre_process_rgb888_planar_kernel_map[i].source_name ); @@ -705,19 +742,31 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t* node_params = NULL; vsi_nn_kernel_node_t node = NULL; - int32_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM; + vsi_nn_tensor_t* reshape_tensor = NULL; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + size_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM; int32_t width = vsi_nn_kernel_param_get_int32( params, "width" ); int32_t height = vsi_nn_kernel_param_get_int32( params, "height" ); + int32_t output_height = (int32_t)outputs[0]->attr.size[1]; float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); vsi_bool is_no_range_change = FALSE; input_num = inputs[1] == NULL ? 1 : input_num; param_count = inputs[1] == NULL ? 
_PRE_PROCESS_RGB888_PLANAR_PARAM_NUM : param_count; - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + memcpy(shape, outputs[0]->attr.size, outputs[0]->attr.dim_num * sizeof(shape[0])); + shape[1] *= shape[2]; + shape[2] = 1; + reshape_tensor = vsi_nn_reshape_tensor( graph, + outputs[0], shape, outputs[0]->attr.dim_num ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -727,7 +776,9 @@ static vsi_nn_kernel_node_t _setup outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 && outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC && (float)outputs[0]->attr.dtype.zero_point == r_mean && r_mean == g_mean && r_mean == b_mean && - vsi_nn_abs(outputs[0]->attr.dtype.scale - scale) < 1e-8 ) + vsi_nn_abs(outputs[0]->attr.dtype.scale - r_scale) < 1e-8 && + vsi_nn_abs(outputs[0]->attr.dtype.scale - g_scale) < 1e-8 && + vsi_nn_abs(outputs[0]->attr.dtype.scale - b_scale) < 1e-8) { is_no_range_change = TRUE; } @@ -736,10 +787,11 @@ static vsi_nn_kernel_node_t _setup if ( VSI_SUCCESS == status) { node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count); + CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final ); node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { - uint32_t index = inputs[1] == NULL ? 4 : 6; + uint32_t index = inputs[1] == NULL ? 2 : 4; uint32_t scalar_index = index; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); @@ -748,7 +800,7 @@ static vsi_nn_kernel_node_t _setup /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, param_count, - inputs, input_num, outputs, output_num ); + inputs, input_num, &reshape_tensor, output_num ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); @@ -757,7 +809,11 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, param_count ); index = scalar_index; @@ -769,9 +825,14 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[index++] ); vsi_nn_kernel_scalar_release( &node_params[index++] ); vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); } } +final: vsi_nn_safe_free(node_params); return node; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c new file mode 100644 index 000000000..ae559dac1 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c @@ -0,0 +1,1002 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define RGB888_SEP_SOURCE_0 "pre_process_rgb888_planar_sep_nhwc_0", +#define RGB888_SEP_SOURCE_1 "pre_process_rgb888_planar_sep_nhwc_1", +#define RGB888_SEP_SOURCE_2 "pre_process_rgb888_planar_sep_nhwc_2", +#define RGB888_SOURCE_0 "pre_process_rgb888_planar_nhwc_0", +#define RGB888_SOURCE_1 "pre_process_rgb888_planar_nhwc_1", +#define RGB888_SOURCE_2 "pre_process_rgb888_planar_nhwc_2", + +#define STR(a) #a + +typedef enum +{ + COPY = 0, + SCALE, + FOUR_OVER_THREE, + HALF +} _internal_scale_e; + +// Add kernel hashtable here +#define PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, SEP, SCALE_FLAG ) \ + (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | ( SEP << 4 ) | (SCALE_FLAG)) + +#define PACK_KERNEL_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, SCALE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SOURCE_0 } + +#define PACK_KERNEL_SEP_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, SCALE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SEP_SOURCE_0 } + +#define PACK_KERNEL_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, COPY ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SOURCE_1 } + +#define PACK_KERNEL_SEP_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, COPY ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SEP_SOURCE_1 } + +#define PACK_KERNEL_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, HALF ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SOURCE_2 } + +#define PACK_KERNEL_SEP_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, HALF ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SEP_SOURCE_2 } +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _pre_process_rgb888_planar_nhwc_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_SCALE_MAP( U8, F16 ), + PACK_KERNEL_SCALE_MAP( U8, I16 ), + PACK_KERNEL_SCALE_MAP( U8, I8 ), + PACK_KERNEL_SCALE_MAP( U8, U8 ), + + PACK_KERNEL_COPY_MAP( U8, F16 ), + PACK_KERNEL_COPY_MAP( U8, I16 ), + PACK_KERNEL_COPY_MAP( U8, I8 ), + PACK_KERNEL_COPY_MAP( U8, U8 ), + + PACK_KERNEL_HALF_MAP( U8, U8 ), + + PACK_KERNEL_SEP_SCALE_MAP( U8, F16 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, I16 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, I8 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, U8 ), + + PACK_KERNEL_SEP_COPY_MAP( U8, F16 ), + PACK_KERNEL_SEP_COPY_MAP( U8, I16 ), + PACK_KERNEL_SEP_COPY_MAP( U8, I8 ), + PACK_KERNEL_SEP_COPY_MAP( U8, U8 ), + + 
PACK_KERNEL_SEP_HALF_MAP( U8, U8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ) + +static vx_param_description_t _pre_process_rgb888_planar_sep_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + int32_t reverse = 0; + uint32_t width = 0; + uint32_t height = 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )) + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + } + else + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + } + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &reverse); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + width = (uint32_t)(out_shape->data[0] / 3); + height = (uint32_t)(out_shape->data[1]); + output_scale /= 
attr[0]->scale; + output_zp = (float)attr[0]->zero_point; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniVecShift10 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000400, 0x00000000, 0x00000400, 0x00000000, + 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAddRShift = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002405, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetTempVal = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x05050505, // BSelt + 0x00110000, 0x00330022, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractBytes = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002414, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertIntergetoF32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveRGB_0_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x01000400, 0x06020105, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveRGB_1_2x8 = {{ + 0x00001111, // TCfg + 0x00001001, // ASelt + 0x03070302, 0x00000000, // ABin + 0x00002222, 
// BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveBGR_0_2x8 = {{ + 0x11111111, // TCfg + 0x01001001, // ASelt + 0x01000400, 0x06020105, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveBGR_1_2x8 = {{ + 0x00001111, // TCfg + 0x00000010, // ASelt + 0x03070302, 0x00000000, // ABin + 0x00002222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BitsDataInterleaveRGB_0_2x8= {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x01080400, 0x06020905, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BitsDataInterleaveRGB_1_2x8 = {{ + 0x00001111, // TCfg + 0x00000000, // ASelt + 0x0b07030a, 0x00000000, // ABin + 0x00002222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BitsDataInterleaveBGR_0_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x09000408, 0x060a0105, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BitsDataInterleaveBGR_1_2x8 = {{ + 0x00001111, // TCfg + 0x00000000, // ASelt + 0x03070b02, 0x00000000, // ABin + 0x00002222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10); + status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift); + status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + if (reverse) + { + status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8", + &uni16BitsDataInterleaveBGR_0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8", + &uni16BitsDataInterleaveBGR_1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8", + &uni8BitsDataInterleaveBGR_0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8", + 
&uni8BitsDataInterleaveBGR_1_2x8); + } + else + { + status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8", + &uni16BitsDataInterleaveRGB_0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8", + &uni16BitsDataInterleaveRGB_1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8", + &uni8BitsDataInterleaveRGB_0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8", + &uni8BitsDataInterleaveRGB_1_2x8); + } + + if (attr[0]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_rgb888_planar_initializer() */ + +DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + uint32_t width = 0; + uint32_t height = 0; + int32_t reverse = 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )) + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + } + else + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + } + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &reverse); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + width = (uint32_t)(out_shape->data[0] / 3); + height = (uint32_t)(out_shape->data[1]); + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if ( attr[0]->dfp.fl > 0 ) + { + output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); + } + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + output_zp = (float)attr[0]->asymm.zero_point; + output_scale /= attr[0]->asymm.scale; + } + + if (attr[0]->dtype == F16 || attr[0]->dtype == I16) + { + shaderParam.global_scale[0] = 4; + } + else + { + shaderParam.global_scale[0] = 8; + } + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniDataMeanStddevLo_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveRGB_0_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x01000400, 0x06020105, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveRGB_1_2x8 = {{ + 0x00001111, // TCfg + 0x00001001, // ASelt + 0x03070302, 0x00000000, // ABin + 0x00002222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveBGR_0_2x8 = {{ + 0x11111111, // TCfg + 0x01001001, // ASelt + 0x01000400, 0x06020105, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveBGR_1_2x8 = {{ + 0x00001111, // TCfg + 0x00000010, // ASelt + 0x03070302, 0x00000000, // ABin + 0x00002222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveRGB_0_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x01000800, 0x0a020109, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveRGB_1_2x8 = {{ + 0x11111111, // TCfg + 0x01001001, // ASelt + 0x030b0302, 0x05040c04, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveRGB_2_2x8 = {{ + 0x11111111, // TCfg + 0x10010010, // ASelt + 0x0e06050d, 0x070f0706, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveBGR_0_2x8 = {{ + 0x11111111, // TCfg + 0x01001001, // ASelt + 0x01000800, 0x0a020109, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveBGR_1_2x8 = {{ + 0x11111111, // TCfg + 0x10010010, // ASelt + 0x030b0302, 0x05040c04, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveBGR_2_2x8 = {{ + 0x11111111, // TCfg 
+            0x00100100, // ASelt
+            0x0e06050d, 0x070f0706, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+
+        status = vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevLo_2x8", &uniDataMeanStddevLo_2x8);
+        if (reverse)
+        {
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8",
+                    &uni16BitsDataInterleaveBGR_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8",
+                    &uni16BitsDataInterleaveBGR_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+                    &uni8BDataInterleaveBGR_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+                    &uni8BDataInterleaveBGR_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+                    &uni8BDataInterleaveBGR_2_2x8);
+        }
+        else
+        {
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8",
+                    &uni16BitsDataInterleaveRGB_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8",
+                    &uni16BitsDataInterleaveRGB_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+                    &uni8BDataInterleaveRGB_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+                    &uni8BDataInterleaveRGB_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+                    &uni8BDataInterleaveRGB_2_2x8);
+        }
+        status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
+        status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
+        CHECK_STATUS_FAIL_GOTO(status, OnError );
+    }
+
+OnError:
+    if (attr[0])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[0] );
+        attr[0] = NULL;
+    }
+    return status;
+} /* _pre_process_rgb888_planar_copy_initializer() */
+
+DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer)
+    (
+    vsi_nn_kernel_node_t node,
+    const vsi_nn_kernel_node_param_t * param,
+    size_t param_size
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    gpu_param_t shaderParam = {
+        2,         // workdim
+        {0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
+        {0, 0, 0}, // globalWorkScale: how many pixels can be processed by a single thread
+        {0, 0, 0}, // localWorkSize: local group size in threads
+        {0, 0, 0}};// globalWorkSize: image size in threads
+
+    vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
+    int32_t reverse = 0;
+
+    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
+    if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
+    {
+        attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
+    }
+    else
+    {
+        attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+    }
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
+
+    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &reverse);
+    CHECK_STATUS_FAIL_GOTO(status, OnError );
+
+    {
+        shaderParam.global_scale[0] = 16;
+        shaderParam.global_scale[1] = 2;
+        shaderParam.global_size[0] = gpu_align_p2((attr[0]->shape->data[0] + shaderParam.global_scale[0] - 1)
+                / shaderParam.global_scale[0], 4);
+        shaderParam.global_size[1] = (attr[0]->shape->data[1] + shaderParam.global_scale[1] - 1)
+                / shaderParam.global_scale[1];
+    }
+
+    status = vsi_nn_kernel_gpu_config( node, &shaderParam );
+    CHECK_STATUS_FAIL_GOTO(status, OnError);
+
+    {
+        gpu_dp_inst_t uni8BDataInterleaveRGB_0_2x8 = {{
+            0x11111111, // TCfg
+            0x00100100, // ASelt
+            0x01000800, 0x0a020109, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uni8BDataInterleaveRGB_1_2x8 = {{
+            0x11111111, // TCfg
+            0x01001001, // ASelt
+            0x030b0302, 0x05040c04, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uni8BDataInterleaveRGB_2_2x8 = {{
+            0x11111111, // TCfg
+            0x10010010, // ASelt
+            0x0e06050d, 0x070f0706, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uni8BDataInterleaveBGR_0_2x8 = {{
+            0x11111111, // TCfg
+            0x01001001, // ASelt
+            0x01000800, 0x0a020109, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uni8BDataInterleaveBGR_1_2x8 = {{
+            0x11111111, // TCfg
+            0x10010010, // ASelt
+            0x030b0302, 0x05040c04, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uni8BDataInterleaveBGR_2_2x8 = {{
+            0x11111111, // TCfg
+            0x00100100, // ASelt
+            0x0e06050d, 0x070f0706, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+
+        if (reverse)
+        {
+            status = vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+                    &uni8BDataInterleaveBGR_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+                    &uni8BDataInterleaveBGR_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+                    &uni8BDataInterleaveBGR_2_2x8);
+        }
+        else
+        {
+            status = vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+                    &uni8BDataInterleaveRGB_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+                    &uni8BDataInterleaveRGB_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+                    &uni8BDataInterleaveRGB_2_2x8);
+        }
+        CHECK_STATUS_FAIL_GOTO(status, OnError );
+    }
+
+OnError:
+    if (attr[0])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[0] );
+        attr[0] = NULL;
+    }
+    if (attr[1])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[1] );
+        attr[1] = NULL;
+    }
+
+    return status;
+} /* _resize_rgb888_planar_initializer() */
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+    (
+    vsi_nn_tensor_t * const * const inputs,
+    vsi_nn_tensor_t * const *
const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + vsi_bool is_no_range_change, + int32_t width, + int32_t height + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + _internal_scale_e scale_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + size_t i = 0; + vsi_bool is_half_scale = FALSE; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + vsi_bool is_rgb888_sep = (vsi_bool)(inputs[1] != NULL); + + is_half_scale = (width == (int32_t)outputs[0]->attr.size[0] * 2) && + (height == (int32_t)outputs[0]->attr.size[1] * 2); + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (enable_copy) + { + scale_type = COPY; + } + else + { + if (is_no_range_change && is_half_scale) + { + scale_type = HALF; + } + else + { + scale_type = SCALE; + } + } + + key = PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( input0_dtype, output_dtype, is_rgb888_sep, scale_type); + + for ( i = 0; i < _cnt_of_array(_pre_process_rgb888_planar_nhwc_kernel_map); i ++ ) + { + if ( _pre_process_rgb888_planar_nhwc_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_pre_process_rgb888_planar_nhwc_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", + _pre_process_rgb888_planar_nhwc_kernel_map[i].function_name ); + + if (is_rgb888_sep) + { + kernel->info.parameters = _pre_process_rgb888_planar_sep_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ); + } + else + { + kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ); + } + + if (enable_copy) + { + kernel->info.initialize = _pre_process_rgb888_planar_copy_initializer; + } + else if (scale_type == HALF) + { + kernel->info.initialize = _resize_rgb888_planar_initializer; + } + else + { + kernel->info.initialize = _pre_process_rgb888_planar_initializer; + } + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + _pre_process_rgb888_planar_nhwc_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _pre_process_rgb888_planar_nhwc_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t* node_params = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensor = NULL; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + size_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM; + int32_t width = vsi_nn_kernel_param_get_int32( params, "width" ); + int32_t height = vsi_nn_kernel_param_get_int32( params, "height" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); + int32_t 
reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + vsi_bool is_no_range_change = FALSE; + + input_num = inputs[1] == NULL ? 1 : input_num; + param_count = inputs[1] == NULL ? _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM : param_count; + + memcpy(shape, outputs[0]->attr.size, outputs[0]->attr.dim_num * sizeof(shape[0])); + shape[0] *= shape[1]; + shape[1] = shape[2]; + shape[2] = 1; + reshape_tensor = vsi_nn_reshape_tensor( graph, + outputs[0], shape, outputs[0]->attr.dim_num ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 && + outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC && + (float)outputs[0]->attr.dtype.zero_point == r_mean && r_mean == g_mean && r_mean == b_mean && + vsi_nn_abs(outputs[0]->attr.dtype.scale - r_scale) < 1e-8 && + vsi_nn_abs(outputs[0]->attr.dtype.scale - g_scale) < 1e-8 && + vsi_nn_abs(outputs[0]->attr.dtype.scale - b_scale) < 1e-8) + { + is_no_range_change = TRUE; + } + + status = _query_kernel( inputs, outputs, kernel, params, is_no_range_change, width, height ); + if ( VSI_SUCCESS == status) + { + node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count); + CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final ); + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = inputs[1] == NULL ? 2 : 4; + uint32_t scalar_index = index; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, param_count, + inputs, input_num, &reshape_tensor, output_num ); + + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); + /* Pass parameters to node. 
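+       The per-channel g_scale and b_scale scalars are appended after reverse, so reverse stays
+       at index param_size - 3 in both the packed (13-param) and separate-plane (15-param)
+       layouts, which is where the kernel initializers above read it.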
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, param_count ); + index = scalar_index; + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + } + } + +final: + vsi_nn_safe_free(node_params); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_rgb888_planar_nhwc, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c index 5fda28142..984293bcb 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -106,6 +106,8 @@ static vx_param_description_t vxPreProcessRgbKernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_RGB_PARAM_NUM _cnt_of_array(vxPreProcessRgbKernel_param_def) @@ -126,19 +128,24 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) float outputZP = 0; float outputScale = 1; - int32_t reorder = 0; - int32_t trans = 0; - int32_t xRatio = 0; - int32_t yRatio = 0; - int32_t order1 = 2; - uint32_t width = 0; - uint32_t height = 0; - int32_t enable_copy= 0; - uint32_t pack_key = 0; + int32_t reorder = 0; + int32_t trans = 0; + int32_t xRatio = 0; + int32_t yRatio = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + int32_t enable_copy = 0; + uint32_t pack_key = 0; + float rgb_mean[4] = {0}; + float rgb_scale[4] = {0}; + float param_data[4] = {0}; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -148,6 +155,18 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &rgb_mean[0]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rgb_mean[1]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &rgb_mean[2]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &rgb_scale[0]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[12], &rgb_scale[1]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = 
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &rgb_scale[2]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; width = (uint32_t)(out_shape->data[0]); @@ -417,6 +436,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) case _PACK_SELECT_KEY( 1, 0, 0): // copy case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder { + int32_t i = 0; + for (i = 0;i < 3; i ++) + { + rgb_scale[i] *= outputScale; + param_data[i] = rgb_mean[i] * rgb_scale[i] - outputZP; + } if (attr[0]->dtype == I8 || attr[0]->dtype == U8) { shaderParam.global_scale[0] = 16; @@ -454,6 +479,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part3_4x4", &uniExtractBtoF32_part3_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1); + status |= vsi_nn_kernel_gpu_add_param(node, "rgb_scale", &rgb_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "param_data", ¶m_data); CHECK_STATUS_FAIL_GOTO(status, OnError); } break; @@ -486,6 +513,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); CHECK_STATUS_FAIL_GOTO(status, OnError); } break; @@ -493,10 +522,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) break; } - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); - status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); } @@ -523,7 +548,7 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -580,6 +605,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -600,7 +628,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. 
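+       r_scale reuses the slot of the removed rgb_scale scalar (param[9]), while g_scale and
+       b_scale are appended as param[12] and param[13]; the initializer reads the three scales
+       from exactly those indices, so the existing reverse/trans positions are unchanged.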
*/ @@ -616,9 +646,11 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[2] ); @@ -631,6 +663,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[9] ); vsi_nn_kernel_scalar_release( &tmp_params[10] ); vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); } } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c index 8e5f77949..eb9d16056 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -99,6 +99,8 @@ static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_YUV420_PARAM_NUM _cnt_of_array(vxPreProcessYuv420Kernel_param_def) @@ -128,6 +130,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -496,6 +500,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -833,7 +839,7 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -900,6 +906,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -920,7 +929,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( 
params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. */ @@ -935,9 +946,11 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[4] ); @@ -950,6 +963,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[11] ); vsi_nn_kernel_scalar_release( &tmp_params[12] ); vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); + vsi_nn_kernel_scalar_release( &tmp_params[15] ); } } if (reshape_tensors[0]) diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c index ca397de23..61d421d27 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c @@ -99,6 +99,8 @@ static vx_param_description_t vxPreProcessyuv422Kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_YUV422_PARAM_NUM _cnt_of_array(vxPreProcessyuv422Kernel_param_def) @@ -126,13 +128,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; - float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; - float outputScaleVar = 0.0f; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -142,10 +147,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var); + status 
= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &r_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; output_scale = 1.0f / attr[0]->scale; @@ -159,10 +168,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) order1 = 0; } - outputScaleVar = output_scale * var; - bMeanScaleVarZp = output_zp - bMean * outputScaleVar; - gMeanScaleVarZp = output_zp - gMean * outputScaleVar; - rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -245,7 +256,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYUVtoShortSub_2x8", &uniExtractYUVtoShortSub_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); @@ -308,13 +321,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) uint32_t yrIntFloat_16 = 0; int32_t xRatio = 0; int32_t yRatio = 0; - float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; - float outputScaleVar = 0.0f; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -328,10 +344,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &r_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status 
= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; output_scale = 1.0f / attr[0]->scale; @@ -350,10 +370,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1); yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1); - outputScaleVar = output_scale * var; - bMeanScaleVarZp = output_zp - bMean * outputScaleVar; - gMeanScaleVarZp = output_zp - gMean * outputScaleVar; - rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -445,7 +467,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toR_4x4", &uniConvertYUV422toR_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16); status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); @@ -503,9 +527,11 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + VSI_UNREFERENCED(scale_x); + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -570,6 +596,9 @@ static vsi_nn_kernel_node_t _setup int32_t trans = 0; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -589,7 +618,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); int32_t yuv422_type = vsi_nn_kernel_param_get_int32( params, "yuv422_type" ); @@ -604,10 +635,12 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = 
vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &yuv422_type ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV422_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[2] ); @@ -621,6 +654,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[10] ); vsi_nn_kernel_scalar_release( &tmp_params[11] ); vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); } } vsi_safe_release_tensor(reshape_tensors[0]); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c index 7c7efc765..4c322a8fc 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ -95,6 +95,8 @@ static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_YUV444_PARAM_NUM _cnt_of_array(vxPreProcessYuv444Kernel_param_def) @@ -123,6 +125,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -488,6 +492,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -845,7 +851,7 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -910,6 +916,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -930,7 +939,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); 
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. */ @@ -944,9 +955,11 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[4] ); @@ -959,6 +972,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[11] ); vsi_nn_kernel_scalar_release( &tmp_params[12] ); vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); + vsi_nn_kernel_scalar_release( &tmp_params[15] ); } } if(reshape_tensors[0]) diff --git a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c index c007a088e..bed0b6c46 100644 --- a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c @@ -142,6 +142,8 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) vx_context ctx = vxGetContext((vx_reference)node); vx_hardware_caps_params_t hw_param; + VSI_UNREFERENCED(param_size); + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); CHECK_STATUS_FAIL_GOTO(status, final); @@ -531,7 +533,7 @@ static vsi_status _query_kernel vsi_nn_shader_type_e sh_type = image_2d ? (input_fl >= output_fl ? 
_2D_OPT : _2D) : _3D; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -583,6 +585,9 @@ static vsi_nn_kernel_node_t _setup vsi_bool ret; int32_t is_per_channel_alpha = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); if (is_per_channel_alpha) diff --git a/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c b/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c index daa40605e..cac4e3b13 100644 --- a/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c @@ -35,7 +35,6 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -151,6 +150,8 @@ DEF_KERNEL_INITIALIZER(_multinomial_initializer) vsi_nn_kernel_tensor_attr_t * attr = NULL; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -196,6 +197,8 @@ DEF_KERNEL_INITIALIZER(_cdf_initializer) uint32_t class_size = 0; uint32_t batch = 0; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -292,6 +295,8 @@ DEF_KERNEL_INITIALIZER(_seed_initializer) float rand_max = (float)(pow(2.0,32)); float re_rand_max = 1 / rand_max; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -425,20 +430,24 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkey = 0; int32_t i; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + // Check if gpu can support the size - if( !vsi_nn_kernel_gpu_check_shape( + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); // Assign unique_id ikernels[i]->unique_id = kernel->unique_id; } - if( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) { class_max_stride = (int32_t)gpu_align_p2(inputs[0]->attr.size[0], 4); } @@ -453,17 +462,20 @@ static vsi_nn_kernel_node_t _setup attr.is_const = FALSE; attr.vtl = TRUE; tensors[SEED_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(tensors[SEED_INDEX], "Create tensor failed", final); attr.size[0] = class_max_stride * inputs[0]->attr.size[1]; attr.size[1] = inputs[0]->attr.size[1]; attr.dim_num = 2; tensors[CDF_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(tensors[CDF_INDEX], "Create tensor failed", final); memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) ); attr.size[1] = 1; attr.dim_num = 2; tensors[SEEDS_INDEX] = vsi_nn_reshape_tensor( graph, inputs[1], attr.size, attr.dim_num ); + CHECK_PTR_FAIL_GOTO(tensors[SEEDS_INDEX], "Create tensor failed", final); in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); in1_dtype = 
vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -474,17 +486,17 @@ static vsi_nn_kernel_node_t _setup hashkey = MULTINOMIAL_HASH_KEY( F32, F32, out_dtype ); status = _query_kernel( ikernels[SEED_INDEX], hashkeys[SEED_INDEX], INTERNAL_KERNEL_SEED ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } status = _query_kernel( ikernels[CDF_INDEX], hashkeys[CDF_INDEX], INTERNAL_KERNEL_CDF ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_MULTINOMIAL ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } @@ -518,13 +530,13 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. */ final: - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { - if( ikernels[i] ) + if ( ikernels[i] ) { vsi_nn_kernel_release( &ikernels[i] ); } - if( tensors[i] ) + if ( tensors[i] ) { vsi_nn_ReleaseTensor( &tensors[i] ); } diff --git a/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c index caf40b973..a133a121e 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c @@ -111,6 +111,8 @@ DEF_KERNEL_INITIALIZER(_reduceall_internal_initializer) vsi_size_array_t * output_shape = NULL; int32_t axisSize = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c index df45307c9..11aa099ec 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c @@ -111,6 +111,8 @@ DEF_KERNEL_INITIALIZER(_reduceany_internal_initializer) vsi_size_array_t * output_shape = NULL; int32_t axisSize = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c index e70b58a52..efb52f080 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c @@ -159,6 +159,8 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) float outputScale = 1.0f; float output_offset_asymmetric = 0.0f; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c index b1149fd59..d9bd40d8a 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c @@ -161,6 +161,8 @@ 
DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) float outputScale = 1.0f; float output_offset_asymmetric = 0.0f; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c index 6fd1b7d63..3c710f599 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c @@ -167,6 +167,8 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) float outputScale = 1.0f; float output_offset_asymmetric = 0.0f; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c index d7cb58d43..131111732 100644 --- a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c @@ -141,6 +141,8 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) int32_t srcFixPointPos = 0; int32_t dstFixPointPos = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c index 7fe19bc70..164ab495c 100644 --- a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c @@ -147,6 +147,8 @@ DEF_KERNEL_INITIALIZER(_preprocess_initializer) vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL}; int32_t width = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -212,6 +214,8 @@ DEF_KERNEL_INITIALIZER(_repeat_initializer) int32_t is1d = 0; int32_t axis = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -303,7 +307,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; int32_t is1d = inputs[0]->attr.dim_num == 1 ? 
1 : 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -453,6 +457,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank[2] = {0, 0}; int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + // Check if gpu can support the size if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -497,7 +504,7 @@ static vsi_nn_kernel_node_t _setup attr.size[1] = 1; attr.dim_num = 2; tensor_preprocess = vsi_nn_CreateTensor( graph, &attr ); - + CHECK_PTR_FAIL_GOTO( tensor_preprocess, "Create tensor fail.", final ); // preprocess tmp_node = vsi_nn_kernel_create_node( graph, kernel_preprocess ); if (tmp_node) diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c index f893feaf2..95c33b80b 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" #include "utils/vsi_nn_dtype_util_prv.h" __BEGIN_DECLS @@ -855,7 +854,6 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) else if (F16 == output_dtype) { status = vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertFp2FP32_left_4x4", &uniConvertFp2FP32_left_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertFp2FP32_right_4x4", @@ -1187,7 +1185,7 @@ static vsi_nn_tensor_t* _create_scale_tensor uint32_t dims = output->attr.dim_num; vsi_size_t batch = dims > 3 ? 
output->attr.size[3] : 1; vsi_size_t width = output->attr.size[0]; - vsi_size_t sizes[4] = {width * 2, 1, 1, batch}; + vsi_size_t sizes[4] = { 0, 0, 0, 0 }; vsi_size_t item_count = width * 2 * batch; vsi_size_t input_width = input->attr.size[0]; vsi_size_t x = 0; @@ -1195,6 +1193,10 @@ static vsi_nn_tensor_t* _create_scale_tensor float width_scale = 1.0f; uint16_t *scale_data_ptr = NULL; + sizes[0] = width * 2; + sizes[1] = 1; + sizes[2] = 1; + sizes[3] = batch; if (align_corners && width > 1) { width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(width - 1); @@ -1310,6 +1312,7 @@ static vsi_nn_kernel_node_t _setup if (is_run_opt_kernel) { scale = _create_scale_tensor(graph, inputs[0], outputs[0], align_corners, half_pixel_centers); + CHECK_PTR_FAIL_GOTO( scale, "Create tensor fail.", final ); node_params[SCALAR_TENSOR_SCALE] = (vsi_nn_kernel_node_param_t)(scale->t); node_params_num = _RESIZE_1D_BILINEAR_PARAM_NUM; } @@ -1325,16 +1328,18 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_TYPE] ); } + } + } - if (is_run_opt_kernel) - { - if (scale) - { - vsi_nn_ReleaseTensor(&scale); - } - } +final: + if (is_run_opt_kernel) + { + if (scale) + { + vsi_nn_ReleaseTensor(&scale); } } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c index be1cd0972..fddd1e381 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c @@ -144,6 +144,8 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) float half_pixel_value = 0.0f; float round_value = 0.0f; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index 1e79cbfe3..ebfe9ed38 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -868,6 +868,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer) vsi_bool is_4x_up_kernel = FALSE; vsi_bool is_8x_up_kernel = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -1167,6 +1169,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_align_corners_opt_initializer) uint32_t out_height = 0; vsi_bool is_8x_align_corners = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -1490,7 +1494,7 @@ static vsi_nn_tensor_t* _create_scale_tensor vsi_size_t width = output->attr.size[0]; vsi_size_t height = output->attr.size[1]; vsi_size_t batch = dims > 3 ? 
output->attr.size[3] : 1; - vsi_size_t sizes[4] = {width * 4, height, 1, batch}; + vsi_size_t sizes[4] = { 0, 0, 0, 0 }; vsi_size_t item_count = width * 4 * height * batch; vsi_size_t input_width = input->attr.size[0]; vsi_size_t input_height = input->attr.size[1]; @@ -1501,6 +1505,10 @@ static vsi_nn_tensor_t* _create_scale_tensor float height_scale = 1.0f; uint16_t *scale_data_ptr = NULL; + sizes[0] = width * 4; + sizes[1] = height; + sizes[2] = 1; + sizes[3] = batch; if (align_corners && width > 1) { width_scale = ((float)(input_width - 1) * 1.0f) / (float)(width - 1); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c index b8e634e4e..596d528f7 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c @@ -137,6 +137,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_nhwc_initializer) vsi_bool is_3x_up_kernel = FALSE; vsi_bool is_4x_up_kernel = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -433,6 +435,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_nhwc_bound_initializer) vsi_bool is_3x_up_kernel = FALSE; vsi_bool is_4x_up_kernel = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c index 4d0189327..6bf9ba87c 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c @@ -145,6 +145,8 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) float half_pixel_value = 0.0f; float round_value = 0.0f; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c index 9876ebc71..bba21eabb 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c @@ -188,6 +188,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) int32_t coord_dim = 0; int32_t offsetX = 0, offsetY = 0, offsetZ = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -345,6 +347,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer) int32_t coord_dim = 0; int32_t offsetX = 0, offsetY = 0, offsetZ = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -457,7 +461,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input1_dtype = U8; 
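Two mechanical changes recur throughout the hunks in this patch: initializer and setup functions gain VSI_UNREFERENCED(...) calls for parameters they never read, and kernel-map loop counters move from int to size_t so they compare cleanly against the unsigned element count produced by _cnt_of_array(). The short, self-contained sketch below illustrates that pattern only; the macro bodies, the array, and the function names in it are illustrative assumptions, not the driver's actual definitions.

#include <stddef.h>
#include <stdio.h>

/* Illustrative stand-ins; the real definitions live in the driver headers. */
#define VSI_UNREFERENCED(x)  ((void)(x))
#define _cnt_of_array(arr)   (sizeof(arr) / sizeof((arr)[0]))

static const char *kernel_names[] = { "reset", "update", "ref", "copy" };

/* A callback that ignores param_size: VSI_UNREFERENCED keeps
 * unused-parameter warnings quiet without changing behavior. */
static int example_initializer(void *node, size_t param_size)
{
    VSI_UNREFERENCED(node);
    VSI_UNREFERENCED(param_size);
    return 0;
}

int main(void)
{
    size_t i; /* size_t matches the unsigned result of _cnt_of_array(). */

    for (i = 0; i < _cnt_of_array(kernel_names); i++)
    {
        printf("%s\n", kernel_names[i]);
    }

    return example_initializer(NULL, 0);
}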
vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(coord_dim); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -517,6 +523,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t width = 0, area = 0; int32_t big_flg = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (coord_dim > 3) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index e9d6d5dd0..43ea15c3f 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -45,64 +45,82 @@ __BEGIN_DECLS #define KERNEL_SOURCE_2 "scatter_nd_update_big" #define KERNEL_SOURCE_3 "scatter_nd_update_atom" #define KERNEL_SOURCE_4 "scatter_nd_update_special" +#define KERNEL_SOURCE_5 "scatter_nd_update_qint" +#define KERNEL_SOURCE_6 "scatter_nd_update_fp" -#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _pre_op, _large_type) \ - ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_pre_op << 4) | (_large_type)) +#define HASH_SCATTER_ND_UPDATE_KEY(_in0_type, _in2_type, _out_type, _stage, _coord_type, _opt_flg) \ + ((_in0_type << 24) | (_in2_type << 16) | (_out_type << 8) | (_stage << 4) | (_coord_type << 2) | (_opt_flg)) -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(SRC0_TYPE, SRC2_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE#SRC2_TYPE"to"#DST_TYPE) +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_reset_"#SRC0_TYPE"to"#DST_TYPE) -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_BIG_NAME(SRC0_TYPE, SRC2_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE#SRC2_TYPE"to"#DST_TYPE"_big") +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(SRC2_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_update_"#SRC2_TYPE) -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PRE_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE"_pre") +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_4X_NAME(SRC2_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_update_"#SRC2_TYPE"_4X") - #define HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME() \ - CVIVANTE_NAMESPACE("evis.scatter_nd_update_reset") +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(SRC2_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_ref_"#SRC2_TYPE"to"#DST_TYPE) -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(SRC0_TYPE, DST_TYPE) \ +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_4X_NAME(SRC2_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_ref_"#SRC2_TYPE"to"#DST_TYPE"_4X") + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_copy_"#DST_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_REF_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.scatter_nd_update_ref2out_"#SRC0_TYPE"to"#DST_TYPE) -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(SRC2_TYPE, DST_TYPE) \ +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_UPDATE_NAME(SRC2_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.scatter_nd_update_update2ref_"#SRC2_TYPE"to"#DST_TYPE"_16x") -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(DST_TYPE) \ +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_COPY_NAME(DST_TYPE) \ 
CVIVANTE_NAMESPACE("evis.scatter_nd_update_cpy2out_"#DST_TYPE"to"#DST_TYPE) -#define TENSOR_SCATTER_ND_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0, 0), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 4, 1, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_REF_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 5, 1, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_UPDATE_NAME(IN2_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 6, 1, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_COPY_NAME(IN0_TYPE), \ SOURCE }, -#define TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_BIG_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, OUT_TYPE, 0, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, -#define TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(IN0_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, I32, I32, 1, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_PRE_NAME(IN0_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(IN2_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(IN2_TYPE), \ SOURCE }, - #define TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(I32, I32, I32, 2, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME(), \ +#define TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(IN2_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, 0, 1), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_4X_NAME(IN2_TYPE), \ SOURCE }, -#define TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 3, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(IN0_TYPE, OUT_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, OUT_TYPE, 2, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(IN2_TYPE, OUT_TYPE), \ SOURCE }, -#define TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 4, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(IN2_TYPE, OUT_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, OUT_TYPE, 2, 0, 1), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_4X_NAME(IN2_TYPE, OUT_TYPE), \ SOURCE }, -#define TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 5, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(IN0_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(OUT_TYPE, SOURCE) \ + { 
HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 3, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(OUT_TYPE), \ SOURCE }, typedef struct @@ -112,93 +130,118 @@ typedef struct const char * source_name; } _kernel_map_type; -static const _kernel_map_type scatter_nd_update_map[] = +static const _kernel_map_type scatter_nd_update_reset_map[] = { - TENSOR_SCATTER_ND_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(BF16, I32, BF16, BF16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(U8, I32, U8, F16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(I8, I32, I8, F16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(I16, I32, I16, F16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(F16, I32, F16, U8, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_2) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(F16, I32, F16, U8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(U8, U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I8, I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I16, I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(F16, F16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(BF16, BF16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(U8, F16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I8, F16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I16, F16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(F16, U8, KERNEL_SOURCE_5) }; -static const _kernel_map_type scatter_nd_update_reset_map[] = +static const _kernel_map_type scatter_nd_update_update_map[] = { - TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(F16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(BF16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(F16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(BF16, KERNEL_SOURCE_6) }; -static const _kernel_map_type scatter_nd_update_pre_map[] = +static const _kernel_map_type scatter_nd_update_ref_map[] = { - TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(U8, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(I8, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(I16, KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, F16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(F32, F16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(F32, BF16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, F16, 
KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(F32, F16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(F32, BF16, KERNEL_SOURCE_6) }; -static const _kernel_map_type scatter_nd_update_post_map[] = +static const _kernel_map_type scatter_nd_update_copy_map[] = { - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(U8, I32, U8, F16, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I8, I32, I8, F16, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I16, I32, I16, F16, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(F16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(BF16, KERNEL_SOURCE_5) }; -static const _kernel_map_type scatter_nd_update_ref_map[] = +static const _kernel_map_type scatter_nd_update_special_ref_map[] = { - TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) - TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) }; -static const _kernel_map_type scatter_nd_update_update_map[] = +static const _kernel_map_type scatter_nd_update_special_update_map[] = { - TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) - TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) }; -static const _kernel_map_type scatter_nd_update_copy_map[] = +static const _kernel_map_type scatter_nd_update_special_copy_map[] = { - TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) - TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) }; /* * Kernel params */ -static vx_param_description_t _scatter_nd_update_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_reset_kernel_param_def[] = { - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_reset_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_update_kernel_param_def[] = { + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, 
VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_pre_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_ref_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - //{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, @@ -206,22 +249,17 @@ static vx_param_description_t _scatter_nd_update_pre_kernel_param_def[] = // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_post_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_copy_kernel_param_def[] = { - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_ref_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_special_ref_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -229,7 +267,7 @@ static vx_param_description_t _scatter_nd_update_ref_kernel_param_def[] = // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_update_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_special_update_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -243,7 +281,7 @@ static vx_param_description_t _scatter_nd_update_update_kernel_param_def[] = // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_copy_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_special_copy_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -251,13 +289,14 @@ static vx_param_description_t _scatter_nd_update_copy_kernel_param_def[] = // Add kererl parameters here 
}; -#define _SCATTER_ND_UPDATE_PARAM_NUM _cnt_of_array( _scatter_nd_update_kernel_param_def ) -#define _SCATTER_ND_UPDATE_PRE_PARAM_NUM _cnt_of_array( _scatter_nd_update_pre_kernel_param_def ) -#define _SCATTER_ND_UPDATE_POST_PARAM_NUM _cnt_of_array( _scatter_nd_update_post_kernel_param_def ) #define _SCATTER_ND_UPDATE_RESET_PARAM_NUM _cnt_of_array( _scatter_nd_update_reset_kernel_param_def ) -#define _SCATTER_ND_UPDATE_REF_PARAM_NUM _cnt_of_array( _scatter_nd_update_ref_kernel_param_def ) -#define _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM _cnt_of_array( _scatter_nd_update_update_kernel_param_def ) -#define _SCATTER_ND_UPDATE_COPY_PARAM_NUM _cnt_of_array( _scatter_nd_update_copy_kernel_param_def ) +#define _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM _cnt_of_array(_scatter_nd_update_update_kernel_param_def) +#define _SCATTER_ND_UPDATE_REF_PARAM_NUM _cnt_of_array(_scatter_nd_update_ref_kernel_param_def) +#define _SCATTER_ND_UPDATE_COPY_PARAM_NUM _cnt_of_array(_scatter_nd_update_copy_kernel_param_def) + +#define _SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM _cnt_of_array(_scatter_nd_update_special_ref_kernel_param_def) +#define _SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM _cnt_of_array(_scatter_nd_update_special_update_kernel_param_def) +#define _SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM _cnt_of_array(_scatter_nd_update_special_copy_kernel_param_def) static vsi_status get_scatter_nd_update_tensor_reshape_size ( @@ -265,24 +304,17 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, uint32_t coordDim, - vsi_size_t* width, - vsi_size_t* area, - vsi_size_t* vol, + vsi_size_t strides[VSI_NN_MAX_DIM_NUM], int32_t* newDim, int32_t* isBig ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; uint32_t dims_num = inputs[0]->attr.dim_num; vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; vsi_size_t elementCnt = 1; - if (coordDim != 0 && (width == NULL || area == NULL)) - { - return status; - } - #define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; @@ -305,36 +337,30 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size isBig[0] |= 1; } - if (coordDim == 1) // index shape - { - *width = 0; - *area = 0; - } - else if (coordDim == 2) + if (coordDim == 1 && strides) // index shape { - *width = input_size[dims_num - 2]; - *area = 0; - } - else if (coordDim == 3) - { - *width = input_size[dims_num - 3]; - *area = input_size[dims_num - 3] * input_size[dims_num - 2]; - } - else if (coordDim == 4) - { - *width = input_size[dims_num - 4]; - *area = input_size[dims_num - 4] * input_size[dims_num - 3]; - *vol = input_size[dims_num - 4] * input_size[dims_num - 3] * input_size[dims_num - 2]; + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } } - else if (coordDim == 5) + else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides) { - *width = input_size[dims_num - 5]; - *area = input_size[dims_num - 5] * input_size[dims_num - 4]; - *vol = input_size[dims_num - 5] * input_size[dims_num - 4] * input_size[dims_num - 3]; + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } + + strides[0] = input_size[dims_num - coordDim]; + for (i = 1; i < coordDim - 1; i++) + { + strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i]; + } } + #undef VSI_NN_MAX_IMAGE_WIDTH - return VSI_SUCCESS; + return status; } /* _get_EltOP_tensor_reshape_size */ static vsi_status check_scatter_nd_update_index_repeat @@ -458,7 +484,8 @@ static vsi_status check_scatter_nd_update_index_repeat /* * 
Kernel initializer */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -474,157 +501,68 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL }; - int32_t block_size = 1; - int32_t height = 1; - int32_t index_num = 1; - int32_t width = 0, area = 0, vol = 0; - int32_t coord_dim = 0; - int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; - int32_t src0ZP = 0; - float src0Scale = 1; - int32_t src2ZP = 0; - float src2Scale = 1; - int32_t dstZP = 0; - float dstScale = 1; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + int32_t block_size = 1; + int32_t width = 0; + int32_t height = 0; + + int32_t input0_zp = 0; + float input0_scale = 1.0f; + int32_t output_zp = 0; + float output_scale = 1.0f; + + uint32_t pack_key = 0; + + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &width); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &area); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &vol); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &coord_dim); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - - block_size = (int32_t)(attr[3]->shape->data[0]); - height = (int32_t)(attr[3]->shape->data[1]); - index_num = (int32_t)(attr[1]->shape->data[1]); - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - src2ZP = attr[2]->asymm.zero_point; - src2Scale = attr[2]->asymm.scale; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - src2Scale = (1.0f / ((float) ((int64_t)1 << attr[2]->dfp.fl))); - } - else - { - src2Scale = ((float) ((int64_t)1 << -attr[2]->dfp.fl)); - } - } - - if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + block_size = (int32_t)(attr[0]->shape->data[0]); + height = (int32_t)(attr[0]->shape->data[1]); + width = (int32_t)(block_size * height); + if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16) { - dstZP = attr[3]->asymm.zero_point; - dstScale = attr[3]->asymm.scale; + width = (width + 7) / 8; } - else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP ) + else if (attr[0]->dtype == U8 || attr[0]->dtype == I8) { - if 
(attr[3]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[3]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl)); - } - dstScale = 1.0f/dstScale; + width = (width + 15) / 16; } - if (coord_dim == 5) - { - offset_idx = 1; - } - if (coord_dim == 4 || coord_dim == 5) - { - offsetX = vol; - offsetY = area; - offsetZ = width; - offsetW = 1; - } - else if (coord_dim == 3) - { - offsetX = area; - offsetY = width; - offsetZ = 1; - offsetW = 0; - } - else if (coord_dim == 2) - { - offsetX = width; - offsetY = 1; - offsetZ = 0; - offsetW = 0; - } - else if (coord_dim == 1) - { - offsetX = 1; - offsetY = 0; - offsetZ = 0; - offsetW = 0; - } + input0_zp = attr[0]->asymm.zero_point; + input0_scale = attr[0]->asymm.scale; + output_zp = attr[1]->asymm.zero_point; + output_scale = 1.0f / attr[1]->asymm.scale; - gpu_param.global_scale[0] = 8; + gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) - / gpu_param.global_scale[0], 4); - gpu_param.global_size[1] = height; + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = 1; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype ); + + switch( pack_key ) { - uint16_t M0 = 0; - uint16_t M1 = 0; - int32_t postShift0 = 0; - int32_t postShift1 = 0; - uint32_t multAndoutZP0[2] = {0}; - uint32_t multAndoutZP1[2] = {0}; - gpu_dp_inst_t uniAccumulateSum_2x8 = {{ - 0x55555555, // TCfg - 0x44444444, // ASelt - 0x33221100, 0x77665544, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( U8, U8 ): + { + uint16_t M0 = 0; + int32_t postShift0 = 0; + uint32_t multAndoutZP0[2] = {0}; + + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ 0xdddddddd, // TCfg 0x44444444, // ASelt 0x13121110, 0x17161514, // ABin @@ -633,80 +571,40 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) 0x00002600, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_1_Lo_2x8 = {{ + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ 0xdddddddd, // TCfg 0x44444444, // ASelt - 0x13121110, 0x17161514, // ABin + 0x1b1a1918, 0x1f1e1d1c, // ABin 0x11111111, // BSelt 0x00000000, 0x00000000, // BBin 0x00002600, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ - 0x11111111, // TCfg - 0x01010101, // ASelt - 0x01050004, 0x03070206, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ - 0x11111111, // TCfg - 0x01010101, // 
ASelt - 0x05050404, 0x07070606, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniExtractOddData_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x07050301, 0x07050301, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16}; + }, GPU_DP_TYPE_16 }; - gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); - gpu_quantize_multiplier_16bit( (double)src2Scale / dstScale, &M1, &postShift1); - multAndoutZP0[0] = (uint32_t)(M0); - multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); - multAndoutZP1[0] = (uint32_t)(M1); - multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src2ZP * M1); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 ); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_1_Lo_2x8, postShift1 ); + gpu_quantize_multiplier_16bit( (double)input0_scale * output_scale, &M0, &postShift0); - status = vsi_nn_kernel_gpu_add_param( node, - "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift_1_Lo_2x8", &uniU8MulAndPostShift_1_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); - status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); - status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); - CHECK_STATUS_FAIL_GOTO(status, OnError); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input0_zp * M0); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift0 ); + + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; } +#undef _PACK_SELECT_KEY + OnError: if (attr[0]) { @@ -718,20 +616,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) vsi_nn_kernel_tensor_attr_release( &attr[1] ); attr[1] = NULL; } - if (attr[2]) - { - vsi_nn_kernel_tensor_attr_release( &attr[2] ); - attr[2] = NULL; - } - if (attr[3]) - { - 
vsi_nn_kernel_tensor_attr_release( &attr[3] ); - attr[3] = NULL; - } return status; -} /* _scatter_nd_update_initializer() */ +} /* _scatter_nd_update_special_ref_initializer() */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) +DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -747,19 +635,20 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL }; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; int32_t block_size = 1; - int32_t height = 1; + int32_t update_width = 1; int32_t index_num = 1; int32_t width = 0, area = 0, vol = 0; int32_t coord_dim = 0; int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; - int32_t src0ZP = 0; - float src0Scale = 1; - int32_t src2ZP = 0; - float src2Scale = 1; - int32_t dstZP = 0; - float dstScale = 1; + int32_t input1_zp = 0; + float input1_scale = 1.0f; + int32_t output_zp = 0; + float output_scale = 1.0f; + uint32_t pack_key = 0; + + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -767,73 +656,24 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &width); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &width); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &area); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &area); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &vol); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &vol); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &coord_dim); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim); CHECK_STATUS_FAIL_GOTO(status, OnError ); - block_size = (int32_t)(attr[3]->shape->data[0]); - height = (int32_t)(attr[3]->shape->data[1]); - index_num = (int32_t)(attr[1]->shape->data[1]); - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - src2ZP = attr[2]->asymm.zero_point; - src2Scale = attr[2]->asymm.scale; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - src2Scale = (1.0f / ((float) ((int64_t)1 << attr[2]->dfp.fl))); - } - else - { - src2Scale = ((float) ((int64_t)1 << -attr[2]->dfp.fl)); - } - } + block_size = (int32_t)(attr[2]->shape->data[0]); + 
update_width = (int32_t)(attr[1]->shape->data[0]); + index_num = (int32_t)(attr[0]->shape->data[1]); - if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstZP = attr[3]->asymm.zero_point; - dstScale = attr[3]->asymm.scale; - } - else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[3]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[3]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl)); - } - dstScale = 1.0f / dstScale; - } + input1_zp = attr[1]->asymm.zero_point; + input1_scale = attr[1]->asymm.scale; + output_zp = attr[2]->asymm.zero_point; + output_scale = 1.0f / attr[2]->asymm.scale; if (coord_dim == 5) { @@ -865,35 +705,60 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) offsetZ = 0; } + if (attr[1]->dtype == F16 || attr[1]->dtype == I16 || attr[1]->dtype == U16) + { + update_width = (update_width + 7) / 8; + } + else if (attr[1]->dtype == U8 || attr[1]->dtype == I8) + { + update_width = (update_width + 15) / 16; + } + + if (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == U16) + { + block_size = (block_size + 7) / 8; + } + else if (attr[2]->dtype == U8 || attr[2]->dtype == I8) + { + block_size = (block_size + 15) / 16; + } + gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; gpu_param.global_size[0] = block_size; - gpu_param.global_size[1] = height; + gpu_param.global_size[1] = index_num; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); { - uint16_t M0 = 0; - uint16_t M1 = 0; - int32_t postShift0 = 0; - int32_t postShift1 = 0; - uint32_t multAndoutZP0[2] = {0}; - uint32_t multAndoutZP1[2] = {0}; - gpu_dp_inst_t uniAccumulateSum_2x8 = {{ - 0x55555555, // TCfg - 0x44444444, // ASelt - 0x33221100, 0x77665544, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); + status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[1]->dtype, attr[2]->dtype ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( U8, U8 ): + { + uint16_t M1 = 0; + int32_t postShift1 = 0; + uint32_t multAndoutZP1[2] = {0}; + + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ 0xdddddddd, // TCfg 0x44444444, // ASelt 0x13121110, 0x17161514, // ABin @@ -902,48 +767,38 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) 0x00002600, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_1_Lo_2x8 = {{ + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniU8MulAndPostShift_Hi_2x8 = {{ 0xdddddddd, // TCfg 0x44444444, // ASelt - 0x13121110, 0x17161514, // ABin + 0x1b1a1918, 0x1f1e1d1c, // ABin 0x11111111, // BSelt 0x00000000, 0x00000000, // BBin 0x00002600, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; + }, GPU_DP_TYPE_16 }; - gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); - gpu_quantize_multiplier_16bit( (double)src2Scale / dstScale, &M1, &postShift1); - multAndoutZP0[0] = (uint32_t)(M0); - multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); - multAndoutZP1[0] = (uint32_t)(M1); - multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src2ZP * M1); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 ); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_1_Lo_2x8, postShift1 ); + gpu_quantize_multiplier_16bit( (double)input1_scale * output_scale, &M1, &postShift1); - status = vsi_nn_kernel_gpu_add_param( node, - "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); - if (attr[3]->quant != VSI_NN_KERNEL_QUANT_NONE) - { + multAndoutZP1[0] = (uint32_t)(M1); + multAndoutZP1[1] = (uint32_t)((output_zp << postShift1) - input1_zp * M1); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 ); + + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift_1_Lo_2x8", &uniU8MulAndPostShift_1_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); - status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); } - status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num ); - status |= vsi_nn_kernel_gpu_add_param( node, "update_width", &block_size ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); - status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); - CHECK_STATUS_FAIL_GOTO(status, OnError); + break; + default: + break; } +#undef _PACK_SELECT_KEY OnError: if (attr[0]) @@ -961,15 +816,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) vsi_nn_kernel_tensor_attr_release( &attr[2] ); attr[2] = NULL; } - if (attr[3]) - { - vsi_nn_kernel_tensor_attr_release( &attr[3] ); - attr[3] = NULL; - } return status; -} /* _scatter_nd_update_big_initializer() */ +} /* _scatter_nd_update_special_update_initializer() */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_pre_initializer) +DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_copy_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -985,140 +835,50 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_pre_initializer) {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL 
}; int32_t block_size = 1; - int32_t update_width = 1; - int32_t index_num = 1; - int32_t width = 0, area = 0, vol = 0; - int32_t coord_dim = 0; - int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; - int32_t src0ZP = 0; - float src0Scale = 1; + int32_t width = 0; + int32_t height = 0; + + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &width); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &area); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &vol); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - block_size = (int32_t)(attr[2]->shape->data[0]); - update_width = (int32_t)(attr[1]->shape->data[0]); - index_num = (int32_t)(attr[0]->shape->data[1]); - - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - src0ZP = attr[1]->asymm.zero_point; - src0Scale = attr[1]->asymm.scale; - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); - } - } + block_size = (int32_t)(attr[0]->shape->data[0]); + height = (int32_t)(attr[0]->shape->data[1]); + width = (int32_t)(block_size * height); - if (coord_dim == 5) - { - offset_idx = 1; - } - if (coord_dim == 4 || coord_dim == 5) - { - offsetX = vol; - offsetY = area; - offsetZ = width; - offsetW = 1; - } - else if (coord_dim == 3) - { - offsetX = area; - offsetY = width; - offsetZ = 1; - } - else if (coord_dim == 2) + if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16) { - offsetX = width; - offsetY = 1; - offsetZ = 0; + width = (width + 7) / 8; } - else if (coord_dim == 1) + else if (attr[0]->dtype == U8 || attr[0]->dtype == I8) { - offsetX = 1; - offsetY = 0; - offsetZ = 0; + width = (width + 15) / 16; } gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = block_size; - gpu_param.global_size[1] = index_num; + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = 1; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); - { - gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - status = vsi_nn_kernel_gpu_add_param( node, - "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, "update_width", 
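/*
 * Sketch (assumed semantics, helper name hypothetical): the special copy kernel
 * walks ref/output as a flat buffer, one vector per work item, so the global work
 * size is the element count divided by the lane count of the dtype: 8 lanes for
 * 16-bit types (F16/I16/U16), 16 lanes for 8-bit types (U8/I8), rounded up.
 */
static size_t copy_work_items(size_t element_count, size_t bytes_per_element)
{
    size_t lanes = (bytes_per_element == 1) ? 16 : 8;     /* 8-bit vs 16-bit packing   */
    return (element_count + lanes - 1) / lanes;           /* round up to whole vectors */
}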
&update_width ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); - status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &src0ZP ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &src0Scale ); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - OnError: if (attr[0]) { vsi_nn_kernel_tensor_attr_release( &attr[0] ); attr[0] = NULL; } - if (attr[1]) - { - vsi_nn_kernel_tensor_attr_release( &attr[1] ); - attr[1] = NULL; - } - if (attr[2]) - { - vsi_nn_kernel_tensor_attr_release( &attr[2] ); - attr[2] = NULL; - } return status; -} /* _scatter_nd_update_pre_initializer() */ +} /* _scatter_nd_update_special_copy_initializer() */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer) +DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -1127,132 +887,56 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer) { vsi_status status = VSI_FAILURE; gpu_param_t gpu_param = { - 3, + 1, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - int32_t block_size = 1; - int32_t height = 1; - int32_t width = 0, area = 0, vol = 0; - int32_t coord_dim = 0; - int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; - int32_t src0ZP = 0; - float src0Scale = 1; - float src2Scale = 1; - int32_t dstZP = 0; - float dstScale = 1; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + int32_t width = 0; + int32_t element_size = 1; + int32_t input_zp0 = 0; + float input_scale0 = 1; + int32_t output_zp = 0; + float output_scale = 1; + int32_t i = 0; - attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); // ref + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); // update + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[5] ); // output - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &width); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &area); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &vol); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &coord_dim); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - - block_size = (int32_t)(attr[2]->shape->data[0]); - height = (int32_t)(attr[2]->shape->data[1]); - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) { - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - } - else if ( attr[0]->quant == 
VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } + element_size *= (int32_t)attr[0]->shape->data[i]; } + width = element_size / 8; + + input_zp0 = attr[0]->asymm.zero_point; + input_scale0 = attr[0]->asymm.scale; + output_zp = attr[1]->asymm.zero_point; + output_scale = attr[1]->asymm.scale; - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) { - src2Scale = attr[1]->asymm.scale; + input_scale0 = 1.0f; } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) { - if (attr[1]->dfp.fl > 0) - { - src2Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); - } - else - { - src2Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); - } - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstZP = attr[2]->asymm.zero_point; - dstScale = attr[2]->asymm.scale; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - dstScale = 1.0f / dstScale; - dstZP = 0; - } - - if (coord_dim == 5) - { - offset_idx = 1; - } - if (coord_dim == 4 || coord_dim == 5) - { - offsetX = vol; - offsetY = area; - offsetZ = width; - offsetW = 1; - } - else if (coord_dim == 3) - { - offsetX = area; - offsetY = width; - offsetZ = 1; - } - else if (coord_dim == 2) - { - offsetX = width; - offsetY = 1; - offsetZ = 0; - } - else if (coord_dim == 1) - { - offsetX = 1; - offsetY = 0; - offsetZ = 0; + output_scale = 1.0f; } gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = block_size; - gpu_param.global_size[1] = height; + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = 1; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -1272,38 +956,15 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - float output_zp = (float)dstZP; - float scaleInOut = src2Scale / dstScale; - gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); + gpu_quantize_multiplier_16bit( (double)input_scale0 / output_scale, &M0, &postShift0); multAndoutZP0[0] = (uint32_t)(M0); - multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); + multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input_zp0 * M0); gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 ); status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); - status |= vsi_nn_kernel_gpu_add_param( node, 
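/*
 * Sketch (assumptions noted, helper name hypothetical): the reset kernel flattens
 * the whole tensor and lets each work item handle 8 elements; gpu_align_p2(x, 4)
 * is assumed to round the resulting work-item count up to a multiple of 4.
 */
static size_t reset_global_size(const size_t *dims, size_t rank)
{
    size_t element_count = 1;
    size_t i;
    for (i = 0; i < rank; i++)
    {
        element_count *= dims[i];
    }
    /* 8 elements per work item, then align the count up to a multiple of 4 */
    return ((element_count / 8) + 3) & ~(size_t)3;
}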
"offsetX", &offsetX ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); - status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &src2Scale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp ); - status |= vsi_nn_kernel_gpu_add_param( node, "scaleInOut", &scaleInOut ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); CHECK_STATUS_FAIL_GOTO(status, OnError); } @@ -1318,15 +979,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer) vsi_nn_kernel_tensor_attr_release( &attr[1] ); attr[1] = NULL; } - if (attr[2]) - { - vsi_nn_kernel_tensor_attr_release( &attr[2] ); - attr[2] = NULL; - } return status; -} /* _scatter_nd_update_post_initializer() */ +} /* _scatter_nd_update_reset_initializer() */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer) +DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -1335,168 +991,137 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer) { vsi_status status = VSI_FAILURE; gpu_param_t gpu_param = { - 3, + 2, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; int32_t block_size = 1; + int32_t update_width = 1; + int32_t index_num = 1; int32_t width = 0; - int32_t height = 0; - int32_t count_width = 0; - - attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - - block_size = (int32_t)(attr[0]->shape->data[0]); - height = (int32_t)(attr[0]->shape->data[1]); - width = (int32_t)(block_size * height); - count_width = (int32_t)((height + 3) / 4); - - gpu_param.global_scale[0] = 1; - gpu_param.global_scale[1] = 1; - gpu_param.global_scale[2] = 1; - - gpu_param.global_size[0] = (width + 3) / 4; - gpu_param.global_size[1] = 1; - gpu_param.global_size[2] = 1; - - status = vsi_nn_kernel_gpu_config( node, &gpu_param ); - CHECK_STATUS_FAIL_GOTO(status, OnError); - - status = vsi_nn_kernel_gpu_add_param( node, "count_width", &count_width ); - CHECK_STATUS_FAIL_GOTO(status, OnError); - -OnError: - if (attr[0]) - { - vsi_nn_kernel_tensor_attr_release( &attr[0] ); - attr[0] = NULL; - } - return status; -} /* _scatter_nd_update_reset_initializer() */ - -DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - gpu_param_t gpu_param = { - 3, - {0, 0, 0}, - {0, 0, 0}, - {0, 0, 0}, - {0, 0, 0} - }; - - vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - int32_t block_size = 1; - int32_t width = 0; - int32_t height = 0; - - int32_t input0_zp = 0; - float input0_scale = 1.0f; - int32_t output_zp = 0; - float output_scale = 1.0f; + int32_t coord_dim = 0; + int32_t strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t coord_strides[8] = {0}; + int32_t *coord_strides1 = coord_strides + 4; + int32_t input2_zp = 0; + int32_t i = 0; - uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", 
OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - block_size = (int32_t)(attr[0]->shape->data[0]); - height = (int32_t)(attr[0]->shape->data[1]); - width = (int32_t)(block_size * height); - if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16) + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &strides[0]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &strides[1]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &strides[2]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &strides[3]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &strides[4]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &strides[5]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &strides[6]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &coord_dim); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + block_size = (int32_t)(attr[2]->shape->data[0]); + update_width = (int32_t)(attr[1]->shape->data[0]); + index_num = (int32_t)(attr[0]->shape->data[1]); + width = block_size; + if (block_size % 4 == 0) { - width = (width + 7) / 8; + update_width = update_width / 4; + width = block_size / 4; } - else if (attr[0]->dtype == U8 || attr[0]->dtype == I8) + + input2_zp = attr[1]->asymm.zero_point; + + coord_strides[coord_dim - 1] = 1; + for (i = 0; i < coord_dim - 1; i++) { - width = (width + 15) / 16; + coord_strides[i] = strides[coord_dim - 2 - i]; } - input0_zp = attr[0]->asymm.zero_point; - input0_scale = attr[0]->asymm.scale; - output_zp = attr[1]->asymm.zero_point; - output_scale = 1.0f / attr[1]->asymm.scale; - gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; gpu_param.global_size[0] = width; - gpu_param.global_size[1] = 1; + gpu_param.global_size[1] = index_num; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); -#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ - (IN0_TYPE | ( OUT_TYPE << 16)) - - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype ); - - switch( pack_key ) { - case _PACK_SELECT_KEY( I8, I8 ): - case _PACK_SELECT_KEY( U8, U8 ): - { - uint16_t M0 = 0; - int32_t postShift0 = 0; - uint32_t multAndoutZP0[2] = {0}; - - gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ - 0xdddddddd, // TCfg - 0x44444444, // ASelt - 0x13121110, 0x17161514, // ABin - 0x11111111, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ - 0xdddddddd, // TCfg - 0x44444444, // ASelt - 0x1b1a1918, 0x1f1e1d1c, // ABin - 0x11111111, // BSelt + gpu_dp_inst_t 
uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; - gpu_quantize_multiplier_16bit( (double)input0_scale * output_scale, &M0, &postShift0); + gpu_dp_inst_t uniConvertFp16ToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; - multAndoutZP0[0] = (uint32_t)(M0); - multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input0_zp * M0); + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 ); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift0 ); + status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride", &coord_strides ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride1", &coord_strides1 ); + CHECK_STATUS_FAIL_GOTO(status, OnError); - status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); + if (attr[1]->dtype == U8 || attr[1]->dtype == I8 || attr[1]->dtype == I16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &input2_zp ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[1]->dtype == F16 || attr[1]->dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertFp16ToFp32_4x4", &uniConvertFp16ToFp32_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - break; - default: - break; } -#undef _PACK_SELECT_KEY - OnError: if (attr[0]) { @@ -1508,10 +1133,15 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer) vsi_nn_kernel_tensor_attr_release( &attr[1] ); attr[1] = NULL; } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } return status; -} /* _scatter_nd_update_ref_initializer() */ +} /* _scatter_nd_update_update_initializer() */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer) +DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ 
-1531,164 +1161,127 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer) int32_t block_size = 1; int32_t update_width = 1; int32_t index_num = 1; - int32_t width = 0, area = 0, vol = 0; + int32_t width = 0; int32_t coord_dim = 0; - int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; - int32_t input1_zp = 0; - float input1_scale = 1.0f; - int32_t output_zp = 0; + int32_t strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t coord_strides[8] = {0}; + int32_t *coord_strides1 = coord_strides + 4; + float output_zp = 0; + float input_scale = 1.0f; float output_scale = 1.0f; - uint32_t pack_key = 0; + float inout_scale = 1.0f; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &width); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &strides[0]); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &area); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &strides[1]); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &vol); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &strides[2]); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &strides[3]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &strides[4]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &strides[5]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &strides[6]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &coord_dim); CHECK_STATUS_FAIL_GOTO(status, OnError ); block_size = (int32_t)(attr[2]->shape->data[0]); update_width = (int32_t)(attr[1]->shape->data[0]); index_num = (int32_t)(attr[0]->shape->data[1]); - input1_zp = attr[1]->asymm.zero_point; - input1_scale = attr[1]->asymm.scale; - output_zp = attr[2]->asymm.zero_point; - output_scale = 1.0f / attr[2]->asymm.scale; - - if (coord_dim == 5) - { - offset_idx = 1; - } - if (coord_dim == 4 || coord_dim == 5) - { - offsetX = vol; - offsetY = area; - offsetZ = width; - offsetW = 1; - } - else if (coord_dim == 3) - { - offsetX = area; - offsetY = width; - offsetZ = 1; - } - else if (coord_dim == 2) + input_scale = attr[1]->asymm.scale; + output_scale = attr[2]->asymm.scale; + output_zp = (float)attr[2]->asymm.zero_point; + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) { - offsetX = width; - offsetY = 1; - offsetZ = 0; + input_scale = 1.0f; } - else if (coord_dim == 1) + 
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE) { - offsetX = 1; - offsetY = 0; - offsetZ = 0; + output_scale = 1.0f; } + inout_scale = input_scale / output_scale; - if (attr[1]->dtype == F16 || attr[1]->dtype == I16 || attr[1]->dtype == U16) + coord_strides[coord_dim - 1] = 1; + for (i = 0; i < coord_dim - 1; i++) { - update_width = (update_width + 7) / 8; - } - else if (attr[1]->dtype == U8 || attr[1]->dtype == I8) - { - update_width = (update_width + 15) / 16; + coord_strides[i] = strides[coord_dim - 2 - i]; } - if (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == U16) - { - block_size = (block_size + 7) / 8; - } - else if (attr[2]->dtype == U8 || attr[2]->dtype == I8) + width = block_size; + if (block_size % 4 == 0) { - block_size = (block_size + 15) / 16; + width = block_size / 4; } gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = block_size; + gpu_param.global_size[0] = width; gpu_param.global_size[1] = index_num; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); - CHECK_STATUS_FAIL_GOTO(status, OnError); - - { - status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); - status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } -#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ - (IN0_TYPE | ( OUT_TYPE << 16)) - - pack_key = _PACK_SELECT_KEY( attr[1]->dtype, attr[2]->dtype ); - - switch( pack_key ) - { - case _PACK_SELECT_KEY( I8, I8 ): - case _PACK_SELECT_KEY( U8, U8 ): - { - uint16_t M1 = 0; - int32_t postShift1 = 0; - uint32_t multAndoutZP1[2] = {0}; - - gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ - 0xdddddddd, // TCfg - 0x44444444, // ASelt - 0x13121110, 0x17161514, // ABin - 0x11111111, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ - 0xdddddddd, // TCfg - 0x44444444, // ASelt - 0x1b1a1918, 0x1f1e1d1c, // ABin - 0x11111111, // BSelt + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift + 0x00002400, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_quantize_multiplier_16bit( (double)input1_scale * output_scale, &M1, &postShift1); - - multAndoutZP1[0] = (uint32_t)(M1); - multAndoutZP1[1] = (uint32_t)((output_zp << postShift1) - input1_zp * M1); + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 
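/*
 * Sketch (restating the coord_stride setup above, helper name hypothetical): the
 * per-axis strides read from the scalar parameters are reversed so coord_strides[]
 * lines up with the coordinate order stored in the index tensor (outermost axis
 * first, innermost stride of 1). On the GPU side each index row is then assumed to
 * be turned into a flat element offset by a dot product, while
 * inout_scale = input_scale / output_scale folds both quantization scales into a
 * single multiplier for the requantizing store.
 */
static int32_t flat_offset_from_coords(const int32_t *coords, const int32_t *coord_strides, int32_t coord_dim)
{
    int32_t offset = 0;
    int32_t i;
    for (i = 0; i < coord_dim; i++)
    {
        offset += coords[i] * coord_strides[i];   /* coords[0] is the outermost axis */
    }
    return offset;
}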
0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 ); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 ); + status = vsi_nn_kernel_gpu_add_param( node, "output_stride", &width ); + status |= vsi_nn_kernel_gpu_add_param( node, "ref_stride", &update_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride", &coord_strides ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride1", &coord_strides1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp ); + status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inout_scale ); + CHECK_STATUS_FAIL_GOTO(status, OnError); - status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); - CHECK_STATUS_FAIL_GOTO(status, OnError ); + if (attr[1]->dtype == U8 || attr[1]->dtype == I8 || attr[1]->dtype == I16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + else if (attr[1]->dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError); } - break; - default: - break; } -#undef _PACK_SELECT_KEY OnError: if (attr[0]) @@ -1707,7 +1300,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer) attr[2] = NULL; } return status; -} /* _scatter_nd_update_update_initializer() */ +} /* _scatter_nd_update_ref_initializer() */ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer) ( @@ -1718,7 +1311,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer) { vsi_status status = VSI_FAILURE; gpu_param_t gpu_param = { - 3, + 1, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, @@ -1726,31 +1319,27 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - int32_t block_size = 1; - int32_t width = 0; - int32_t height = 0; + int32_t width = 0; + int32_t element_size = 1; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - block_size = (int32_t)(attr[0]->shape->data[0]); - height = (int32_t)(attr[0]->shape->data[1]); - width = (int32_t)(block_size * height); - - if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16) - { - width = (width + 7) / 8; - } - else if (attr[0]->dtype == U8 || attr[0]->dtype == I8) + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) { - width = (width + 15) / 16; + element_size *= (int32_t)attr[0]->shape->data[i]; } + width = element_size / 8; gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = width; + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = 1; gpu_param.global_size[2] = 1; @@ -1766,166 +1355,151 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer) return status; } /* _scatter_nd_update_copy_initializer() */ -/* - * Query kernel - */ static vsi_status _query_kernel ( vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* 
const* const outputs, - vsi_nn_kernel_t* kernel, - int32_t coord_dim, - int32_t isBig + vsi_nn_kernel_t* kernel_reset, + vsi_nn_kernel_t* kernel_update, + vsi_nn_kernel_t* kernel_ref, + vsi_nn_kernel_t* kernel_copy, + int32_t coord_flg, + int32_t opt_flg ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; vsi_nn_kernel_dtype_e input0_dtype = U8; - vsi_nn_kernel_dtype_e input2_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_dtype_e acc_dtype = I32; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0, isBig ); + if (input2_dtype == F16) + { + acc_dtype = F32; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, 0, output_dtype, 0, 0, 0); - for ( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reset_map); i ++ ) { - if ( scatter_nd_update_map[i].key == key ) + if ( scatter_nd_update_reset_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_map) ) + + if ( i < _cnt_of_array(scatter_nd_update_reset_map) ) { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_map[i].function_name ); - kernel->info.parameters = _scatter_nd_update_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _scatter_nd_update_kernel_param_def ); - if (isBig) - { - kernel->info.initialize = _scatter_nd_update_big_initializer; - } - else - { - kernel->info.initialize = _scatter_nd_update_initializer; - } + snprintf( kernel_reset->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reset_map[i].function_name ); + kernel_reset->info.parameters = _scatter_nd_update_reset_kernel_param_def; + kernel_reset->info.numParams = _SCATTER_ND_UPDATE_RESET_PARAM_NUM; + kernel_reset->info.initialize = _scatter_nd_update_reset_initializer; - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_map[i].source_name ); - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_map[i].source_name ); - status = VSI_SUCCESS; + scatter_nd_update_reset_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reset_map[i].source_name ); + } + else + { + status = VSI_FAILURE; } - return status; -} /* _query_kernel() */ - -static vsi_status _query_kernel_large - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel_reset, - vsi_nn_kernel_t* kernel_pre, - vsi_nn_kernel_t* kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_dtype_e input0_dtype = U8; - vsi_nn_kernel_dtype_e input2_dtype = F16; - vsi_nn_kernel_dtype_e output_dtype = U8; - uint32_t key = 0; - int i = 0; - - input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); - output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, I32, I32, 1, 1 ); + key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, coord_flg, opt_flg); - for ( i = 0; i < 
_cnt_of_array(scatter_nd_update_pre_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_update_map); i ++ ) { - if ( scatter_nd_update_pre_map[i].key == key ) + if ( scatter_nd_update_update_map[i].key == key ) { break; } } - - if ( i < _cnt_of_array(scatter_nd_update_pre_map) ) + if ( i < _cnt_of_array(scatter_nd_update_update_map) ) { - snprintf( kernel_pre->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_pre_map[i].function_name ); - kernel_pre->info.parameters = _scatter_nd_update_pre_kernel_param_def; - kernel_pre->info.numParams = _SCATTER_ND_UPDATE_PRE_PARAM_NUM; - kernel_pre->info.initialize = _scatter_nd_update_pre_initializer; + snprintf( kernel_update->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_update_map[i].function_name ); + kernel_update->info.parameters = _scatter_nd_update_update_kernel_param_def; + kernel_update->info.numParams = _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM; + kernel_update->info.initialize = _scatter_nd_update_update_initializer; - vsi_nn_kernel_add_source( kernel_pre, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_pre_map[i].source_name ); - vsi_nn_kernel_add_source( kernel_pre, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_pre_map[i].source_name ); + scatter_nd_update_update_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_update_map[i].source_name ); } else { - status = VSI_FAILURE; + status |= VSI_FAILURE; } + key = HASH_SCATTER_ND_UPDATE_KEY( 0, acc_dtype, output_dtype, 2, coord_flg, opt_flg); - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0, 1 ); - - for ( i = 0; i < _cnt_of_array(scatter_nd_update_post_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_ref_map); i ++ ) { - if ( scatter_nd_update_post_map[i].key == key ) + if ( scatter_nd_update_ref_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_post_map) ) + + if ( i < _cnt_of_array(scatter_nd_update_ref_map) ) { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_post_map[i].function_name ); - kernel->info.parameters = _scatter_nd_update_post_kernel_param_def; - kernel->info.numParams = _SCATTER_ND_UPDATE_POST_PARAM_NUM; - kernel->info.initialize = _scatter_nd_update_post_initializer; + snprintf( kernel_ref->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_ref_map[i].function_name ); + kernel_ref->info.parameters = _scatter_nd_update_ref_kernel_param_def; + kernel_ref->info.numParams = _SCATTER_ND_UPDATE_REF_PARAM_NUM; + kernel_ref->info.initialize = _scatter_nd_update_ref_initializer; - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_post_map[i].source_name ); - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_post_map[i].source_name ); + scatter_nd_update_ref_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_ref_map[i].source_name ); } else { - status |= VSI_FAILURE; + status = VSI_FAILURE; } - key = HASH_SCATTER_ND_UPDATE_KEY( I32, I32, I32, 2, 1 ); + key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 3, 0, 0); - for ( i = 0; i < _cnt_of_array(scatter_nd_update_reset_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_copy_map); i ++ 
) { - if ( scatter_nd_update_reset_map[i].key == key ) + if ( scatter_nd_update_copy_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_reset_map) ) + if ( i < _cnt_of_array(scatter_nd_update_copy_map) ) { - snprintf( kernel_reset->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_reset_map[i].function_name ); - kernel_reset->info.parameters = _scatter_nd_update_reset_kernel_param_def; - kernel_reset->info.numParams = _SCATTER_ND_UPDATE_RESET_PARAM_NUM; - kernel_reset->info.initialize = _scatter_nd_update_reset_initializer; + snprintf( kernel_copy->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_copy_map[i].function_name ); + kernel_copy->info.parameters = _scatter_nd_update_copy_kernel_param_def; + kernel_copy->info.numParams = _SCATTER_ND_UPDATE_COPY_PARAM_NUM; + kernel_copy->info.initialize = _scatter_nd_update_copy_initializer; - vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + vsi_nn_kernel_add_source( kernel_copy, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_reset_map[i].source_name ); - vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_reset_map[i].source_name ); + scatter_nd_update_copy_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_copy, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_copy_map[i].source_name ); } else { status |= VSI_FAILURE; } + return status; -} /* _query_kernel_large() */ +} /* _query_kernel() */ static vsi_status _query_kernel_special ( @@ -1941,34 +1515,35 @@ static vsi_status _query_kernel_special vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 3, 1 ); + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 4, 1, 0); - for ( i = 0; i < _cnt_of_array(scatter_nd_update_ref_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_ref_map); i ++ ) { - if ( scatter_nd_update_ref_map[i].key == key ) + if ( scatter_nd_update_special_ref_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_ref_map) ) + if ( i < _cnt_of_array(scatter_nd_update_special_ref_map) ) { - snprintf( kernel_ref->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_ref_map[i].function_name ); - kernel_ref->info.parameters = _scatter_nd_update_ref_kernel_param_def; - kernel_ref->info.numParams = _SCATTER_ND_UPDATE_REF_PARAM_NUM; - kernel_ref->info.initialize = _scatter_nd_update_ref_initializer; + snprintf( kernel_ref->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_special_ref_map[i].function_name ); + kernel_ref->info.parameters = _scatter_nd_update_special_ref_kernel_param_def; + kernel_ref->info.numParams = _SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM; + kernel_ref->info.initialize = _scatter_nd_update_special_ref_initializer; vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_ref_map[i].source_name ); + scatter_nd_update_special_ref_map[i].source_name ); vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_ref_map[i].source_name ); + scatter_nd_update_special_ref_map[i].source_name ); } else 
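/*
 * Sketch of the selection pattern used by _query_kernel()/_query_kernel_special():
 * a packed key built from dtypes and flags is matched against a table of
 * {key, function_name, source_name} entries, and the first hit supplies the kernel
 * name, parameter list, initializer and program sources. The struct and helper
 * below are illustrative stand-ins, not the real definitions from this file.
 */
typedef struct
{
    uint32_t     key;
    const char * function_name;
    const char * source_name;
} kernel_map_entry_t;

static const kernel_map_entry_t * lookup_kernel_entry
    (
    const kernel_map_entry_t * map,
    size_t                     count,
    uint32_t                   key
    )
{
    size_t i;
    for (i = 0; i < count; i++)
    {
        if (map[i].key == key)
        {
            return &map[i];    /* first matching key wins */
        }
    }
    return NULL;               /* caller reports VSI_FAILURE */
}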
{ @@ -1976,54 +1551,56 @@ static vsi_status _query_kernel_special } - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 4, 1 ); + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 5, 1, 0); - for ( i = 0; i < _cnt_of_array(scatter_nd_update_update_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_update_map); i ++ ) { - if ( scatter_nd_update_update_map[i].key == key ) + if ( scatter_nd_update_special_update_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_update_map) ) + if ( i < _cnt_of_array(scatter_nd_update_special_update_map) ) { - snprintf( kernel_update->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_update_map[i].function_name ); - kernel_update->info.parameters = _scatter_nd_update_update_kernel_param_def; - kernel_update->info.numParams = _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM; - kernel_update->info.initialize = _scatter_nd_update_update_initializer; + snprintf( kernel_update->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_special_update_map[i].function_name ); + kernel_update->info.parameters = _scatter_nd_update_special_update_kernel_param_def; + kernel_update->info.numParams = _SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM; + kernel_update->info.initialize = _scatter_nd_update_special_update_initializer; vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_update_map[i].source_name ); + scatter_nd_update_special_update_map[i].source_name ); vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_update_map[i].source_name ); + scatter_nd_update_special_update_map[i].source_name ); } else { status |= VSI_FAILURE; } - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 5, 1 ); + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0); - for ( i = 0; i < _cnt_of_array(scatter_nd_update_copy_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ ) { - if ( scatter_nd_update_copy_map[i].key == key ) + if ( scatter_nd_update_special_copy_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_copy_map) ) + if ( i < _cnt_of_array(scatter_nd_update_special_copy_map) ) { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_copy_map[i].function_name ); - kernel->info.parameters = _scatter_nd_update_copy_kernel_param_def; - kernel->info.numParams = _SCATTER_ND_UPDATE_COPY_PARAM_NUM; - kernel->info.initialize = _scatter_nd_update_copy_initializer; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_special_copy_map[i].function_name ); + kernel->info.parameters = _scatter_nd_update_special_copy_kernel_param_def; + kernel->info.numParams = _SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM; + kernel->info.initialize = _scatter_nd_update_special_copy_initializer; vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_copy_map[i].source_name ); + scatter_nd_update_special_copy_map[i].source_name ); vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_copy_map[i].source_name ); + scatter_nd_update_special_copy_map[i].source_name ); } else { @@ -2044,41 +1621,37 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t tmp_params[_SCATTER_ND_UPDATE_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; 
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0}; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" ); - vsi_size_t *input_size = inputs[2]->attr.size; - uint32_t dims_num = inputs[2]->attr.dim_num; int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; - vsi_size_t width = 0, area = 0, vol = 0; int32_t big_flg = 0; vsi_nn_kernel_dtype_e update_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type); vsi_nn_kernel_dtype_e ref_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); vsi_nn_kernel_dtype_e output_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type); int32_t type_flg = ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16) && (update_dtype == ref_dtype && update_dtype == output_dtype)) ? 1 : 0; - int32_t special_flg = (block_size % 16 == 0 && type_flg) ? 1 : 0; + int32_t special_flg = (block_size % 16 == 0 && type_flg && coord_dim <= 4) ? 1 : 0; + int32_t coord_flg = 0; + int32_t opt_flg = (block_size % 4 == 0) ? 1 : 0; int32_t i = 0; int32_t isRepeat = 0; + vsi_nn_tensor_t * tensors[4] = { NULL }; + vsi_nn_kernel_t * ikernels[3] = { NULL }; - if (coord_dim > 4 && input_size[dims_num - 1] > 1) - { - return NULL; - } + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); status = get_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0, - NULL, NULL, NULL, &rs_idx_dim, &big_flg); + NULL, &rs_idx_dim, &big_flg); status |= get_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0, - NULL, NULL, NULL, &rs_in_dim, &big_flg); + NULL, &rs_in_dim, &big_flg); status |= get_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, - &width, &area, &vol, &rs_out_dim, &big_flg); - if (status != VSI_SUCCESS) - { - return NULL; - } + strides, &rs_out_dim, &big_flg); + CHECK_STATUS_FAIL_GOTO( status, final ); check_scatter_nd_update_index_repeat(inputs, coord_dim, block_size, idx_num, &isRepeat); @@ -2087,11 +1660,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_attr_t attr; vsi_nn_kernel_node_t tmp_node = NULL; vsi_nn_kernel_node_t ref_node = NULL; - vsi_nn_kernel_node_param_t ref_params[_SCATTER_ND_UPDATE_REF_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_UPDATE_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_param_t cpy_params[_SCATTER_ND_UPDATE_COPY_PARAM_NUM] = { NULL }; - vsi_nn_kernel_t * ikernels[2] = { NULL }; - vsi_nn_tensor_t * tensors[3] = { NULL }; + vsi_nn_kernel_node_param_t ref_params[_SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t cpy_params[_SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM] = { NULL }; ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); ikernels[0]->unique_id = kernel->unique_id; @@ -2127,7 +1698,8 @@ static vsi_nn_kernel_node_t _setup ref_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; - status = vsi_nn_kernel_node_pass_param( ref_node, ref_params, _SCATTER_ND_UPDATE_REF_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( ref_node, ref_params, + _SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM ); 
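/*
 * Sketch (restating the dispatch conditions in _setup(), helper name hypothetical):
 * the three-kernel "special" path requires matching 8/16-bit quantized dtypes on
 * ref/update/output, a block size that is a multiple of 16, and at most 4 index
 * coordinates; otherwise the general reset/update/ref/copy chain is used, with the
 * vectorized variant whenever the block size is a multiple of 4.
 */
static void scatter_nd_update_choose_path
    (
    vsi_nn_kernel_dtype_e ref_dtype,
    vsi_nn_kernel_dtype_e update_dtype,
    vsi_nn_kernel_dtype_e output_dtype,
    int32_t               block_size,
    int32_t               coord_dim,
    int32_t *             special_flg,
    int32_t *             opt_flg
    )
{
    int32_t type_flg = ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16) &&
                        update_dtype == ref_dtype && update_dtype == output_dtype) ? 1 : 0;

    *special_flg = (block_size % 16 == 0 && type_flg && coord_dim <= 4) ? 1 : 0;
    *opt_flg     = (block_size % 4 == 0) ? 1 : 0;
}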
CHECK_STATUS(status); vsi_nn_kernel_tensor_release( &ref_params[0] ); } @@ -2143,11 +1715,12 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); - status = vsi_nn_kernel_node_pass_param( tmp_node, node_params, _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( tmp_node, node_params, + _SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_tensor_release( &node_params[0] ); vsi_nn_kernel_tensor_release( &node_params[1] ); @@ -2166,7 +1739,7 @@ static vsi_nn_kernel_node_t _setup cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; cpy_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); - status = vsi_nn_kernel_node_pass_param( node, cpy_params, _SCATTER_ND_UPDATE_COPY_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( node, cpy_params, _SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_tensor_release( &cpy_params[2] ); } @@ -2195,106 +1768,159 @@ static vsi_nn_kernel_node_t _setup if (ref_node) {vsi_nn_kernel_node_release( &ref_node );} if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} } - else if ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16)) + else { vsi_nn_tensor_attr_t attr; - vsi_nn_kernel_node_t tmp_node = NULL; vsi_nn_kernel_node_t reset_node = NULL; - vsi_nn_kernel_node_param_t pre_params[_SCATTER_ND_UPDATE_PRE_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_POST_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t update_node = NULL; + vsi_nn_kernel_node_t ref_node = NULL; vsi_nn_kernel_node_param_t reset_params[_SCATTER_ND_UPDATE_RESET_PARAM_NUM] = { NULL }; - vsi_nn_kernel_t * ikernels[2] = { NULL }; - vsi_nn_tensor_t * tensors[3] = { NULL }; + vsi_nn_kernel_node_param_t ref_params[_SCATTER_ND_UPDATE_REF_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t update_params[_SCATTER_ND_UPDATE_UPDATE_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t cpy_params[_SCATTER_ND_UPDATE_COPY_PARAM_NUM] = { NULL }; + int32_t width = 1; + int32_t res = 0; ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); ikernels[0]->unique_id = kernel->unique_id; ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); ikernels[1]->unique_id = kernel->unique_id; + ikernels[2] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + ikernels[2]->unique_id = kernel->unique_id; memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); - attr.dtype.vx_type = VSI_NN_TYPE_INT32; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype = outputs[0]->attr.dtype; attr.is_const = FALSE; attr.vtl = TRUE; for (i = 0; i < rs_out_dim; i++) { attr.size[i] = shapes[2][i]; + width *= (int32_t)shapes[2][i]; } attr.dim_num = rs_out_dim; - 
tensors[0] = vsi_nn_CreateTensor( graph, &attr ); + res = width % 8; + width = (width >> 3) << 3; + + tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref' + attr.dtype = inputs[2]->attr.dtype; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + if (update_dtype == F16) + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } + tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // temp_buf_int attr.size[0] = 1; - tensors[1] = vsi_nn_CreateTensor( graph, &attr ); attr.size[1] = 1; - tensors[2] = vsi_nn_CreateTensor( graph, &attr ); + tensors[2] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0 + tensors[3] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer1 - status = _query_kernel_large( inputs, outputs, ikernels[0], ikernels[1], kernel); + status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], ikernels[2], kernel, coord_flg, opt_flg); if ( VSI_SUCCESS == status) { - // reset count + // convert ref to output reset_node = vsi_nn_kernel_create_node( graph, ikernels[0] ); if (reset_node) { uint32_t index = 0; /* Pass parameters to node. */ - reset_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + reset_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); reset_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; reset_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + reset_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + reset_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); status = vsi_nn_kernel_node_pass_param( reset_node, reset_params, _SCATTER_ND_UPDATE_RESET_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_tensor_release( &reset_params[0] ); + vsi_nn_kernel_scalar_release( &reset_params[3] ); + vsi_nn_kernel_scalar_release( &reset_params[4] ); } - // pre-process - tmp_node = vsi_nn_kernel_create_node( graph, ikernels[1] ); - if (tmp_node) + // update + update_node = vsi_nn_kernel_create_node( graph, ikernels[1] ); + if (update_node) + { + uint32_t index = 0; + /* Pass parameters to node. 
*/ + update_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + update_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + update_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + update_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[3] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[4] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[5] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[6] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( update_node, update_params, + _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &update_params[0] ); + vsi_nn_kernel_tensor_release( &update_params[1] ); + vsi_nn_kernel_scalar_release( &update_params[4] ); + vsi_nn_kernel_scalar_release( &update_params[5] ); + vsi_nn_kernel_scalar_release( &update_params[6] ); + vsi_nn_kernel_scalar_release( &update_params[7] ); + vsi_nn_kernel_scalar_release( &update_params[8] ); + vsi_nn_kernel_scalar_release( &update_params[9] ); + vsi_nn_kernel_scalar_release( &update_params[10] ); + vsi_nn_kernel_scalar_release( &update_params[11] ); + } + + // ref + ref_node = vsi_nn_kernel_create_node( graph, ikernels[2] ); + if (ref_node) { uint32_t index = 0; /* Pass parameters to node. 
*/ - pre_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); - pre_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); - pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; - pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; - pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; - pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); - pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); - pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); - pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); - status = vsi_nn_kernel_node_pass_param( tmp_node, pre_params, _SCATTER_ND_UPDATE_PRE_PARAM_NUM ); + ref_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + ref_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; + ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[3]->t; + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[3] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[4] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[5] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[6] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( ref_node, ref_params, _SCATTER_ND_UPDATE_REF_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_tensor_release( &pre_params[0] ); - vsi_nn_kernel_tensor_release( &pre_params[1] ); - vsi_nn_kernel_scalar_release( &pre_params[5] ); - vsi_nn_kernel_scalar_release( &pre_params[6] ); - vsi_nn_kernel_scalar_release( &pre_params[7] ); - vsi_nn_kernel_scalar_release( &pre_params[8] ); + vsi_nn_kernel_tensor_release( &ref_params[0] ); + vsi_nn_kernel_tensor_release( &ref_params[1] ); + vsi_nn_kernel_scalar_release( &ref_params[6] ); + vsi_nn_kernel_scalar_release( &ref_params[7] ); + vsi_nn_kernel_scalar_release( &ref_params[8] ); + vsi_nn_kernel_scalar_release( &ref_params[9] ); + vsi_nn_kernel_scalar_release( &ref_params[10] ); + vsi_nn_kernel_scalar_release( &ref_params[11] ); + vsi_nn_kernel_scalar_release( &ref_params[12] ); + vsi_nn_kernel_scalar_release( &ref_params[13] ); } + // copy to output node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { uint32_t index = 0; /* Pass parameters to node. 
*/ - node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); - node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; - node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; - node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; - node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); - node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ND_UPDATE_POST_PARAM_NUM ); + cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[3]->t; + cpy_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + cpy_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + cpy_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); + status = vsi_nn_kernel_node_pass_param( node, cpy_params, _SCATTER_ND_UPDATE_COPY_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_tensor_release( &node_params[0] ); - vsi_nn_kernel_tensor_release( &node_params[4] ); - vsi_nn_kernel_tensor_release( &node_params[5] ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - vsi_nn_kernel_scalar_release( &node_params[7] ); - vsi_nn_kernel_scalar_release( &node_params[8] ); - vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_tensor_release( &cpy_params[2] ); + vsi_nn_kernel_scalar_release( &cpy_params[3] ); + vsi_nn_kernel_scalar_release( &cpy_params[4] ); } } @@ -2306,6 +1932,10 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_kernel_release( &ikernels[1] ); } + if ( ikernels[2] ) + { + vsi_nn_kernel_release( &ikernels[2] ); + } if ( tensors[0] ) { vsi_nn_ReleaseTensor( &tensors[0] ); @@ -2318,41 +1948,33 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_ReleaseTensor( &tensors[2] ); } + if ( tensors[3] ) + { + vsi_nn_ReleaseTensor( &tensors[3] ); + } + if (ref_node) {vsi_nn_kernel_node_release( &ref_node );} if (reset_node) {vsi_nn_kernel_node_release( &reset_node );} - if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + if (update_node) {vsi_nn_kernel_node_release( &update_node );} } - else + +final: + if (ikernels[0]) { - status = _query_kernel( inputs, outputs, kernel, coord_dim, big_flg); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - uint32_t index = 0; - /* Pass parameters to node. 
*/ - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); - //tmp_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); - status = vsi_nn_kernel_node_pass_param( node, tmp_params, _SCATTER_ND_UPDATE_PARAM_NUM ); - CHECK_STATUS(status); - vsi_nn_kernel_tensor_release( &tmp_params[0] ); - vsi_nn_kernel_tensor_release( &tmp_params[1] ); - vsi_nn_kernel_tensor_release( &tmp_params[2] ); - vsi_nn_kernel_tensor_release( &tmp_params[3] ); - vsi_nn_kernel_scalar_release( &tmp_params[4] ); - vsi_nn_kernel_scalar_release( &tmp_params[5] ); - vsi_nn_kernel_scalar_release( &tmp_params[6] ); - vsi_nn_kernel_scalar_release( &tmp_params[7] ); - } - } + vsi_nn_kernel_release(&ikernels[0]); + } + if (ikernels[1]) + { + vsi_nn_kernel_release(&ikernels[1]); + } + if (ikernels[2]) + { + vsi_nn_kernel_release(&ikernels[2]); } + vsi_safe_release_tensor(tensors[0]); + vsi_safe_release_tensor(tensors[1]); + vsi_safe_release_tensor(tensors[2]); + vsi_safe_release_tensor(tensors[3]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c index fae6ad78c..b918e2c08 100644 --- a/src/tim/vx/internal/src/kernel/evis/select_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c @@ -34,6 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" __BEGIN_DECLS @@ -61,6 +62,10 @@ typedef enum _internal_img_dim_e CVIVANTE_NAMESPACE("evis.select_"STR(COND_DTYPE)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ _SELECT_KERNEL_SOURCE} +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + typedef struct { uint32_t key; @@ -138,7 +143,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer) (( IN0_TYPE << 24) | ( IN1_TYPE << 16) | ( OUT_TYPE << 8)) #define MAX_MULTIPLIER_NUM (65535) #define MAX_POST_SHIFT_BITS (31) - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. 
gpu_param_t gpu_param = { 3, @@ -166,6 +171,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer) uint16_t in1_M0 = 0; int32_t in1_postShift = 0; uint32_t pack_key = 0; + + VSI_UNREFERENCED(param_size); input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1); @@ -444,15 +451,67 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_SELECT_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t* shapes_ptr[_IO_NUM]; + vsi_size_t* shapes_in[_INPUT_NUM]; + vsi_size_t rank_in[_INPUT_NUM]; + uint32_t new_rank = 0; + uint32_t i = 0; + vsi_bool ret = FALSE; + + VSI_UNREFERENCED(params); + + for (i = 0; i < _IO_NUM; i++) + { + shapes_ptr[i] = shapes[i]; + } + + for (i = 0; i < _INPUT_NUM; i++) + { + shapes_in[i] = inputs[i]->attr.size; + rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num; + } + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const vsi_size_t**)shapes_in, rank_in, _INPUT_NUM, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes_ptr, shapes[_INPUT_NUM], &new_rank); + + if ( ret ) + { + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], shapes[i], new_rank ); + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[i], shapes[i + _INPUT_NUM], new_rank ); + } + } + else + { + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = inputs[i]; + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + reshape_tensors[i + _INPUT_NUM] = outputs[i]; + } + } - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[3]->attr.size, + reshape_tensors[3]->attr.dim_num ) ) { return NULL; } - image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( kernel, inputs, outputs, image_2d); + image_2d = (reshape_tensors[3]->attr.dim_num == 2); + status = _query_kernel( kernel, inputs, &reshape_tensors[3], image_2d); if ( VSI_SUCCESS == status) { @@ -460,12 +519,22 @@ static vsi_nn_kernel_node_t _setup if ( node ) { /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM, - inputs, input_num, outputs, output_num ); + &reshape_tensors[0], input_num, &reshape_tensors[3], output_num ); + /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, _SELECT_PARAM_NUM ); } } + if (ret) + { + for (i = 0; i < _IO_NUM; i++) + { + vsi_safe_release_tensor( reshape_tensors[i] ); + } + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c index 5d7e2d6cf..b2e22ed7c 100644 --- a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c @@ -123,6 +123,8 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) int32_t output_zp = 0; int32_t input_zp = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -252,7 +254,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -299,9 +301,11 @@ static int32_t _optimize_mask_shape vsi_status status = VSI_SUCCESS; vsi_size_t in_shape[VSI_NN_MAX_DIM_NUM] = {0}; vsi_size_t new_rank = 0; - uint32_t i = 0; + vsi_size_t i = 0; + + VSI_UNREFERENCED(outputs); - for(i = 0; i < inputs[0]->attr.dim_num; i++) + for (i = 0; i < (vsi_size_t)inputs[0]->attr.dim_num; i++) { in_shape[i] = inputs[0]->attr.size[i]; } @@ -313,7 +317,7 @@ static int32_t _optimize_mask_shape } opt_shape_out[0] = max_len; - for(i = 0; i < (uint32_t)new_rank; i++) + for (i = 0; i < new_rank; i++) { opt_shape_out[i + 1] = opt_shape_in[i]; } @@ -344,6 +348,9 @@ static vsi_nn_kernel_node_t _setup int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); int32_t is2Dflg = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c b/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c index bcfe0d01c..6fca37fce 100644 --- a/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c @@ -95,7 +95,10 @@ DEF_KERNEL_INITIALIZER(_signal_frame_initializer) vsi_nn_kernel_tensor_attr_t * attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); out_shape = attr->shape; gpu_param.global_scale[0] = 16; diff --git a/src/tim/vx/internal/src/kernel/evis/slice_evis.c b/src/tim/vx/internal/src/kernel/evis/slice_evis.c index 883947073..773d38b0d 100644 --- a/src/tim/vx/internal/src/kernel/evis/slice_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/slice_evis.c @@ -162,6 +162,8 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) int32_t is_samefl = 0; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); @@ -409,6 +411,8 @@ static vsi_nn_kernel_node_t _setup vsi_size_t output_batch = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; vsi_bool is_same_quant = FALSE; + VSI_UNREFERENCED(params); + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, shapes[0], &rank[0]); vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, diff --git a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c index 2b9d53e94..f95405aca 100644 --- a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c @@ -125,6 +125,8 @@ DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -273,7 +275,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -323,6 +327,9 @@ static vsi_nn_kernel_node_t _setup int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 1 : 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c index 46595a170..f31de5495 100644 --- a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c @@ -165,6 +165,8 @@ DEF_KERNEL_INITIALIZER(_get_matrix_initializer) float output_h = 1.0f; float scale[4] = {0}; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -256,6 +258,8 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer) float output_scale = 1.0f; float output_zp = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); @@ -309,7 +313,6 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer) gpu_param.global_size[1] = out_shape->data[1]; gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; - do { gpu_dp_inst_t uniConvertDatatoF32_0_4x4 = {{ 0x01010101, // TCfg @@ -369,7 +372,7 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer) "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); } CHECK_STATUS_FAIL_GOTO(status, final ); - }while(0); + } status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -502,6 +505,9 @@ static vsi_nn_kernel_node_t _setup float output_h = (float)outputs[0]->attr.size[1]; int32_t i = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (align_corners && output_w > 1) { output_w = output_w - 1; @@ -565,42 +571,46 @@ static vsi_nn_kernel_node_t _setup // Get Matrix node = vsi_nn_kernel_create_node( graph, ikernels[MATRIX_INDEX] ); - vsi_nn_kernel_node_pack_io( node_params, _GET_MATRIX_PARAM_NUM, - &inputs[1], 1, &tensors[0], 1 ); - node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_1 ); - node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 ); - node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 ); - node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 ); - node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 ); - node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 ); - node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 ); - node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 ); - node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 ); - node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 ); - node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 ); - node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 ); - node_params[I_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &input_w ); - node_params[I_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &input_h ); - node_params[O_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &output_w ); - node_params[O_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &output_h ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _GET_MATRIX_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] ); - vsi_nn_kernel_scalar_release( &node_params[I_WIDTH] ); - vsi_nn_kernel_scalar_release( &node_params[I_HEIGHT] ); - vsi_nn_kernel_scalar_release( &node_params[O_WIDTH] ); - vsi_nn_kernel_scalar_release( &node_params[O_HEIGHT] ); - vsi_nn_kernel_node_release( &node ); + + if (node) + { + vsi_nn_kernel_node_pack_io( node_params, _GET_MATRIX_PARAM_NUM, + &inputs[1], 1, &tensors[0], 1 ); + node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_1 ); + node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 ); + 
node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 ); + node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 ); + node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 ); + node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 ); + node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 ); + node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 ); + node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 ); + node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 ); + node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 ); + node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 ); + node_params[I_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &input_w ); + node_params[I_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &input_h ); + node_params[O_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &output_w ); + node_params[O_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &output_h ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _GET_MATRIX_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] ); + vsi_nn_kernel_scalar_release( &node_params[I_WIDTH] ); + vsi_nn_kernel_scalar_release( &node_params[I_HEIGHT] ); + vsi_nn_kernel_scalar_release( &node_params[O_WIDTH] ); + vsi_nn_kernel_scalar_release( &node_params[O_HEIGHT] ); + vsi_nn_kernel_node_release( &node ); + } // Warp Affine node = vsi_nn_kernel_create_node( graph, ikernels[WARP_AFFINE_INDEX] ); @@ -617,19 +627,26 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; } status = vsi_nn_kernel_node_set_border( node, &border ); - VSI_ASSERT( status == VSI_SUCCESS ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + vsi_nn_kernel_node_pack_io( warp_affine_node_params, _WARP_AFFINE_PARAM_NUM, + warp_affine_tensors, 2, outputs, 1 ); + status = vsi_nn_kernel_node_pass_param( node, warp_affine_node_params, _WARP_AFFINE_PARAM_NUM ); + if ( VSI_SUCCESS != status ) + { + goto final; + } } - vsi_nn_kernel_node_pack_io( warp_affine_node_params, _WARP_AFFINE_PARAM_NUM, - warp_affine_tensors, 2, outputs, 1 ); - status = vsi_nn_kernel_node_pass_param( node, warp_affine_node_params, _WARP_AFFINE_PARAM_NUM ); final: - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { - if( ikernels[i] ) + if ( ikernels[i] ) { vsi_nn_kernel_release( &ikernels[i] ); } - if( tensors[i] ) + if ( tensors[i] ) { vsi_nn_ReleaseTensor( &tensors[i] ); } diff --git a/src/tim/vx/internal/src/kernel/evis/swish_evis.c b/src/tim/vx/internal/src/kernel/evis/swish_evis.c index 724037575..befe6ac74 100644 --- 
a/src/tim/vx/internal/src/kernel/evis/swish_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/swish_evis.c @@ -154,7 +154,7 @@ DEF_KERNEL_INITIALIZER(_swish_initializer) size_t param_size ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -177,6 +177,8 @@ DEF_KERNEL_INITIALIZER(_swish_initializer) vsi_size_array_t *out_shape = NULL; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -365,7 +367,7 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer) size_t param_size ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -387,6 +389,8 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer) vsi_size_array_t *out_shape = NULL; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -649,6 +653,9 @@ static vsi_nn_kernel_node_t _setup int32_t swish_type = vsi_nn_kernel_param_get_int32( params, "type" ); float beta = 1.0f; vsi_bool ret = FALSE; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); #if (VX_ACTIVATION_EXT_SUPPORT) if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) { diff --git a/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c b/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c index 15854526a..4a57905ce 100644 --- a/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c @@ -123,6 +123,8 @@ DEF_KERNEL_INITIALIZER(_tensorstackconcat_initializer) vsi_size_array_t * in_shape = NULL; // Add initializer + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -225,6 +227,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(params); + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); status = _query_kernel( kernel, inputs, outputs, image_2d ); if ( VSI_SUCCESS == status) diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index 50e43cf81..f46941aff 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -272,6 +272,8 @@ DEF_KERNEL_INITIALIZER(_tile_initializer) int32_t output_ZP = 0; int32_t input_ZP = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -408,7 +410,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int32_t i = 0; + size_t i = 0; int32_t dim0_size1 = inputs[0]->attr.size[0] == 1 ? 
1 : 0; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -497,6 +499,11 @@ static vsi_nn_kernel_node_t _setup uint32_t dim = inputs[0]->attr.dim_num; vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 0 }; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + + for ( i = 0; i < dim; i++) { multiples[i] = outputs[0]->attr.size[i] / inputs[0]->attr.size[i]; @@ -515,10 +522,34 @@ static vsi_nn_kernel_node_t _setup return NULL; } - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], shapes[0], new_rank ); - reshape_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], shapes[2], new_rank ); + if ( new_rank == 4) + { + vsi_size_t newshapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + newshapes[0][0] = shapes[0][0]; + newshapes[2][0] = shapes[2][0]; + newshapes[0][1] = shapes[0][1]; + newshapes[2][1] = shapes[2][1]; + newshapes[0][2] = shapes[0][2] * shapes[0][3]; + newshapes[2][2] = shapes[2][2] * shapes[2][3]; + + if (newshapes[0][2] >= GPU_TENSOR_MAX_WIDTH || + newshapes[2][2] >= GPU_TENSOR_MAX_WIDTH) + { + return NULL; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], newshapes[0], 3 ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], newshapes[2], 3 ); + } + else + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + } } else { @@ -532,7 +563,7 @@ static vsi_nn_kernel_node_t _setup } remainder = reshape_tensors[0]->attr.size[0] % 8; - image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1); + image_2d = reshape_tensors[0]->attr.dim_num == 2; status = _query_kernel( &reshape_tensors[0], &reshape_tensors[1], image_2d, remainder, kernel ); if( VSI_SUCCESS == status) { @@ -540,9 +571,9 @@ static vsi_nn_kernel_node_t _setup if( node ) { /* Pass parameters to node. */ - vsi_size_t depthIn = new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1; - vsi_size_t depthOut = new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1; - vsi_size_t batchIn = new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1; + vsi_size_t depthIn = new_rank > 2 ? shapes[0][2] : 1; + vsi_size_t depthOut = new_rank > 2 ? shapes[2][2] : 1; + vsi_size_t batchIn = new_rank > 3 ? shapes[0][3] : 1; shapes[1][2] = shapes[1][2] == 0 ? 1 : shapes[1][2]; shapes[1][3] = shapes[1][3] == 0 ? 
1 : shapes[1][3]; diff --git a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c index 0ac1b6d28..fb78c4905 100644 --- a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c @@ -162,6 +162,8 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer) float factorOut = 1.0f; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); axis_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c index 27a478b0e..6bc113f3c 100644 --- a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c @@ -152,6 +152,8 @@ DEF_KERNEL_INITIALIZER(_upsamplescale_initializer) uint32_t pack_key = 0; _internal_upscale_e flag = UP_ORG; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -344,7 +346,7 @@ static vsi_status _query_kernel _internal_upscale_e flag = (stride == 2 && scale >= 0 ) ? UP_K2 : UP_ORG; uint32_t key = 0; - int i; + size_t i; in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index aa05c359d..83334269c 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -136,6 +136,10 @@ static vsi_status VX_CALLBACK _kernel_validator vx_meta_format metas[] ) { + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(parameters); + VSI_UNREFERENCED(num); + VSI_UNREFERENCED(metas); return VSI_SUCCESS; } /* _kernel_validator() */ @@ -146,6 +150,9 @@ static vsi_status VX_CALLBACK _kernel_initializer uint32_t paraNum ) { + VSI_UNREFERENCED(nodObj); + VSI_UNREFERENCED(paramObj); + VSI_UNREFERENCED(paraNum); return VSI_SUCCESS; } /* _kernel_initializer() */ @@ -156,6 +163,9 @@ static vsi_status VX_CALLBACK _kernel_deinitializer uint32_t paraNum ) { + VSI_UNREFERENCED(nodObj); + VSI_UNREFERENCED(paraObj); + VSI_UNREFERENCED(paraNum); return VSI_SUCCESS; } /* _kernel_deinitializer() */ @@ -287,6 +297,9 @@ static const uint8_t* _load_internal_executable vsi_nn_kernel_type_e type ) { + VSI_UNREFERENCED(source_name); + VSI_UNREFERENCED(size); + VSI_UNREFERENCED(type); #if VSI_USE_VXC_BINARY switch( type ) { @@ -518,8 +531,10 @@ static vx_program _create_program_from_executable program_info.data = _load_internal_executable( source_info->data[0], &program_info.size, kernel->type); + CHECK_PTR_FAIL_GOTO( program_info.data, "Create buffer fail.", final ); program = vxCreateProgramWithBinary( graph->ctx->c, (const vx_uint8 *)program_info.data, program_info.size ); +final: return program; } /* _create_program_from_executable() */ diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c index ecbdccf06..26c918079 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c +++ 
b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c @@ -113,6 +113,12 @@ static vsi_size_t eltwise_fill_dim vsi_size_t divisor = 0; vsi_size_t remainder = 0; compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); + if (divisor == 0) + { + VSILOGE( "divisor might be used in a division by zero." ); + cost_size = (vsi_size_t)-1; + goto final; + } remainder = size_output / divisor; if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) { @@ -152,6 +158,7 @@ static vsi_size_t eltwise_fill_dim shape_output[rank + 1] = remainder; } } +final: return cost_size; } /* eltwise_fill_dim() */ @@ -177,11 +184,11 @@ vsi_bool vsi_nn_kernel_optimize_eltwise_shape eltwise_broadcast_state_e prv_state = ELTWISE_BROADCAST_STATE_EMPTY; #define _swap_size(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } for( i = 0; i < rank_output; i++ ) { sx = i < rank_x ? shape_x[i] : 1; @@ -352,6 +359,12 @@ static vsi_size_t broadcast_fill_dim vsi_size_t divisor = 0; vsi_size_t remainder = 0; compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); + if (divisor == 0) + { + VSILOGE( "divisor might be used in a division by zero." ); + cost_size = (vsi_size_t)-1; + goto final; + } remainder = size_output / divisor; if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) { @@ -386,6 +399,7 @@ static vsi_size_t broadcast_fill_dim shape_output[rank + 1] = remainder; } } +final: return cost_size; } /* broadcast_fill_dim() */ @@ -412,11 +426,11 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape int32_t prv_state_mask = -1; #define _swap_size(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } if (input_num > MAX_INPUT_NUM) { diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index f3a8f4fce..18919b4d5 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -28,6 +28,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_math.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "kernel/vsi_nn_kernel_eltwise.h" static vsi_bool compute_gpu_divisor ( @@ -84,6 +85,12 @@ static vsi_size_t element_fill_dim vsi_size_t divisor = 0; vsi_size_t remainder = 0; compute_gpu_divisor( size_x, max_rank, 1, &divisor ); + if (divisor == 0) + { + VSILOGE( "divisor might be used in a division by zero." ); + cost_size = (vsi_size_t)-1; + goto final; + } remainder = size_x / divisor; if ( remainder > max_rank || rank_x >= max_rank) { @@ -109,6 +116,7 @@ static vsi_size_t element_fill_dim } } } +final: return cost_size; } /* element_fill_dim() */ @@ -132,6 +140,9 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape vsi_size_t outerSize = 1; vsi_size_t axisSize = 1; + VSI_UNREFERENCED(shape_output); + VSI_UNREFERENCED(rank_output); + for (i = 0; i < axis_size; i++) { axisSize *= shape_x[axis[i]]; @@ -391,6 +402,12 @@ static vsi_size_t tile_fill_dim vsi_size_t divisor = 0; vsi_size_t remainder = 0; compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); + if (divisor == 0) + { + VSILOGE( "divisor might be used in a division by zero." 
); + cost_size = (vsi_size_t)-1; + goto final; + } remainder = size_output / divisor; if ( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) { @@ -430,6 +447,7 @@ static vsi_size_t tile_fill_dim shape_output[rank + 1] = remainder; } } +final: return cost_size; } /* eltwise_fill_dim() */ @@ -442,35 +460,126 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape vsi_size_t* out_shape_output, vsi_size_t* out_rank_output ) { - vsi_bool ret = TRUE; - vsi_bool append_dim = FALSE; - vsi_size_t i = 0; - vsi_size_t dims = 0; + vsi_bool ret = TRUE; + vsi_bool append_dim = FALSE; + vsi_size_t i = 0; + vsi_size_t j = 0; + vsi_size_t dims = 0; vsi_size_t effective_size_x = 1; vsi_size_t effective_size_y = 1; vsi_size_t effective_size_z = 1; vsi_size_t sx = 0; vsi_size_t sy = 0; vsi_size_t sz = 0; + int32_t idx_start = -1; + int32_t idx_end = 0; tile_axis_state_e state = TILE_STATE_EMPTY; tile_axis_state_e next_state = TILE_STATE_EMPTY; + vsi_size_t* temp_shape_x = NULL; + vsi_size_t* temp_shape_y = NULL; + vsi_size_t* temp_shape_output = NULL; + vsi_size_t temp_rank = 0; #define _swap_size(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) - for( i = 0; i < rank_output; i++ ) + } + + VSI_UNREFERENCED(rank_x); + VSI_UNREFERENCED(rank); + + temp_shape_x = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t)); + if (temp_shape_x == NULL) { - sx = shape_x[i]; - sy = multiples[i]; - sz = shape_output[i]; + VSILOGE( "malloc temp_shape_x error." ); + ret = FALSE; + goto final; + } + + temp_shape_y = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t)); + if (temp_shape_y == NULL) + { + VSILOGE( "malloc temp_shape_y error." ); + ret = FALSE; + goto final; + } + + temp_shape_output = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t)); + if (temp_shape_output == NULL) + { + VSILOGE( "malloc temp_shape_output error." 
); + ret = FALSE; + goto final; + } + memcpy(temp_shape_x, shape_x, rank * sizeof(vsi_size_t)); + memcpy(temp_shape_y, multiples, rank * sizeof(vsi_size_t)); + memcpy(temp_shape_output, shape_output, rank * sizeof(vsi_size_t)); + + for (i = 0, temp_rank = 0; i < rank_output; i++) + { + if (i == rank_output - 1 && temp_shape_x[i] == 1) + { + if (idx_start >= 0) + { + sx = 1; + sy = temp_shape_y[idx_start]; + sz = temp_shape_output[idx_start]; + idx_end = (int32_t)i ; + for (j = (vsi_size_t)idx_start + 1; j <= (vsi_size_t)idx_end; j++) + { + sy *= temp_shape_y[j]; + sz *= temp_shape_output[j]; + } + temp_rank += tile_fill_dim( temp_shape_x, temp_shape_y, temp_shape_output, + temp_rank, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); + idx_start = -1; + } + else + { + temp_shape_x[temp_rank] = temp_shape_x[i]; + temp_shape_y[temp_rank] = temp_shape_y[i]; + temp_shape_output[temp_rank++] = temp_shape_output[i]; + } + } + else if (temp_shape_x[i] != 1) + { + idx_end = (int32_t)i - 1; + if (idx_start >= 0) + { + sx = 1; + sy = temp_shape_y[idx_start]; + sz = temp_shape_output[idx_start]; + for (j = (vsi_size_t)idx_start + 1; j <= (vsi_size_t)idx_end; j++) + { + sy *= temp_shape_y[j]; + sz *= temp_shape_output[j]; + } + temp_rank += tile_fill_dim( temp_shape_x, temp_shape_y, temp_shape_output, + temp_rank, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); + idx_start = -1; + } + temp_shape_x[temp_rank] = temp_shape_x[i]; + temp_shape_y[temp_rank] = temp_shape_y[i]; + temp_shape_output[temp_rank++] = temp_shape_output[i]; + } + else if (idx_start == -1) + { + idx_start = (int32_t)i; + } + } + + for( i = 0; i < temp_rank; i++ ) + { + sx = temp_shape_x[i]; + sy = temp_shape_y[i]; + sz = temp_shape_output[i]; /* * Skip dim if the size is equal to 1 * Also skip if ( sx == 1 && sy == 1 ) */ - if ( shape_output[i] == 1 ) + if ( temp_shape_output[i] == 1 ) { continue; } @@ -490,8 +599,8 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape VSI_ASSERT( FALSE ); } - next_state = (i + 1) < rank_output ? - (multiples[i + 1] == 1 ? TILE_STATE_NO_AXIS : TILE_STATE_AXIS_X) : TILE_STATE_EMPTY; + next_state = (i + 1) < temp_rank ? + (temp_shape_y[i + 1] == 1 ? TILE_STATE_NO_AXIS : TILE_STATE_AXIS_X) : TILE_STATE_EMPTY; append_dim = FALSE; #define _pack_state( cur_state, next_state ) (next_state << 16 | cur_state) @@ -507,9 +616,13 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape * ...,x1,x2,... * ...,y1,y2,... 
*/ + case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_EMPTY ): + effective_size_x = sx; + effective_size_y = sy; + effective_size_z = sz; + break; case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_AXIS_X ): case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_NO_AXIS ): - case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_EMPTY ): append_dim = TRUE; break; /* @@ -548,7 +661,7 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape if ( ret ) { /* Append the last dim */ - if ( i == rank_output ) + if ( i == temp_rank ) { sx = effective_size_x; sy = effective_size_y; @@ -573,6 +686,23 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape *out_rank_output = (uint32_t)dims; } #undef _swap_size +final: + if (temp_shape_x) + { + free( temp_shape_x); + temp_shape_x = NULL; + } + if (temp_shape_y) + { + free( temp_shape_y); + temp_shape_y = NULL; + } + if (temp_shape_output) + { + free( temp_shape_output); + temp_shape_output = NULL; + } + return ret; } /* vsi_nn_kernel_optimize_eltwise_shape() */ @@ -612,7 +742,7 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape return TRUE; } -static vsi_bool vsi_nn_kernel_optimize_element_shape_with_max_rank +vsi_bool vsi_nn_kernel_optimize_element_shape_with_max_rank ( const vsi_size_t* shape_x, const vsi_size_t rank_x, vsi_size_t* out_shape_x, vsi_size_t* out_rank_x, vsi_size_t max_rank @@ -755,3 +885,415 @@ vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape return ret; } /* vsi_nn_kernel_optimize_scatter_elements_shape() */ + + +vsi_bool vsi_nn_kernel_optimize_matrixmul_broadcast_shape + ( + const vsi_size_t * shape_x, + const vsi_size_t * shape_y, + const vsi_size_t * shape_output, + vsi_size_t dim_x, + vsi_size_t dim_y, + vsi_size_t dim_out, + vsi_size_t* out_shape_x, + vsi_size_t* out_shape_y, + vsi_size_t* out_shape_output, + uint32_t* new_rank_out, + uint32_t* cross_flg, + uint32_t* size_axis_inner_outer, + uint32_t* strides_axis_inner_outer + ) +{ + vsi_bool ret = FALSE; + vsi_size_t rank_in[2] = {0, 0}; + vsi_size_t rank_out = 0; + vsi_size_t shapes_in_broadcast_part[2][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t* shapes_in_broadcast_part_ptr[2] = {NULL, NULL}; + vsi_size_t shapes_out_broadcast_part[VSI_NN_MAX_DIM_NUM] = {1}; + vsi_size_t out_shape_in[2][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t* out_shape_in_ptr[2] = {NULL, NULL}; + vsi_size_t out_shape_boradcast_output[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t new_rank = 0; + uint32_t i = 0; + vsi_size_t outer0 = 1; + vsi_size_t outer1 = 1; + vsi_size_t outer2 = 1; + vsi_size_t axis_size = 0; + vsi_size_t inner_size = 1; + vsi_size_t outer_size = 1; + vsi_size_t axis_size0 = 1; + vsi_size_t axis_size1 = 1; + vsi_size_t axis_size2 = 1; + vsi_size_t inner_size0 = 0; + vsi_size_t inner_size1 = 0; + vsi_size_t inner_size2 = 0; + vsi_size_t outer_size0 = 0; + vsi_size_t outer_size1 = 0; + vsi_size_t outer_size2 = 0; + uint32_t ne_flg = 0; + uint32_t axis = 0; + uint32_t outer_flg = 0; + uint32_t outer_axis = 0; + uint32_t first_flg = 0; + cross_flg[0] = 0; + + if (dim_x > 2 && dim_y > 2) + { + for (i = 2; i < dim_x; i++) + { + outer0 *= shape_x[i]; + } + for (i = 2; i < dim_y; i++) + { + outer1 *= shape_y[i]; + } + for (i = 2; i < dim_out; i++) + { + outer2 *= shape_output[i]; + } + + for (i = 2; i < vsi_nn_min(dim_x, dim_y); i++) + { + if (shape_x[i] != shape_y[i] && first_flg == 0) + { + if (shape_x[i] == 1) + { + ne_flg = 1; + inner_size = shape_y[i]; + } + else + { + ne_flg = 2; + inner_size = shape_x[i]; + } + first_flg = 1; + continue; + } + else if (ne_flg == 1 && shape_x[i] != shape_y[i] && shape_x[i] == 1 && first_flg == 1) + 
{ + inner_size *= shape_y[i]; + } + else if (ne_flg == 2 && shape_x[i] != shape_y[i] && shape_y[i] == 1 && first_flg == 1) + { + inner_size *= shape_x[i]; + } + else if (ne_flg == 1 && shape_x[i] != shape_y[i] && shape_x[i] != 1 && first_flg == 1) + { + outer_flg = 1; + outer_axis = i; + break; + } + else if (ne_flg == 2 && shape_x[i] != shape_y[i] && shape_y[i] != 1 && first_flg == 1) + { + outer_flg = 2; + outer_axis = i; + break; + } + else if (i > 2 && shape_x[i] == shape_y[i] && shape_y[i] != 1 && first_flg == 1) + { + first_flg = 2; + } + else if (shape_x[i] != shape_y[i] && shape_x[i] != 1 && first_flg == 2) + { + outer_flg = 1; + outer_axis = i; + break; + } + else if (shape_x[i] != shape_y[i] && shape_y[i] != 1 && first_flg == 2) + { + outer_flg = 2; + outer_axis = i; + break; + } + else if (i == 2 && shape_x[i] == shape_y[i] && shape_y[i] != 1) + { + /*axis = 2; + axis_size = shape_x[i];*/ + } + } + + if (ne_flg > 0 && outer0 > 1 && outer1 > 1) + { + for (i = 2; i < vsi_nn_min(dim_x, dim_y); i++) + { + if (shape_x[i] == shape_y[i] && shape_x[i] != 1) + { + cross_flg[0] = 1; + axis = i; + axis_size = shape_x[i]; + break; + } + } + } + + if (cross_flg[0] == 1) // cross + { + if (outer_flg == 1) + { + for (i = outer_axis; i < dim_x; i++) + { + outer_size *= shape_x[i]; + } + } + else if (outer_flg == 2) + { + for (i = outer_axis; i < dim_y; i++) + { + outer_size *= shape_y[i]; + } + } + else + { + outer_size = 1; + } + + axis_size0 = 1; + axis_size1 = 1; + axis_size2 = 1; + if (axis > 2 && ne_flg == 1) + { + axis_size1 = inner_size; + axis_size2 = inner_size; + } + else if (axis > 2 && ne_flg == 2) + { + axis_size0 = inner_size; + axis_size2 = inner_size; + } + + inner_size0 = 0; + inner_size1 = 0; + inner_size2 = 1; + if (axis == 2 && ne_flg == 1) + { + inner_size1 = axis_size; + inner_size2 = axis_size; + } + else if (axis > 2 && ne_flg == 1) + { + inner_size1 = 1; + } + else if (axis == 2 && ne_flg == 2) + { + inner_size0 = axis_size; + inner_size2 = axis_size; + } + else if (axis > 2 && ne_flg == 2) + { + inner_size0 = 1; + } + + outer_size0 = 0; + outer_size1 = 0; + outer_size2 = axis_size * inner_size; + if (outer_flg == 1) + { + outer_size0 = axis_size0 * axis_size; + } + else if (outer_flg == 2) + { + outer_size1 = axis_size1 * axis_size; + } + + for (i = 0; i < 2; i++) + { + out_shape_x[i] = shape_x[i]; + out_shape_y[i] = shape_y[i]; + out_shape_output[i] = shape_output[i]; + } + out_shape_x[2] = outer0; + out_shape_x[3] = 1; + out_shape_y[2] = outer1; + out_shape_output[2] = outer2; + new_rank_out[0] = 4; + new_rank_out[1] = 3; + new_rank_out[2] = 3; + + size_axis_inner_outer[0] = (uint32_t)axis_size; + size_axis_inner_outer[1] = (uint32_t)inner_size; + size_axis_inner_outer[2] = (uint32_t)outer_size; + + strides_axis_inner_outer[0] = (uint32_t)axis_size0; + strides_axis_inner_outer[1] = (uint32_t)inner_size0; + strides_axis_inner_outer[2] = (uint32_t)outer_size0; + + strides_axis_inner_outer[3] = (uint32_t)axis_size1; + strides_axis_inner_outer[4] = (uint32_t)inner_size1; + strides_axis_inner_outer[5] = (uint32_t)outer_size1; + + strides_axis_inner_outer[6] = (uint32_t)axis_size2; + strides_axis_inner_outer[7] = (uint32_t)inner_size2; + strides_axis_inner_outer[8] = (uint32_t)outer_size2; + + return TRUE; + } + else if (outer0 > 1 && outer1 > 1 && ne_flg > 0 && cross_flg[0] == 0) + { + cross_flg[0] = 2; + } + } + + if (cross_flg[0] == 2) // merge + { + for (i = 0; i < 2; i++) + { + out_shape_x[i] = shape_x[i]; + out_shape_y[i] = shape_y[i]; + out_shape_output[i] = 
shape_output[i]; + } + out_shape_output[2] = outer2; + new_rank_out[2] = 3; + if (ne_flg == 1) + { + out_shape_x[2] = outer0; + out_shape_x[3] = 1; + out_shape_y[2] = outer1; + + new_rank_out[0] = 4; + new_rank_out[1] = 3; + } + else if (ne_flg == 2) + { + out_shape_x[2] = outer0; + out_shape_y[2] = outer1; + out_shape_y[3] = 1; + + new_rank_out[0] = 3; + new_rank_out[1] = 4; + } + + return TRUE; + } + else if (dim_x == 1 && dim_y > 1) + { + out_shape_x[0] = shape_x[0]; + out_shape_x[1] = 1; + + out_shape_y[0] = shape_y[0]; + out_shape_y[1] = shape_y[1]; + + out_shape_output[0] = shape_output[0]; + out_shape_output[1] = 1; + + if (dim_y > 2) + { + shapes_in_broadcast_part[0][0] = 1; + rank_in[0] = 1; + + for (i = 2; i <= dim_y; i++) + { + shapes_in_broadcast_part[1][i - 2] = shape_y[i]; + } + rank_in[1] = dim_y - 2; + + for(i = 1; i <= dim_out; i++) + { + shapes_out_broadcast_part[i - 1] = shape_output[i]; + } + rank_out = dim_out - 1; + } + } + else if (dim_y == 1 && dim_x > 1) + { + out_shape_y[0] = 1; + out_shape_y[1] = shape_y[0]; + + out_shape_x[0] = shape_x[0]; + out_shape_x[1] = shape_x[1]; + + out_shape_output[0] = 1; + out_shape_output[1] = shape_output[0]; + + if (dim_x > 2) + { + shapes_in_broadcast_part[1][0] = 1; + rank_in[1] = 1; + + for (i = 2; i <= dim_x; i++) + { + shapes_in_broadcast_part[0][i - 2] = shape_x[i]; + } + rank_in[0] = dim_x - 2; + + for(i = 1; i <= dim_out; i++) + { + shapes_out_broadcast_part[i - 1] = shape_output[i]; + } + rank_out = dim_out - 1; + } + } + else + { + out_shape_x[0] = shape_x[0]; + out_shape_x[1] = shape_x[1]; + + out_shape_y[0] = shape_y[0]; + out_shape_y[1] = shape_y[1]; + + out_shape_output[0] = shape_output[0]; + out_shape_output[1] = shape_output[1]; + + for (i = 2; i < dim_x; i++) + { + shapes_in_broadcast_part[0][i - 2] = shape_x[i]; + } + for (i = 2; i < dim_y; i++) + { + shapes_in_broadcast_part[1][i - 2] = shape_y[i]; + } + for (i = 2; i < dim_out; i++) + { + shapes_out_broadcast_part[i - 2] = shape_output[i]; + } + rank_in[0] = dim_x - 2; + rank_in[1] = dim_y - 2; + rank_out = dim_out - 2; + + } + + shapes_in_broadcast_part_ptr[0] = shapes_in_broadcast_part[0]; + shapes_in_broadcast_part_ptr[1] = shapes_in_broadcast_part[1]; + out_shape_in_ptr[0] = out_shape_in[0]; + out_shape_in_ptr[1] = out_shape_in[1]; + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const vsi_size_t **)shapes_in_broadcast_part_ptr, rank_in, 2, + shapes_out_broadcast_part, rank_out, + (vsi_size_t **)out_shape_in_ptr, out_shape_boradcast_output, &new_rank); + + if (ret) + { + int32_t j = 0; + + new_rank_out[0] = new_rank + 2; + new_rank_out[1] = new_rank + 2; + new_rank_out[2] = new_rank + 2; + + j = new_rank - 1; + while (out_shape_in[0][j] == 1 && j >= 0) { + new_rank_out[0]--; + j--; + } + + j = new_rank - 1; + while (out_shape_in[1][j] == 1 && j >= 0) { + new_rank_out[1]--; + j--; + } + + j = new_rank - 1; + while (out_shape_boradcast_output[j] == 1 && j >= 0) { + new_rank_out[2]--; + j--; + } + + for (i = 0; i < new_rank; i++) + { + out_shape_x[i + 2] = out_shape_in[0][i]; + out_shape_y[i + 2] = out_shape_in[1][i]; + out_shape_output[i + 2] = out_shape_boradcast_output[i]; + } + } + + return ret; +} diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c index c5b640c55..426dacf16 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c @@ -62,13 +62,13 @@ typedef struct } _param_type; #define CHECK_PARAM_NULL( ptr, rval, 
... ) \ - do { \ + { \ if( ptr == NULL ) { \ VSILOGE(__VA_ARGS__); \ VSI_ASSERT(FALSE); \ return rval; \ } \ - } while(0) + } #define _PARAM_ADD_TEMPLATE(TYPE_NAME, TYPE, PARAM_DTYPE) \ vsi_bool vsi_nn_kernel_param_add_##TYPE_NAME \ diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 7b0c6ca67..6c6dda92c 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -68,6 +68,12 @@ KERNEL_SELECTOR( depthwise_conv1d ) { VSI_NN_KERNEL_TYPE_CL, 3 }, { VSI_NN_KERNEL_TYPE_CPU, 2 }, }; + + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + dilation = dilation == 0 ? 0 : dilation - 1; real_kernel = (kernel - 1) * dilation + kernel; @@ -101,6 +107,12 @@ static vsi_status _select { VSI_NN_KERNEL_TYPE_CL, 1 }, { VSI_NN_KERNEL_TYPE_CPU, 0 }, }; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); return vsi_nn_kernel_pirority_set( selector, pirority, _cnt_of_array(pirority) ); } /* _select */ @@ -141,5 +153,8 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(atan) REGISTER_VX_FIRST_KERNEL_SELECTOR(atanh) REGISTER_VX_FIRST_KERNEL_SELECTOR(acosh) REGISTER_VX_FIRST_KERNEL_SELECTOR(inverse_sigmoid) +#if (VX_TENSOR_SELECT_VX_SUPPORT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(select) +#endif __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index a1680edbf..55a61001a 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -30,6 +30,7 @@ #include "vsi_nn_error.h" #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" typedef enum { @@ -73,6 +74,11 @@ vsi_status vsi_nn_kernel_copy_tensor_veiw_patch vx_trensor_addressing addr = NULL; vx_size dim_sizes[VSI_NN_MAX_DIM_NUM], strides[VSI_NN_MAX_DIM_NUM]; addr = (vx_trensor_addressing)malloc(sizeof(vx_tensorpatch_addressing_t)); + if ( NULL == addr ) + { + VSILOGE("Call vxCreateTensorAddressing fail"); + return status; + } addr->num_of_dims = (vx_uint32)attr->shape->size; for (i = 0; i < dim; i++) @@ -138,6 +144,7 @@ vsi_status vsi_nn_kernel_copy_tensor_veiw_patch } } #endif + return status; } /* vsi_nn_kernel_copy_tensor_veiw_patch() */ @@ -153,6 +160,9 @@ vsi_status vsi_nn_kernel_copy_tensor_patch vsi_size_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM]; vsi_status status = VSI_FAILURE; uint32_t i; + + VSI_UNREFERENCED(buffer_size); + if (NULL == tensor || NULL == user_ptr) { VSILOGE("Invalid parameter"); @@ -377,10 +387,12 @@ vsi_status vsi_nn_kernel_tensor_write_from_float vsi_size_t sz = 0; sz = vsi_nn_kernel_tensor_attr_get_size( attr ); internal_buffer0 = malloc( sz ); + CHECK_PTR_FAIL_GOTO( internal_buffer0, "Create buffer fail.", final ); } else { internal_buffer0 = malloc( bytes ); + CHECK_PTR_FAIL_GOTO( internal_buffer0, "Create buffer fail.", final ); internal_buffer = internal_buffer0; } @@ -422,6 +434,7 @@ vsi_status vsi_nn_kernel_tensor_write_from_float if ( attr->dtype == I4 || attr->dtype == U4 ) { internal_buffer = malloc( bytes ); + CHECK_PTR_FAIL_GOTO( internal_buffer, "Create buffer fail.", final ); status = vsi_nn_kernel_pack_4bit_data(attr, (uint8_t*)internal_buffer0, (uint8_t*)internal_buffer); } } @@ -442,7 
+455,7 @@ vsi_status vsi_nn_kernel_tensor_write_from_float { vsi_nn_kernel_tensor_attr_release( &internal_attr ); } - if ( attr->dtype == I4 || attr->dtype == U4 ) + if ( attr && (attr->dtype == I4 || attr->dtype == U4) ) { vsi_nn_safe_free(internal_buffer0); } @@ -562,6 +575,8 @@ static void _convert_tensor_attr_to_vx_tensor_param MAP_TYPE( p->data_format, F64, VSI_NN_TYPE_FLOAT64 ); MAP_TYPE( p->data_format, BF16, VSI_NN_TYPE_BFLOAT16 ); MAP_TYPE( p->data_format, BOOL8, VSI_NN_TYPE_BOOL8 ); + MAP_TYPE( p->data_format, FP8_E4M3, VSI_NN_TYPE_FLOAT8_E4M3 ); + MAP_TYPE( p->data_format, FP8_E5M2, VSI_NN_TYPE_FLOAT8_E5M2 ); default: VSI_ASSERT( FALSE ); break; @@ -577,6 +592,12 @@ static void _convert_tensor_attr_to_vx_tensor_param MAP_TYPE( p->quant_format, VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL, VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC ); + MAP_TYPE(p->quant_format, + VSI_NN_KERNEL_QUANT_FLOAT8, + VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8); + MAP_TYPE(p->quant_format, + VSI_NN_KERNEL_QUANT_FLOAT8_PERCHANNEL, + VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8); default: VSI_ASSERT( FALSE ); break; @@ -615,11 +636,11 @@ vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_create //convert attr->shape->data to correct data type for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_vxsize[i] = -1 == attr->shape->data[i] ? -1 : (vx_size)attr->shape->data[i]; + size_vxsize[i] = (vsi_size_t)-1 == attr->shape->data[i] ? (vx_size)-1 : (vx_size)attr->shape->data[i]; } for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_u32[i] = -1 == attr->shape->data[i] ? -1 : (vx_uint32)attr->shape->data[i]; + size_u32[i] = (vsi_size_t)-1 == attr->shape->data[i] ? (vx_uint32)-1 : (vx_uint32)attr->shape->data[i]; } #ifdef VSI_40BIT_VA_SUPPORT params.sizes = size_vxsize; @@ -672,6 +693,8 @@ vsi_nn_tensor_t* vsi_nn_pad_tensor vsi_nn_dtype_t dst_type; vsi_nn_tensor_t *output = NULL; + VSI_UNREFERENCED(mode); + input_data_ptr = vsi_nn_ConvertTensorToFloat32Data(graph, input); CHECK_PTR_FAIL_GOTO( input_data_ptr, "Create data ptr fail.", final ); @@ -764,6 +787,7 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias uint32_t i, j; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); weight_data = vsi_nn_ConvertTensorToData(graph, weight); + CHECK_PTR_FAIL_GOTO( weight_data, "Create buffer fail.", final ); if (bias == NULL) { @@ -787,9 +811,11 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias attr.dim_num = 2; } bias_data = (int32_t *)vsi_nn_ConvertTensorToData(graph, bias); + CHECK_PTR_FAIL_GOTO( new_bias_data_ptr, "Create buffer fail.", final ); } new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t)); + CHECK_PTR_FAIL_GOTO( new_bias_data_ptr, "Create buffer fail.", final ); memset((void *)new_bias_data_ptr, 0, sizeof(int32_t) * attr.size[0]); if (input->attr.dtype.zero_point != 0) @@ -815,6 +841,7 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias new_bias = vsi_nn_CreateTensorFromData(graph, (uint8_t *)new_bias_data_ptr, &attr); +final: vsi_nn_safe_free( new_bias_data_ptr ); vsi_nn_safe_free( bias_data ); vsi_nn_safe_free( weight_data ); diff --git a/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c b/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c index 6756e3a16..a40bd81ba 100644 --- a/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c @@ -29,6 +29,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" #define REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( kernel_name ) \ static 
vsi_nn_kernel_node_t _##kernel_name##setup \ @@ -62,6 +63,11 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) vsi_nn_tensor_t * a_times_b = NULL; vsi_nn_tensor_attr_t attr; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + scale = 1.0; overflow_policy = VX_CONVERT_POLICY_SATURATE; rounding_policy = VX_ROUND_POLICY_TO_ZERO; @@ -70,7 +76,7 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) if(!scale_s) { VSILOGE("CreateScalar fail\n"); - goto OnError; + goto final; } memset(&attr, 0, sizeof(attr)); @@ -79,6 +85,7 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) attr.vtl = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; a_times_b = vsi_nn_CreateTensor(graph, &attr); + CHECK_PTR_FAIL_GOTO( a_times_b, "Create tensor fail.", final ); node = vxTensorMultiplyNode( graph->g, inputs[0]->t, inputs[1]->t, @@ -89,7 +96,7 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) if( NULL == node ) { VSILOGE("Call vxTensorMultiplyNode fail.(a_times_b_plus_c)"); - goto OnError; + goto final; } node = vxTensorAddNode( graph->g, a_times_b->t, inputs[2]->t, @@ -97,10 +104,10 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) if( NULL == node ) { VSILOGE("Call vxTensorAddNode fail.(a_times_b_plus_c)"); - goto OnError; + goto final; } -OnError: +final: if (scale_s) vxReleaseScalar(&scale_s); if (a_times_b) vsi_nn_ReleaseTensor(&a_times_b); diff --git a/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c index 955c61d2c..5fd98c2a9 100644 --- a/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c @@ -48,6 +48,10 @@ static vsi_nn_kernel_node_t _setup vx_node node = NULL; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxBatchNormalizationLayer( graph->g, eps, diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index 2f9be4903..d77719477 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -293,6 +293,14 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) vx_tensor temp_tensors[3] = { NULL }; uint32_t i = 0; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + _build_vx_conv2d_param( &vxparam, 1, vsi_nn_kernel_param_get_int32(params, "stride"), @@ -310,7 +318,9 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, (vsi_ssize_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 1 ); @@ -369,6 +379,14 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) uint32_t i = 0; vsi_bool need_explicit_padding = FALSE; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + 
VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + _build_vx_conv2d_param( &vxparam, 1, vsi_nn_kernel_param_get_int32(params, "stride"), @@ -387,7 +405,9 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { vsi_size_t new_w_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; uint32_t new_w_rank = 4; @@ -486,6 +506,14 @@ REGISTER_CONV_OPENVX_KERNEL( conv2d ) vx_node node = NULL; vx_nn_convolution_params_ext2_t vxparam; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + _build_vx_conv2d_param( &vxparam, vsi_nn_kernel_param_get_int32(params, "stride_h"), @@ -518,6 +546,14 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv2d ) vx_node node = NULL; vx_nn_convolution_params_ext2_t vxparam; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + _build_vx_conv2d_param( &vxparam, vsi_nn_kernel_param_get_int32(params, "stride_h"), @@ -552,6 +588,14 @@ REGISTER_CONV_OPENVX_KERNEL( deconvolution1d ) vx_tensor temp_tensors[2] = { NULL }; int i; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + _build_vx_deconv2d_param( &vxparam, 1, vsi_nn_kernel_param_get_int32(params, "stride"), @@ -595,6 +639,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d ) vx_node node = NULL; #if VX_CONV_3D_API_SUPPORT vx_nn_convolution_3d_params_t vxparam; + memset(&vxparam, 0, sizeof(vxparam)); _build_vx_conv3d_param( @@ -625,14 +670,23 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d ) outputs[0]->t ); #endif + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); return (vsi_nn_kernel_node_t)node; } /* depthwise_conv2d*/ REGISTER_CONV_OPENVX_KERNEL( deconv3d ) { vx_node node = NULL; + #if VX_DECONV_3D_API_SUPPORT vx_nn_deconvolution_3d_params_t vxparam; + memset(&vxparam, 0, sizeof(vxparam)); _build_vx_deconv3d_param( @@ -648,7 +702,7 @@ REGISTER_CONV_OPENVX_KERNEL( deconv3d ) vsi_nn_kernel_param_get_int32(params, "pad_right"), vsi_nn_kernel_param_get_int32(params, "outpadding_w"), vsi_nn_kernel_param_get_int32(params, "outpadding_h"), - vsi_nn_kernel_param_get_int32(params, "outpadding_w"), + vsi_nn_kernel_param_get_int32(params, "outpadding_d"), vsi_nn_kernel_param_get_int32(params, "group"), vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), @@ -662,7 +716,14 @@ REGISTER_CONV_OPENVX_KERNEL( deconv3d ) outputs[0]->t ); #endif + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); return (vsi_nn_kernel_node_t)node; } /* deconv3d */ 
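/* The VSI_UNREFERENCED() calls added throughout these setup functions keep the fixed
 * kernel-setup signature warning-free when a feature macro (e.g. VX_CONV_3D_API_SUPPORT
 * or VX_DECONV_3D_API_SUPPORT) compiles the body out and the parameters go unused.
 * A minimal sketch of the idiom, assuming the macro reduces to a plain void cast; the
 * function and EXAMPLE_FEATURE_SUPPORTED below are illustrative names, not part of the
 * patch, and the real macro definition lives in the ovxlib headers. */

#include <stddef.h>

#ifndef VSI_UNREFERENCED
#define VSI_UNREFERENCED(param) ((void)(param))  /* no runtime effect; silences -Wunused-parameter */
#endif

static void *example_setup(void *graph, void **inputs, size_t input_num)
{
    void *node = NULL;
#if defined(EXAMPLE_FEATURE_SUPPORTED)
    node = inputs[0];            /* the real node construction would happen here */
#endif
    /* parameters may be unused when the feature is compiled out, so "touch" them all */
    VSI_UNREFERENCED(graph);
    VSI_UNREFERENCED(inputs);
    VSI_UNREFERENCED(input_num);
    return node;
}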
-#undef REGISTER_CONV_OPENVX_KERNEL \ No newline at end of file +#undef REGISTER_CONV_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index 9e299da26..09514d316 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -106,6 +106,10 @@ static vsi_nn_kernel_node_t _setup goto final; } + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + final: if (lut1) { @@ -120,6 +124,14 @@ static vsi_nn_kernel_node_t _setup return (vsi_nn_kernel_node_t)node; #else + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(lut_type); return NULL; #endif } /* _setup() */ @@ -190,6 +202,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( abs ) vx_tensor input = NULL, input0 = NULL; vx_tensor output = NULL, output0 = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + if (inputs[0]->attr.dim_num > 4) { input_size[0] = vsi_nn_GetElementNum(inputs[0]) / @@ -231,6 +248,10 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( linear ) float a_v = vsi_nn_kernel_param_get_float32( params, "a_v" ); float b_v = vsi_nn_kernel_param_get_float32( params, "b_v" ); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -247,6 +268,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sigmoid ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -265,6 +291,10 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( tanh ) float scale_a = vsi_nn_kernel_param_get_float32( params, "scale_a" ); float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b" ); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -281,6 +311,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu1 ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -297,6 +332,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu6 ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -313,6 +353,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( rsqrt ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -329,6 +374,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sqrt ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -345,6 +395,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( softrelu ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, diff --git 
a/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c index 3c9947d40..d81a55563 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c @@ -57,6 +57,12 @@ REGISTER_ELTWISE_OPENVX_KERNEL( add ) { vx_node node = vxTensorAddNode( graph->g, inputs[0]->t, inputs[1]->t, VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + return (vsi_nn_kernel_node_t)node; } /* add() */ @@ -65,6 +71,11 @@ REGISTER_ELTWISE_OPENVX_KERNEL( sub ) vx_node node = vxTensorSubtractNode( graph->g, inputs[0]->t, inputs[1]->t, VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + return (vsi_nn_kernel_node_t)node; } /* sub() */ @@ -75,6 +86,10 @@ REGISTER_ELTWISE_OPENVX_KERNEL( div ) vx_scalar scale_s = NULL; vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + scale = vsi_nn_kernel_param_get_float32(params, "scale"); overflow_policy = vsi_nn_kernel_param_get_int32(params, "overflow_policy"); rounding_policy = vsi_nn_kernel_param_get_int32(params, "rounding_policy"); @@ -105,6 +120,10 @@ REGISTER_ELTWISE_OPENVX_KERNEL( mul ) vx_scalar scale_s = NULL; vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + scale = vsi_nn_kernel_param_get_float32(params, "scale"); overflow_policy = vsi_nn_kernel_param_get_int32(params, "overflow_policy"); rounding_policy = vsi_nn_kernel_param_get_int32(params, "rounding_policy"); diff --git a/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c b/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c index 5133dabc4..af68dd210 100644 --- a/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c @@ -116,6 +116,10 @@ REGISTER_L2_NORMALIZE_OPENVX_KERNEL( l2_norm ) if (vx_output) vxReleaseTensor(&vx_output); #endif + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + if( NULL == node ) { VSILOGE("Call vxSoftmaxLayer2 fail.(softmax)"); diff --git a/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c b/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c index 3f5bfa1f4..5279543dc 100644 --- a/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c @@ -63,6 +63,10 @@ REGISTER_BATCH_GEMM_OPENVX_KERNEL( matrixmul ) vx_scalar trans_a = vxCreateScalar(graph->ctx->c, VX_TYPE_BOOL, &transposeA); vx_scalar trans_b = vxCreateScalar(graph->ctx->c, VX_TYPE_BOOL, &transposeB); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxBatchGemmNode(graph->g, inputs[0]->t, inputs[1]->t, diff --git a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c index a458e3800..c9a2c845c 100644 --- a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c @@ -30,6 +30,7 @@ #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_error.h" #define REGISTER_PAD2_OPENVX_KERNEL( kernel_name ) \ static vsi_nn_kernel_node_t _##kernel_name##setup \ @@ -68,6 +69,10 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) vsi_bool release_intermediate_tensor = TRUE; float const_val = 
vsi_nn_kernel_param_get_float32(params, "const_val"); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + memset(¶m, 0, sizeof(param)); memset(pad_front_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); memset(pad_back_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); @@ -90,6 +95,7 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) attr.is_const = FALSE; convert_tensor = vsi_nn_CreateTensor(graph, &attr); + CHECK_PTR_FAIL_GOTO( convert_tensor, "Create tensor fail.", final ); node = vxTensorCopyNode( graph->g, @@ -105,6 +111,7 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) node = vxTensorPadNode( graph->g, convert_tensor->t, outputs[0]->t, ¶m, sizeof(param) ); +final: vxReleaseScalar( ¶m.pad_const ); if (release_intermediate_tensor) diff --git a/src/tim/vx/internal/src/kernel/vx/prelu_vx.c b/src/tim/vx/internal/src/kernel/vx/prelu_vx.c index 4728ad651..ebf381256 100644 --- a/src/tim/vx/internal/src/kernel/vx/prelu_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/prelu_vx.c @@ -95,6 +95,10 @@ REGISTER_PRELU_OPENVX_KERNEL( prelu ) vx_node node = NULL; int32_t is_per_channel_alpha = 0; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); if (!is_per_channel_alpha) diff --git a/src/tim/vx/internal/src/kernel/vx/resize_vx.c b/src/tim/vx/internal/src/kernel/vx/resize_vx.c index 3b2b16778..fdea91a43 100644 --- a/src/tim/vx/internal/src/kernel/vx/resize_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/resize_vx.c @@ -121,6 +121,9 @@ static vsi_nn_kernel_node_t _setup sizeof(param), outputs[0]->t ); #endif + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); if ( NULL == node ) { VSILOGI("Call vxTensorScaleNode fail.(resize)"); diff --git a/src/tim/vx/internal/src/kernel/vx/select_vx.c b/src/tim/vx/internal/src/kernel/vx/select_vx.c new file mode 100644 index 000000000..d50a99504 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/select_vx.c @@ -0,0 +1,86 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_TENSOR_SELECT_VX_SUPPORT) + +#define REGISTER_SELECTOPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_SELECTOPENVX_KERNEL( select ) +{ + vx_node node = NULL; + vx_tensor input_list[3] = {NULL}; + uint32_t i = 0; + uint32_t input_count = (uint32_t)input_num; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + + for ( i = 0; i < input_count; i++ ) + { + input_list[i] = inputs[i]->t; + } + + node = vxTensorSelectLayer( + graph->g, + input_list, + input_count, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* select() */ + +#undef REGISTER_SELECTOPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/softmax_vx.c b/src/tim/vx/internal/src/kernel/vx/softmax_vx.c index f097fbbb9..1d1d445e5 100644 --- a/src/tim/vx/internal/src/kernel/vx/softmax_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/softmax_vx.c @@ -59,10 +59,12 @@ REGISTER_SOFTMAX_OPENVX_KERNEL( softmax ) vx_node node = NULL; float beta = vsi_nn_kernel_param_get_float32(params, "beta"); vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; +#if !VX_STREAM_PROCESSOR_SUPPORT vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; uint32_t rank_in = 0; - int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); int32_t new_axis = 0; +#endif + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); size_t size = sizeof(vx_nn_softmax_params_t); #ifdef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT vx_nn_softmax_params_ext_t paramExt; @@ -78,6 +80,17 @@ REGISTER_SOFTMAX_OPENVX_KERNEL( softmax ) base.beta = beta; #endif + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + +#if VX_STREAM_PROCESSOR_SUPPORT + node = vxSoftmaxLayer2( graph->g, + inputs[0]->t, + param, + size, + outputs[0]->t); +#else vsi_nn_kernel_optimize_softmax_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, shapes[0], &rank_in, &new_axis); @@ -108,13 +121,14 @@ REGISTER_SOFTMAX_OPENVX_KERNEL( softmax ) param, size, reshape_tensors[1]->t); +#endif if( NULL == node ) { VSILOGE("Call vxSoftmaxLayer2 fail.(softmax)"); } - vsi_nn_ReleaseTensor( &reshape_tensors[0] ); - vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); return (vsi_nn_kernel_node_t)node; } /* softmax() */ diff --git a/src/tim/vx/internal/src/kernel/vx/square_vx.c b/src/tim/vx/internal/src/kernel/vx/square_vx.c index 5ae1499da..778557331 100644 --- a/src/tim/vx/internal/src/kernel/vx/square_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/square_vx.c @@ -46,6 +46,11 @@ static vsi_nn_kernel_node_t _setup { vx_node node = NULL; + 
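/* Several hunks above (vsi_nn_kernel_util.c, a_times_b_plus_c_vx.c, pad2_vx.c) add
 * CHECK_PTR_FAIL_GOTO() after each allocation and funnel failures to a single `final:`
 * cleanup label instead of the old `OnError:`. A minimal sketch of that check-and-goto
 * idiom, assuming the macro simply logs and jumps on NULL; the real definition is in
 * vsi_nn_error.h, and example_create plus the messages below are illustrative only. */

#include <stdio.h>
#include <stdlib.h>

#ifndef CHECK_PTR_FAIL_GOTO
#define CHECK_PTR_FAIL_GOTO(ptr, msg, label) \
    do { if ((ptr) == NULL) { fprintf(stderr, "%s\n", (msg)); goto label; } } while (0)
#endif

static int example_create(size_t bytes)
{
    int   status  = -1;
    void *buffer0 = NULL;
    void *buffer1 = NULL;

    buffer0 = malloc(bytes);
    CHECK_PTR_FAIL_GOTO(buffer0, "Create buffer fail.", final);
    buffer1 = malloc(bytes);
    CHECK_PTR_FAIL_GOTO(buffer1, "Create buffer fail.", final);

    status = 0;                  /* both allocations succeeded */
final:
    free(buffer1);               /* free(NULL) is a no-op, so one cleanup path suffices */
    free(buffer0);
    return status;
}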
VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, diff --git a/src/tim/vx/internal/src/kernel/vx/swish_vx.c b/src/tim/vx/internal/src/kernel/vx/swish_vx.c index 7557d9b11..9b458c62d 100644 --- a/src/tim/vx/internal/src/kernel/vx/swish_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/swish_vx.c @@ -62,6 +62,10 @@ REGISTER_SWISH_OPENVX_KERNEL( swish ) vx_enum function = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SWISH; float beta = 1.0f; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) { swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type"); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl index 49d04e2d4..755c809e3 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl @@ -15,6 +15,8 @@ __kernel void gather_U8toU8( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; uint4 data = read_imageui(input0, coord_in.zw); @@ -40,6 +42,8 @@ __kernel void gather_F16toF16( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; float4 data = read_imagef(input0, coord_in.zw); @@ -65,6 +69,8 @@ __kernel void gather_I32toI32( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; int4 data = read_imagei(input0, coord_in.zw); @@ -90,6 +96,8 @@ __kernel void gather_F32toF32( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; float4 data = read_imagef(input0, coord_in.zw); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl index 15a466443..574dd6b3f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl @@ -15,6 +15,7 @@ __kernel void gather_array_U8toU8( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 1); @@ -43,6 +44,7 @@ __kernel void gather_array_F16toF16( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 2); @@ -71,6 +73,7 @@ __kernel void gather_array_I32toI32( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 4); @@ -99,6 +102,7 @@ __kernel void gather_array_F32toF32( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? 
indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 4); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl index 4ff6ec158..bfc88d0ed 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl @@ -20,6 +20,7 @@ __kernel void gather_batch_U8toU8( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; uint4 data = read_imageui(input0, coord_in); @@ -51,6 +52,7 @@ __kernel void gather_batch_F16toF16( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; float4 data = read_imagef(input0, coord_in); @@ -82,6 +84,7 @@ __kernel void gather_batch_I32toI32( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; int4 data = read_imagei(input0, coord_in); @@ -113,6 +116,7 @@ __kernel void gather_batch_F32toF32( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; float4 data = read_imagef(input0, coord_in); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl index 323f69417..58403f9a3 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl @@ -1,3 +1,11 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +_viv_uniform uint width0; +_viv_uniform uint height0; +_viv_uniform uint width1; +_viv_uniform uint height1; +_viv_uniform uint width_out; +_viv_uniform uint height_out; #define GATHER_ELEMENTS_AXIS0_2D(name, data_type, read_func, write_func, conv_func) \ __kernel void gather_elements_axis0_##name##_I32to##name##_2D \ @@ -133,3 +141,159 @@ __kernel void gather_elements_axis2_##name##_I32to##name \ GATHER_ELEMENTS_AXIS2(F32, float4, read_imagef, write_imagef, convert_float4) GATHER_ELEMENTS_AXIS2(I32, int4, read_imagei, write_imagei, convert_int4_rte) GATHER_ELEMENTS_AXIS2(U32, uint4, read_imageui, write_imageui, convert_uint4_rte) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + float input_scale, \ + float input_tail, \ + int axis_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * 
width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F32, float, float*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I32, int, int*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + float input_scale, \ + float input_tail, \ + int axis_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F32, float, float*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I32, int, int*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + float input_scale, \ + float input_tail, \ + int axis_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F32, float, float*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I32, int, int*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1) + + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \ +__kernel void 
gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \ + ( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + float input_scale, \ + float input_tail, \ + int axis_size \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + Image index_img = create_image_from_image2d(input1, 4); \ + int* index_ptr = (int*)index_img.ptr; \ + int index = index_ptr[coord.x + coord.y * width1]; \ + \ + Image input_img = create_image_from_image2d(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \ + data_type data = input_ptr[index + coord.y * width0]; \ + \ + Image output_img = create_image_from_image2d(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \ + output_ptr[coord.x + coord.y * width_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F32, float, float*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I32, int, int*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \ + ( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + float input_scale, \ + float input_tail, \ + int axis_size \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + Image index_img = create_image_from_image2d(input1, 4); \ + int* index_ptr = (int*)index_img.ptr; \ + int index = index_ptr[coord.x + coord.y * width1]; \ + \ + Image input_img = create_image_from_image2d(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \ + data_type data = input_ptr[coord.x + index * width0]; \ + \ + Image output_img = create_image_from_image2d(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \ + output_ptr[coord.x + coord.y * width_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F32, float, float*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I32, int, int*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl index 02e430922..1cf59759f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl @@ -1,124 +1,133 @@ __kernel void gather_nd_batch_U8toU8_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(gidx, gidy, 0, 0); - int4 indice = read_imagei(input1, coord.wy); - coord.z = indice.x * block_size + gidx; + int4 coord = (int4)(gidx, gidy, gidz, 0); + int4 indice = read_imagei(input1, coord.wyzw); + int2 coord0 = (int2)(indice.x * 
block_size + gidx, gidz); - uint4 data = read_imageui(input0, coord.zy); - write_imageui(output, coord.xy, data); + uint4 data = read_imageui(input0, coord0); + write_imageui(output, coord, data); } __kernel void gather_nd_batch_F16toF16_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(gidx, gidy, 0, 0); - int4 indice = read_imagei(input1, coord.wy); - coord.z = indice.x * block_size + gidx; + int4 coord = (int4)(gidx, gidy, gidz, 0); + int4 indice = read_imagei(input1, coord.wyzw); + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); - float4 data = read_imagef(input0, coord.zy); - write_imagef(output, coord.xy, data); + float4 data = read_imagef(input0, coord0); + write_imagef(output, coord, data); } __kernel void gather_nd_batch_I8toI8_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(gidx, gidy, 0, 0); - int4 indice = read_imagei(input1, coord.wy); - coord.z = indice.x * block_size + gidx; + int4 coord = (int4)(gidx, gidy, gidz, 0); + int4 indice = read_imagei(input1, coord.wyzw); + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); - int4 data = read_imagei(input0, coord.zy); - write_imagei(output, coord.xy, data); + int4 data = read_imagei(input0, coord0); + write_imagei(output, coord, data); } //2D __kernel void gather_nd_batch_U8toU8_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(0, gidy, gidx, 1); - int4 indice = read_imagei(input1, coord.xy); - int4 indice1 = read_imagei(input1, coord.wy); + int4 coord = (int4)(1, gidy, gidz, 0); + int4 indice = read_imagei(input1, coord.wyzw); + int4 indice1 = read_imagei(input1, coord.xyzw); indice.x = indice.x * block_size + gidx; indice.y = indice1.x; - indice.zw = coord.yx; + indice.zw = coord.zw; uint4 data = read_imageui(input0, indice); - write_imageui(output, coord.zy, data); + coord.x = gidx; + write_imageui(output, coord, data); } __kernel void gather_nd_batch_F16toF16_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(0, gidy, gidx, 1); - int4 indice = read_imagei(input1, coord.xy); - int4 indice1 = read_imagei(input1, coord.wy); + int4 coord = (int4)(1, gidy, gidz, 0); + int4 indice = read_imagei(input1, 
coord.wyzw); + int4 indice1 = read_imagei(input1, coord.xyzw); indice.x = indice.x * block_size + gidx; indice.y = indice1.x; - indice.zw = coord.yx; + indice.zw = coord.zw; float4 data = read_imagef(input0, indice); - write_imagef(output, coord.zy, data); + coord.x = gidx; + write_imagef(output, coord, data); } __kernel void gather_nd_batch_I8toI8_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(0, gidy, gidx, 1); - int4 indice = read_imagei(input1, coord.xy); - int4 indice1 = read_imagei(input1, coord.wy); + int4 coord = (int4)(1, gidy, gidz, 0); + int4 indice = read_imagei(input1, coord.wyzw); + int4 indice1 = read_imagei(input1, coord.xyzw); indice.x = indice.x * block_size + gidx; indice.y = indice1.x; indice.y = indice1.x; - indice.zw = coord.yx; + indice.zw = coord.zw; int4 data = read_imagei(input0, indice); - write_imagei(output, coord.zy, data); + coord.x = gidx; + write_imagei(output, coord, data); } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_cross.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_cross.cl new file mode 100644 index 000000000..e36f10353 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_cross.cl @@ -0,0 +1,95 @@ +__kernel void gemm_F32F32toF32_merge( + __read_only image2d_array_t inputA, + __read_only image2d_array_t inputB, + __write_only image2d_array_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out, + int outer) +{ + for(int i = 0; i < outer; i++) + { + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); + + float4 sum = (float4)(0); + + for(; coord_a.x < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord_a); + tempB0 = read_imagef(inputB, coord_b); + coord_a.x++; + coord_b.y++; + + sum = sum + tempA0 * tempB0; + } + + coord_b.y = get_global_id(1); + coord_b.z = get_global_id(2) + i * get_global_size(2); + write_imagef(output, coord_b, sum); + } +} + +#define GEMM_MERGE(name, dst_type, read_image_type, convert_type, write_image_type) \ +__kernel void gemm_##name##_merge( \ + __read_only image2d_array_t inputA, \ + __read_only image2d_array_t inputB, \ + __write_only image2d_array_t output, \ + int M, \ + int K, \ + int N, \ + int ac2zero, \ + int bc2zero, \ + float scale_a, \ + float zp_a, \ + float scale_b, \ + float zp_b, \ + float scale_out, \ + float zp_out, \ + int outer) \ +{ \ + for(int i = 0; i < outer; i++) \ + { \ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0); \ + float4 sum = (float4)(0); \ + dst_type dst; \ + \ + for(; coord_a.x < K;) \ + { \ + float4 tempA0; \ + float4 tempB0; \ + \ + tempA0 = convert_float4(read_image_type(inputA, coord_a)); \ + tempB0 = convert_float4(read_image_type(inputB, coord_b)); \ + tempA0.x = (tempA0.x - zp_a) * scale_a; \ + tempB0.x = (tempB0.x - zp_b) * scale_b; \ + \ + coord_a.x++; \ + coord_b.y++; \ + \ + sum = sum + tempA0 * tempB0; \ + } \ + sum.x = sum.x * scale_out + zp_out; \ + dst = convert_type(sum); \ + \ + coord_b.y = get_global_id(1); \ + coord_b.z = get_global_id(2) + i * get_global_size(2); \ + write_image_type(output, coord_b, dst); \ + } \ +} +GEMM_MERGE(I8I8toI8,int4,read_imagei,convert_int4,write_imagei); +GEMM_MERGE(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui); +GEMM_MERGE(U8U8toF32,float4,read_imageui,convert_float4,write_imagef); + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/nearest_grid_sample.cl b/src/tim/vx/internal/src/libnnext/ops/cl/nearest_grid_sample.cl new file mode 100644 index 000000000..e427fe414 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/nearest_grid_sample.cl @@ -0,0 +1,77 @@ +__kernel void nearest_grid_sample_F32_F32toF32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float half_input0_w, + float half_input0_h, + float add_float_value_w, + float add_float_value_h, + int depth + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1)); + + float fx = read_imagef(input1, coord_in1).x; + coord_in1.x = coord_in1.x + 1; + float fy = read_imagef(input1, coord_in1).x; + + fx = fx * half_input0_w + add_float_value_w; + fy = fy * half_input0_h + add_float_value_h; + int x_index = convert_int(fx); + int y_index = convert_int(fy); + int4 coord_in = (int4)(x_index, y_index, 0, 0); + + float4 dst; + + while (coord_in.z < depth){ + dst = read_imagef(input0, coord_in); + write_imagef(output, coord_out, dst); + coord_in.z++; + coord_out.z++; + } +} + + +__kernel void nearest_grid_sample_U8_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float half_input0_w, + float half_input0_h, + float add_float_value_w, + float add_float_value_h, + int depth, + float in0_scale, + float in0_tail, + float in1_scale, + float in1_tail, + float out_scale, + float out_tail + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1)); + + float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail; + coord_in1.x = coord_in1.x + 1; + float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail; + + fx = fx * half_input0_w + add_float_value_w; + fy = fy * half_input0_h + add_float_value_h; + int x_index = convert_int(fx); + int y_index = convert_int(fy); + int4 coord_in = (int4)(x_index, y_index, 0, 0); + + float4 val; + uint4 dst; + + while (coord_in.z < depth){ + val = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + dst = convert_uint4_rte(val * out_scale + out_tail); + write_imageui(output, coord_out, dst); + coord_in.z++; + coord_out.z++; + } + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_bilinear.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_bilinear.cl new file mode 100644 index 000000000..f835db5e5 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_bilinear.cl @@ -0,0 +1,161 @@ +#pragma OPENCL EXTENSION CL_VIV_asm : enable + +#define RESIZE_3D(in_name, out_name, read_image_type, dst_type, convert_type, write_image_type) \ +__kernel void resize_3d_bilinear_##in_name##to##out_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float scale_x, \ + float scale_y, \ + float scale_z, \ + float half_pixel_value, \ + uint in_width, \ + uint in_height, \ + uint in_depth, \ + float in_scale, \ + float in_tail, \ + float out_scale, \ + float out_tail \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; \ + float left_x_f = fmax(floor(in_x), 0); \ + float x_lerp = in_x - left_x_f; \ + int left_x_idx = convert_int(left_x_f); \ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; \ + float top_y_f = fmax(floor(in_y), 0); \ + float y_lerp = in_y - top_y_f; \ + int top_y_idx = convert_int(top_y_f); \ + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value; \ + float front_z_f = fmax(floor(in_z), 0); \ + float z_lerp = in_z - front_z_f; \ + int front_z_idx = convert_int(front_z_f); \ + int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0); \ + float4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111; \ + dst_type dst; \ + \ + int dx, dy, dz; \ + dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0); \ + dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0); \ + dz = in_z < 0 ? 0 : (front_z_idx < in_depth - 1 ? 1 : 0); \ + \ + data_000 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.y = coord_in.y + dy; \ + data_010 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.x = coord_in.x + dx; \ + data_110 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.y = coord_in.y - dy; \ + data_100 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.z = coord_in.z + dz; \ + data_101 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.y = coord_in.y + dy; \ + data_111 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.x = coord_in.x - dx; \ + data_011 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.y = coord_in.y - dy; \ + data_001 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + \ + data_000 = data_000 + (data_100 - data_000) * x_lerp; \ + data_010 = data_010 + (data_110 - data_010) * x_lerp; \ + data_000 = data_000 + (data_010 - data_000) * y_lerp; \ + \ + data_001 = data_001 + (data_101 - data_001) * x_lerp; \ + data_011 = data_011 + (data_111 - data_011) * x_lerp; \ + data_001 = data_001 + (data_011 - data_001) * y_lerp; \ + data_000 = data_000 + (data_001 - data_000) * z_lerp; \ + \ + dst = convert_type(data_000 * out_scale + out_tail); \ + \ + write_image_type(output, coord_out, dst); \ +} +RESIZE_3D(F32, F32, read_imagef, float4, convert_float4, write_imagef) +RESIZE_3D(F32, U8, read_imagef, uint4, convert_uint4, write_imageui) +RESIZE_3D(U8, F32, read_imageui, float4, convert_float4, write_imagef) +RESIZE_3D(U8, U8, read_imageui, uint4, convert_uint4, write_imageui) +RESIZE_3D(I8, I8, read_imagei, int4, convert_int4, write_imagei) + +__kernel void 
resize_3d_bilinear_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + uint in_width, + uint in_height, + uint in_depth, + float in_scale, + float in_tail, + float out_scale, + float out_tail + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; + float left_x_f = fmax(floor(in_x), 0); + float x_lerp = in_x - left_x_f; + int left_x_idx = convert_int(left_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; + float top_y_f = fmax(floor(in_y), 0); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value; + float front_z_f = fmax(floor(in_z), 0); + float z_lerp = in_z - front_z_f; + int front_z_idx = convert_int(front_z_f); + int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0); + uint4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111; + float4 data_000_f, data_100_f, data_010_f, data_110_f, data_001_f, data_011_f, data_101_f, data_111_f; + uint4 dst; + + int dx, dy, dz; + dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0); + dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0); + dz = in_z < 0 ? 0 : (front_z_idx < in_depth - 1 ? 1 : 0); + + data_000 = read_imageui(input, coord_in); + data_000 = data_000 << 16; + coord_in.y = coord_in.y + dy; + data_010 = read_imageui(input, coord_in); + data_010 = data_010 << 16; + coord_in.x = coord_in.x + dx; + data_110 = read_imageui(input, coord_in); + data_110 = data_110 << 16; + coord_in.y = coord_in.y - dy; + data_100 = read_imageui(input, coord_in); + data_100 = data_100 << 16; + coord_in.z = coord_in.z + dz; + data_101 = read_imageui(input, coord_in); + data_101 = data_101 << 16; + coord_in.y = coord_in.y + dy; + data_111 = read_imageui(input, coord_in); + data_111 = data_111 << 16; + coord_in.x = coord_in.x - dx; + data_011 = read_imageui(input, coord_in); + data_011 = data_011 << 16; + coord_in.y = coord_in.y - dy; + data_001 = read_imageui(input, coord_in); + data_001 = data_001 << 16; + + _viv_asm(COPY, data_000_f, data_000, 16); + _viv_asm(COPY, data_010_f, data_010, 16); + _viv_asm(COPY, data_110_f, data_110, 16); + _viv_asm(COPY, data_100_f, data_100, 16); + _viv_asm(COPY, data_101_f, data_101, 16); + _viv_asm(COPY, data_111_f, data_111, 16); + _viv_asm(COPY, data_011_f, data_011, 16); + _viv_asm(COPY, data_001_f, data_001, 16); + + data_000_f = data_000_f + (data_100_f - data_000_f) * x_lerp; + data_010_f = data_010_f + (data_110_f - data_010_f) * x_lerp; + data_000_f = data_000_f + (data_010_f - data_000_f) * y_lerp; + + data_001_f = data_001_f + (data_101_f - data_001_f) * x_lerp; + data_011_f = data_011_f + (data_111_f - data_011_f) * x_lerp; + data_001_f = data_001_f + (data_011_f - data_001_f) * y_lerp; + data_000_f = data_000_f + (data_001_f - data_000_f) * z_lerp; + + _viv_asm(COPY, dst, data_000_f, 16); + dst = dst >> 16; + write_imageui(output, coord_out, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_nearest.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_nearest.cl new file mode 100644 index 000000000..220acd351 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_nearest.cl @@ -0,0 +1,119 @@ + +#define NEAREST_INDEX_PROCESS() \ + int4 coord_out = 
(int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x + round_value; \ + int in_x_idx = convert_int(in_x); \ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y + round_value; \ + int in_y_idx = convert_int(in_y); \ + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z + round_value; \ + int in_z_idx = convert_int(in_z); \ + +__kernel void resize_3d_nearest_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + float4 dst; + dst = read_imagef(input, coord_in); + write_imagef(output, coord_out, dst); +} + + +__kernel void resize_3d_nearest_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + uint4 dst; + dst = convert_uint4(convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail); + write_imageui(output, coord_out, dst); +} + +__kernel void resize_3d_nearest_U8toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + float4 dst; + dst = convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail; + write_imagef(output, coord_out, dst); +} + +__kernel void resize_3d_nearest_F32toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + uint4 dst; + dst = convert_uint4(read_imagef(input, coord_in) * output_scale + output_tail); + write_imageui(output, coord_out, dst); +} + +__kernel void resize_3d_nearest_I8toI8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + int4 dst; + dst = convert_int4(convert_float4(read_imagei(input, coord_in)) * output_scale); + write_imagei(output, coord_out, dst); +} + +__kernel void resize_3d_nearest_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + uint4 dst; + dst = read_imageui(input, coord_in); + write_imageui(output, coord_out, dst); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl b/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl index 117d6d25e..87a9df7d2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl +++ 
b/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl @@ -1,5 +1,5 @@ -#define TILE_3D(name0, name1, data_type, read_image_func, write_image_func) \ +#define TILE_3D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \ __kernel void tile_##name0##to##name1 \ ( \ __read_only image2d_array_t input, \ @@ -10,7 +10,9 @@ __kernel void tile_##name0##to##name1 \ int multiples_0, \ int multiples_1, \ int multiples_2, \ - int multiples_3 \ + int multiples_3, \ + float inoutscale, \ + float inouttail \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -18,7 +20,9 @@ __kernel void tile_##name0##to##name1 \ int width = get_image_width(input); \ int height = get_image_height(input); \ \ - data_type src; \ + src_type src; \ + dst_type dst; \ + \ read_image_func(src, input, coord); \ \ int batch_id = (short)coord.z / (short)depthIn; \ @@ -40,17 +44,19 @@ __kernel void tile_##name0##to##name1 \ for (int x = 0; x < multiples_0; x++) \ { \ coord_out.x = coord.x + x * width; \ - write_image_func(output, coord_out.xyzw, src); \ + dst = conv_type(convert_float4(src) * inoutscale + inouttail); \ + write_image_func(output, coord_out.xyzw, dst); \ } \ } \ } \ } \ } -TILE_3D(I32, I32, int4, READ_IMAGEI_2DARRAY, write_imagei) -TILE_3D(U32, U32, uint4, READ_IMAGEUI_2DARRAY, write_imageui) -TILE_3D(F32, F32, float4, READ_IMAGEF_2DARRAY, write_imagef) +TILE_3D(I32, I32, int4, int4, convert_int4_rte, READ_IMAGEI_2DARRAY, write_imagei) +TILE_3D(U32, U32, uint4, uint4, convert_uint4_rte, READ_IMAGEUI_2DARRAY, write_imageui) +TILE_3D(F32, F32, float4, float4,convert_float4_rte,READ_IMAGEF_2DARRAY, write_imagef) +TILE_3D(F32, U32, float4, uint4, convert_uint4_rte, READ_IMAGEF_2DARRAY, write_imageui) -#define TILE_2D(name0, name1, data_type, read_image_func, write_image_func) \ +#define TILE_2D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \ __kernel void tile_##name0##to##name1##_2D \ ( \ __read_only image2d_t input, \ @@ -61,7 +67,9 @@ __kernel void tile_##name0##to##name1##_2D \ int multiples_0, \ int multiples_1, \ int multiples_2, \ - int multiples_3 \ + int multiples_3, \ + float inoutscale, \ + float inouttail \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -70,22 +78,25 @@ __kernel void tile_##name0##to##name1##_2D \ int output_width = get_image_width(output); \ int output_height = get_image_height(output); \ \ - data_type src = read_image_func(input, coord); \ + src_type src = read_image_func(input, coord); \ + dst_type dst; \ \ do \ { \ do \ { \ - write_image_func(output, coord, src); \ + dst = conv_type(convert_float4(src) * inoutscale + inouttail); \ + write_image_func(output, coord, dst); \ coord.x += width; \ } while (coord.x < output_width); \ coord.x = get_global_id(0); \ coord.y += height; \ } while (coord.y < output_height); \ } -TILE_2D(I32, I32, int4, read_imagei, write_imagei) -TILE_2D(U32, U32, uint4, read_imageui, write_imageui) -TILE_2D(F32, F32, float4, read_imagef, write_imagef) +TILE_2D(I32, I32, int4, int4, convert_int4_rte, read_imagei, write_imagei) +TILE_2D(U32, U32, uint4, uint4, convert_uint4_rte, read_imageui, write_imageui) +TILE_2D(F32, F32, float4, float4,convert_float4_rte,read_imagef, write_imagef) +TILE_2D(F32, U32, float4, uint4, convert_uint4_rte, read_imagef, write_imageui) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis0.vx new file mode 100644 index 000000000..a20f024a3 
--- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis0.vx @@ -0,0 +1,191 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8; + +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4; + + +_viv_uniform int width; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float output_zp; + +__kernel void cumsum_ex_rev_F16toF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + if(exclusive == 0 && rev) + { + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniSumHorzRevF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev == 0) + { + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + for(; coord.x < width - 8;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x = coord.x + 1; + coord.x += 8; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev) + { + coord.x = width - 8; + coord_out.x = width - 1; + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, 
coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + for(; coord.x > 0;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x = coord.x - 1; + coord.x -= 8; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniSumHorzRevF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } +} + +#define CUMSUM_QINT_EX_REV_AXIS0(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \ + short zp = (short)input_zp; \ + \ + if(exclusive == 0 && rev) \ + { \ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + for(coord.x = -1; coord.x < width - 8;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.x = coord.x + 1; \ + coord.x += 8; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev) \ + { \ + for(coord.x = width - 7; coord.x > 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.x = coord.x - 1; \ + coord.x -= 8; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +CUMSUM_QINT_EX_REV_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_QINT_EX_REV_AXIS0(I8, I8, vxc_char16, vxc_char16) +CUMSUM_QINT_EX_REV_AXIS0(I16, I16, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis1.vx new file mode 100644 index 000000000..631964c5f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis1.vx @@ -0,0 +1,255 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int height; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +__kernel void cumsum_ex_rev_F16toF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + if(exclusive == 0 && rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev == 0) + { + dst ^= dst; + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + for(; coord.y < height - 1;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, data, src, 16); + + 
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev) + { + dst ^= dst; + coord.y = height - 1; + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + for(; coord.y > 0;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y--; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } +} + +#define CUMSUM_8BITS_EX_REV_AXIS1(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + if(exclusive == 0 && rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.y < height - 1;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + 
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev) \ + { \ + coord.y = height - 1; \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.y > 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \ + coord.y--; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +CUMSUM_8BITS_EX_REV_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_EX_REV_AXIS1(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_ex_rev_I16toI16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + if(exclusive == 0 && rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev == 0) + { + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + for(; coord.y < height - 1;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev) + { + coord.y = height - 1; + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + for(; coord.y > 0;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; + coord.y--; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis2.vx new file mode 100644 index 000000000..e8a8d2790 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis2.vx @@ -0,0 +1,252 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits 
uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int channel; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +__kernel void cumsum_ex_rev_F16toF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + if(rev && exclusive == 0) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(rev == 0 && exclusive) + { + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + for(; coord.z < channel - 1;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(rev && exclusive) + { + _viv_asm(COPY, dst, sum, 16); + coord.z = channel - 1; + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + for(; coord.z > 0;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z--; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } +} + +#define CUMSUM_8BITS_EX_REV_AXIS2(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + if(rev && exclusive == 0) \ + { \ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = 
convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ + uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.z < channel - 1;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(rev && exclusive) \ + { \ + coord.z = channel - 1; \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.z > 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \ + coord.z--; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * 
in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +CUMSUM_8BITS_EX_REV_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_EX_REV_AXIS2(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_ex_rev_I16toI16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + if(exclusive == 0 && rev) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev == 0) + { + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + for(; coord.z < channel - 1;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev) + { + coord.z = channel - 1; + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + for(; coord.z > 0;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; + coord.z--; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx index b9f4e1754..60159d98a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx @@ -176,3 +176,135 @@ __kernel void cumsum_F16to##out_name##_axis0_2D( \ CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16) CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8) CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16) + +#define CUMSUM_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_F16to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + if(exclusive == 0 && rev) \ + { \ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.z < channel - 1;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + coord.z = channel - 1; \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.z > 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z--; \ + _viv_asm(COPY, 
data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +CUMSUM_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16) + +#define CUMSUM_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_F16to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + if(exclusive == 0 && rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.y < height - 1;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + coord.y = height - 1; \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.y > 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y--; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +CUMSUM_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb.vx new file mode 100644 index 000000000..2088285dd --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb.vx @@ -0,0 +1,316 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float4 matrix0; +_viv_uniform float2 matrix1; +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + vxc_uchar16 dst; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = coord_in.x + 1; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = coord_in.x + 1; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.z = coord_in.z + 1; + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + coord_in.z = coord_in.z + 1; + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_bilinear_U8toU8_rgb_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + vxc_uchar16 src0, src1, src_0, src_1, dst; + VXC_ReadImage(src_0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src_1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; + +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, 
src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src_0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src_1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 dst; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, 
VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_bilinear_U8toU8_rgb +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 src0, src1, src_0, src_1, dst; + VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; + +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, 
coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx index 3a1661e85..73171a8b0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx @@ -18,6 +18,7 @@ __kernel void gather_I8toI8( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; vxc_char16 src; @@ -42,6 +43,7 @@ __kernel void gather_U8toU8( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; vxc_uchar16 src; @@ -66,8 +68,8 @@ __kernel void gather_I16toI16( int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -92,6 +94,7 @@ __kernel void gather_F16toF16( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -112,6 +115,7 @@ __kernel void gather_I8toI8_axis0( { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 indices = read_imagei(input1, coord.xx); + indices = indices >= 0 ? indices : indices + axis_num; int2 coord_in = (int2)(indices.x, get_global_id(1)); vxc_char16 src, dst; @@ -138,6 +142,7 @@ __kernel void gather_U8toU8_axis0( { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 indices = read_imagei(input1, coord.xx); + indices = indices >= 0 ? indices : indices + axis_num; int2 coord_in = (int2)(indices.x, get_global_id(1)); vxc_uchar16 src, dst; @@ -164,6 +169,7 @@ __kernel void gather_I16toI16_axis0( { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 indices = read_imagei(input1, coord.xx); + indices = indices >= 0 ? indices : indices + axis_num; int2 coord_in = (int2)(indices.x, get_global_id(1)); vxc_short8 src, dst; @@ -190,6 +196,7 @@ __kernel void gather_F16toF16_axis0( { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 indices = read_imagei(input1, coord.xx); + indices = indices >= 0 ? 
indices : indices + axis_num; int2 coord_in = (int2)(indices.x, get_global_id(1)); vxc_short8 src, dst; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx index 9ed287631..9c21fd131 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx @@ -18,6 +18,7 @@ __kernel void gather_I8toI8_array( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 1); @@ -46,6 +47,7 @@ __kernel void gather_U8toU8_array( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 1); @@ -74,8 +76,8 @@ __kernel void gather_I16toI16_array( int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 2); @@ -105,6 +107,7 @@ __kernel void gather_F16toF16_array( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 2); @@ -142,6 +145,7 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \ uchar* output_ptr = get_image_ptr_from_coord(img2, coord.xy); \ __global data_type* data_ptr = (__global data_type*)input_ptr; \ __global write_type* out_ptr = (__global write_type*)output_ptr; \ + indices = indices >= 0 ? indices : indices + axis_num; \ src.s0 = data_ptr[indices.x]; \ src.s1 = data_ptr[indices.y]; \ src.s2 = data_ptr[indices.z]; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx index 8d09d50d4..47f1db609 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx @@ -24,6 +24,7 @@ __kernel void gather_batch_I8toI8( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; vxc_char16 src; @@ -54,6 +55,7 @@ __kernel void gather_batch_U8toU8( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; vxc_uchar16 src; @@ -84,6 +86,7 @@ __kernel void gather_batch_I16toI16( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; vxc_short8 src; @@ -114,6 +117,7 @@ __kernel void gather_batch_F16toF16( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; vxc_short8 src; @@ -135,6 +139,7 @@ __kernel void gather_batch_I8toI8_axis0( { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 indices = read_imagei(input1, coord.xz); + indices = indices >= 0 ? 
indices : indices + axis_num; int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); vxc_char16 src, dst; @@ -163,6 +168,7 @@ __kernel void gather_batch_U8toU8_axis0( { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 indices = read_imagei(input1, coord.xz); + indices = indices >= 0 ? indices : indices + axis_num; int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); vxc_uchar16 src, dst; @@ -191,6 +197,7 @@ __kernel void gather_batch_I16toI16_axis0( { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 indices = read_imagei(input1, coord.xz); + indices = indices >= 0 ? indices : indices + axis_num; int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); vxc_short8 src, dst; @@ -219,6 +226,7 @@ __kernel void gather_batch_F16toF16_axis0( { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 indices = read_imagei(input1, coord.xz); + indices = indices >= 0 ? indices : indices + axis_num; int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); vxc_short8 src, dst; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx index 39a8a990d..9f962c410 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx @@ -1,6 +1,12 @@ #include "cl_viv_vx_ext.h" _viv_uniform int axis_size; +_viv_uniform uint width0; +_viv_uniform uint height0; +_viv_uniform uint width1; +_viv_uniform uint height1; +_viv_uniform uint width_out; +_viv_uniform uint height_out; #define GATHER_ELEMENTS_AXIS0_2D(name, data_type) \ __kernel void gather_elements_axis0_##name##_I32to##name##_2D \ @@ -151,3 +157,141 @@ GATHER_ELEMENTS_AXIS2(F16, vxc_short4) GATHER_ELEMENTS_AXIS2(I16, vxc_short4) GATHER_ELEMENTS_AXIS2(I8, vxc_char4) GATHER_ELEMENTS_AXIS2(U8, vxc_uchar4) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t 
input1, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1) + + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \ + ( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int axis \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + Image index_img = create_image_from_image2d(input1, 4); \ + int* index_ptr = (int*)index_img.ptr; \ + int index = index_ptr[coord.x + coord.y * width1]; \ + \ + Image input_img = create_image_from_image2d(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \ + data_type data = input_ptr[index + coord.y * width0]; \ + \ + Image output_img = create_image_from_image2d(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \ + output_ptr[coord.x + coord.y * width_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, 
uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \ + ( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int axis \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + Image index_img = create_image_from_image2d(input1, 4); \ + int* index_ptr = (int*)index_img.ptr; \ + int index = index_ptr[coord.x + coord.y * width1]; \ + \ + Image input_img = create_image_from_image2d(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \ + data_type data = input_ptr[coord.x + index * width0]; \ + \ + Image output_img = create_image_from_image2d(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \ + output_ptr[coord.x + coord.y * width_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx index e9b8fd14e..87825fd13 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx @@ -24,6 +24,7 @@ __kernel void gather_##src0_type_name##toF16( \ \ int4 coord_in = (int4)(gidy, 0, gidx, 0); \ int4 indice = read_imagei(input1, coord_in.xy); \ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \ coord_in.w = gidz * axis_num + indice.x; \ \ read_type src; \ @@ -60,6 +61,7 @@ __kernel void gather_F16to##src1_type_name( \ int4 coord_in = (int4)(gidy, 0, gidx, 0); \ \ int4 indice = read_imagei(input1, coord_in.xy); \ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \ coord_in.w = gidz * axis_num + indice.x; \ \ vxc_short8 src; \ @@ -92,6 +94,7 @@ __kernel void gather_I16toF16( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -122,6 +125,7 @@ __kernel void gather_##src0_type_name##toF16_axis0( \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ int4 indices = read_imagei(input1, coord.xx); \ + indices = indices >= 0 ? indices : indices + axis_num; \ int2 coord_in = (int2)(indices.x, get_global_id(1)); \ \ read_type src; \ @@ -153,6 +157,7 @@ __kernel void gather_F16to##src1_type_name##_axis0( \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ int4 indices = read_imagei(input1, coord.xx); \ + indices = indices >= 0 ? indices : indices + axis_num; \ int2 coord_in = (int2)(indices.x, get_global_id(1)); \ \ vxc_short8 src; \ @@ -184,6 +189,7 @@ __kernel void gather_I16toF16_axis0( { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 indices = read_imagei(input1, coord.xx); + indices = indices >= 0 ? 
indices : indices + axis_num; int2 coord_in = (int2)(indices.x, get_global_id(1)); vxc_short8 src; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx index 0e94445ca..988c81183 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx @@ -33,6 +33,7 @@ __kernel void gather_batch_##src0_type_name##toF16( \ { \ int4 indice = read_imagei(input1, coord_idx); \ coord_idx.y++; \ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \ coord_in.y = gidz * axis_num + indice.x; \ \ read_type src; \ @@ -78,6 +79,7 @@ __kernel void gather_batch_F16to##src1_type_name( \ { \ int4 indice = read_imagei(input1, coord_idx); \ coord_idx.y++; \ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \ coord_in.y = gidz * axis_num + indice.x; \ \ vxc_short8 src; \ @@ -120,6 +122,7 @@ __kernel void gather_batch_I16toF16( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; vxc_short8 src; @@ -145,6 +148,7 @@ __kernel void gather_batch_##src0_type_name##toF16_axis0( \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ int4 indices = read_imagei(input1, coord.xz); \ + indices = indices >= 0 ? indices : indices + axis_num; \ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \ \ read_type src; \ @@ -179,6 +183,7 @@ __kernel void gather_batch_F16to##src1_type_name##_axis0( \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ int4 indices = read_imagei(input1, coord.xz); \ + indices = indices >= 0 ? indices : indices + axis_num; \ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \ \ vxc_short8 src; \ @@ -213,6 +218,7 @@ __kernel void gather_batch_I16toF16_axis0( { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 indices = read_imagei(input1, coord.xz); + indices = indices >= 0 ? 
indices : indices + axis_num; int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); vxc_short8 src, dst; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx index c479a3b58..e467f252e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx @@ -2,93 +2,96 @@ __kernel void gather_nd_batch_I8toI8_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, gidy, 0, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; - - coord.z = indice.x * block_size + gidx; + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); vxc_char16 src; - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_batch_U8toU8_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, gidy, 0, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; - coord.z = indice.x * block_size + gidx; + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); vxc_uchar16 src; - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_batch_I16toI16_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, gidy, 0, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + int4 coord = 
(int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; - coord.z = indice.x * block_size + gidx; + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); vxc_short8 src; - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_batch_F16toF16_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, gidy, 0, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; - coord.z = indice.x * block_size + gidx; + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); vxc_short8 src; - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx index acc6c4cfc..58c2af349 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx @@ -2,18 +2,19 @@ __kernel void gather_nd_batch_I8toI8_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, 0, gidy, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; indice.x = indice.x * block_size + gidx; @@ -22,23 +23,24 @@ __kernel void gather_nd_batch_I8toI8_2D( vxc_char16 src; VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_U8toU8_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only 
image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, 0, gidy, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; indice.x = indice.x * block_size + gidx; @@ -46,23 +48,24 @@ __kernel void gather_nd_U8toU8_2D( vxc_uchar16 src; VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_I16toI16_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, 0, gidy, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; indice.x = indice.x * block_size + gidx; @@ -70,23 +73,24 @@ __kernel void gather_nd_I16toI16_2D( vxc_short8 src; VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_F16toF16_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, 0, gidy, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; indice.x = indice.x * block_size + gidx; @@ -94,5 +98,5 @@ __kernel void gather_nd_F16toF16_2D( vxc_short8 src; VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx index 
c1b970d43..5dfbc3ad7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx @@ -184,12 +184,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ src_type src0; \ dst_type dst; \ vxc_short8 src1; \ @@ -235,7 +236,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ @@ -285,12 +286,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ src_type src0; \ dst_type dst; \ float scale_vari, bias_val; \ @@ -331,7 +333,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx index 3562ae557..8b45e178f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx @@ -17,12 +17,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx * 
rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ src_type src0; \ vxc_short8 src1, outval; \ vxc_half8 scale_h, dst; \ @@ -75,7 +76,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ @@ -132,12 +133,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ src_type src0; \ vxc_short8 outval; \ vxc_half8 dst; \ @@ -186,7 +188,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx index b62b67faf..33edef844 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx @@ -138,12 +138,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx* rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ vxc_short8 src0; \ vxc_short8 src1; \ vxc_half8 scale_h; \ @@ -195,7 +196,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ @@ -250,12 +251,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) 
\ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ vxc_short8 src0; \ src_type in_h; \ float scale_vari, bias_val; \ @@ -302,7 +304,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx index 77fdcc99a..8086f28c9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx @@ -29,8 +29,8 @@ _viv_uniform VXC_512Bits uniConvertF16_0_4x4; _viv_uniform VXC_512Bits uniConvertF16_1_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; -#define GRUCELL_F16_F16TOF16(act_name, act_func) \ -__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \ +#define GRUCELL_F16_F16TOF16(act_name, act_func, rec_act_name, rec_act_func) \ +__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name##_##rec_act_name( \ __read_only image2d_t hstate_in, \ __read_only image2d_t input_z_conv, \ __read_only image2d_t input_r_conv, \ @@ -62,15 +62,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \ \ float4 r; \ VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ - r = act_func(r); \ + r = rec_act_func(r); \ float4 h0, h1; \ VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ float4 h = h0 + r * h1; \ float4 z; \ VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ - z = act_func(z); \ - h = tanh_func(h); \ + z = rec_act_func(z); \ + h = act_func(h); \ float4 h_tm; \ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ float4 result = (1 - z) * h + z * h_tm; \ @@ -83,14 +83,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(TANH, tanh_func, SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func) _viv_uniform float hstate_in_scale; _viv_uniform float hstate_in_tail; _viv_uniform float output_scale; _viv_uniform float output_zp; -#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \ -__kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name( \ +#define GRUCELL_QNT_F16TO_QNT(name, act_func, rec_act_func, src0_type, dst_type) \ +__kernel void grucell_reset_after_activation_##name( \ __read_only image2d_t hstate_in, \ 
__read_only image2d_t input_z_conv, \ __read_only image2d_t input_r_conv, \ @@ -122,15 +123,15 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name \ float4 r; \ VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ - r = act_func(r); \ + r = rec_act_func(r); \ float4 h0, h1; \ VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ float4 h = h0 + r * h1; \ float4 z; \ VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ - z = act_func(z); \ - h = tanh_func(h); \ + z = rec_act_func(z); \ + h = act_func(h); \ float4 h_tm; \ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ h_tm = h_tm * hstate_in_scale + hstate_in_tail; \ @@ -143,6 +144,9 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) -GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) -GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8) +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_short8, vxc_short8) +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross.vx new file mode 100644 index 000000000..b4dc43c24 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross.vx @@ -0,0 +1,208 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float output_ZP; +_viv_uniform float mulKIn0In1Zp; +_viv_uniform float inOutScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4; +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4; + +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b; + +#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \ + int axis_size, int inner_size, int outer_size, int axis_size0, \ + int inner_size0, int outer_size0, int axis_size1, int inner_size1, \ + int outer_size1, int axis_size2, int inner_size2, int outer_size2) \ +{ \ + read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \ + vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \ + int gidz = get_global_id(2); \ + for(int j = 0; j < outer_size; j++) \ + { \ + for(int i = 0; i < inner_size; i++) \ + { \ + vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \ + int4 coord_a = (int4)(0, 
get_global_id(1), gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; coord_b.y += 4; \ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + sum0 += tempA0 + tempB0; \ + sum1 += tempA1 + tempB1; \ + sum2 += tempA2 + tempB2; \ + sum3 += tempA3 + tempB3; \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = get_global_id(1); \ + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, 
coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +GEMM_QINT_TO_QINT_CROSS(U8, vxc_uchar16) +GEMM_QINT_TO_QINT_CROSS(I8, vxc_char16) + +__kernel void gemm_F16F16toF16_cross(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, + int axis_size, int inner_size, int outer_size, int axis_size0, + int inner_size0, int outer_size0, int axis_size1, int inner_size1, + int outer_size1, int axis_size2, int inner_size2, int outer_size2) +{ + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + for(int j = 0; j < outer_size; j++) + { + for(int i = 0; i < inner_size; i++) + { + int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); + + half4 valC; + vxc_short8 srcA0, srcA1, srcA2, srcA3, outC; + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3; + vxc_short16 srcB; + vxc_half16 tmpB; + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; coord_b.y += 4; + _viv_asm(COPY, tmpA0, srcA0, 16); + _viv_asm(COPY, tmpA1, srcA1, 16); + _viv_asm(COPY, tmpA2, srcA2, 16); + _viv_asm(COPY, tmpA3, srcA3, 16); + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + 
uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + sum0 += (tempA0); + sum1 += (tempA1); + sum2 += (tempA2); + sum3 += (tempA3); + } + coord_b.y = gidy; + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(CONV, valC, sum0); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum1); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum2); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum3); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross_i16.vx new file mode 100644 index 000000000..241118079 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross_i16.vx @@ -0,0 +1,214 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int input0_ZP; +_viv_uniform int input1_ZP; +_viv_uniform float output_ZP; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +_viv_uniform int outer; + +#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + short in0_zp, in1_zp; \ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \ + for(int i = 0; i < outer; i++) \ + { \ + read_type srcA, srcB, outC; \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0); \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + coord_b.z = get_global_id(2) + i * get_global_size(2); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ + 
VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +GEMM_QINT_TO_QINT_MERGE(I16, vxc_short8) + +#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \ + int axis_size, int inner_size, int outer_size, int axis_size0, \ + int inner_size0, int outer_size0, int axis_size1, int inner_size1, \ + int outer_size1, int axis_size2, int inner_size2, int outer_size2) \ +{ \ + uint gidy = get_global_id(1); \ + uint gidz = get_global_id(2); \ + short in0_zp, in1_zp; \ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \ + for(int j = 0; j < outer_size; j++) \ + { \ + for(int i = 0; i < inner_size; i++) \ + { \ + read_type srcA, srcB, outC; \ + int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 
3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +GEMM_QINT_TO_QINT_CROSS(I16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_merge.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_merge.vx new file mode 100644 index 000000000..9f33be797 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_merge.vx @@ -0,0 +1,294 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float output_ZP; +_viv_uniform float mulKIn0In1Zp; +_viv_uniform float inOutScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; +_viv_uniform int outer; + +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4; +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4; + +_viv_uniform VXC_512Bits 
uniGemmFp16toFp32Row0Lo_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Hi_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Lo_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Hi_4x4; + +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b; + +#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \ + vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \ + for(int i = 0; i < outer; i++) \ + { \ + vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; coord_b.y += 4; \ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + sum0 += tempA0 + tempB0; \ + sum1 += 
tempA1 + tempB1; \ + sum2 += tempA2 + tempB2; \ + sum3 += tempA3 + tempB3; \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = get_global_id(1); \ + coord_b.z = get_global_id(2) + i * get_global_size(2); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +GEMM_QINT_TO_QINT_MERGE(U8, vxc_uchar16) +GEMM_QINT_TO_QINT_MERGE(I8, vxc_char16) + +#if (VX_VERSION==2) +__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + for(int i = 0; i < outer; i++) + { + int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0); + + half4 valC; + vxc_short8 srcA0, srcA1, srcA2, srcA3, outC; + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3; + vxc_short16 srcB; + vxc_half16 tmpB; + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; coord_b.y += 4; + _viv_asm(COPY, tmpA0, srcA0, 16); + _viv_asm(COPY, tmpA1, srcA1, 16); + _viv_asm(COPY, tmpA2, srcA2, 16); + _viv_asm(COPY, tmpA3, srcA3, 16); + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + sum0 += (tempA0); + sum1 += (tempA1); + sum2 += (tempA2); + sum3 += (tempA3); + } + coord_b.y = gidy; + coord_b.z = get_global_id(2) + i * get_global_size(2); + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(CONV, valC, sum0); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum1); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum2); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum3); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, 
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } +} +#else +__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + for(int i = 0; i < outer; i++) + { + int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); + + half4 valC; + vxc_short8 srcA0, srcB0, srcA1, srcB1, outC; + vxc_half8 tmpA0, tmpB0, tmpA1, tmpB1; + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + vxc_float4 tempB0, tempB1, tempB2, tempB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; coord_b.y += 4; + _viv_asm(COPY, tmpA0, srcA0, 16); + _viv_asm(COPY, tmpB0, srcB0, 16); + _viv_asm(COPY, tmpA1, srcA1, 16); + _viv_asm(COPY, tmpB1, srcB1, 16); + + VXC_DP4x4(tempA0, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row0Lo_4x4); + VXC_DP4x4(tempB0, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row0Hi_4x4); + VXC_DP4x4(tempA1, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row1Lo_4x4); + VXC_DP4x4(tempB1, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row1Hi_4x4); + VXC_DP4x4(tempA2, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row0Lo_4x4); + VXC_DP4x4(tempB2, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row0Hi_4x4); + VXC_DP4x4(tempA3, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row1Lo_4x4); + VXC_DP4x4(tempB3, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row1Hi_4x4); + sum0 += (tempA0 + tempB0); + sum1 += (tempA1 + tempB1); + sum2 += (tempA2 + tempB2); + sum3 += (tempA3 + tempB3); + } + coord_b.y = gidy; + coord_b.z = get_global_id(2) + i * get_global_size(2); + _viv_asm(COPY, output_desc, output, 
sizeof(output_desc)); + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(CONV, valC, sum0); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum1); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum2); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum3); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_BF16_to_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_BF16_to_BF16.vx new file mode 100644 index 000000000..03b2c33d4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_BF16_to_BF16.vx @@ -0,0 +1,99 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + +_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8; +_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8; + +#define GRID_SAMPLE_BF16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 src; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = 
y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + + +__kernel void nearest_grid_sample_BF16_BF16toBF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + + float4 fxy0; + float4 fxy1; + + vxc_short8 read_src; + VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8); + _viv_asm(COPY, fxy0, read_src, 16); + VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8); + _viv_asm(COPY, fxy1, read_src, 16); + + + + GRID_SAMPLE_BF16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_F16.vx new file mode 100644 index 000000000..ec90f1daa --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_F16.vx @@ -0,0 +1,148 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4; +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4; +_viv_uniform VXC_512Bits uniExtactHalf8_2x8; + +#define GRID_SAMPLE_F16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 src; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + 
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void nearest_grid_sample_F16_F32toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + + GRID_SAMPLE_F16_PROCESS(); + +} + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + +__kernel void nearest_grid_sample_F16_U8toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + coord_in1.xz = coord_in1.xz * 2; + vxc_uchar16 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 fxy0; + float4 fxy1; + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_F16_PROCESS(); + +} + + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void nearest_grid_sample_F16_F16toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_F16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_U8.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_U8.vx new file mode 100644 index 000000000..6a43dddd0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_U8.vx @@ -0,0 +1,171 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +#define GRID_SAMPLE_F16_to_U8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 s0; \ + vxc_uchar16 result; \ + vxc_half8 src; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, s0, 16); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + float4 dst4; \ + int4 dst; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \ + dst4 = dst4 * uint8Scale + output_ZP; \ + dst = convert_int4_rte(dst4); \ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, s0, 16); \ + } \ + VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \ + dst4 = dst4 * 
uint8Scale + output_ZP; \ + dst = convert_int4_rte(dst4); \ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void nearest_grid_sample_F16_F32toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + + +__kernel void nearest_grid_sample_F16_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_uchar16 read_coord; + + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + + +__kernel void nearest_grid_sample_F16_F16toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I16_to_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I16_to_I16.vx new file mode 100644 index 000000000..b838b08d8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I16_to_I16.vx @@ -0,0 +1,98 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4; +_viv_uniform float input1_scale; +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8; + + +#define GRID_SAMPLE_I16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, 
fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 src, dst; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void nearest_grid_sample_I16_I16toI16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + vxc_short8 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1_scale; + fxy1 = fxy1 * input1_scale; + + GRID_SAMPLE_I16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I8_to_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I8_to_I8.vx new file mode 
100644 index 000000000..871383cbc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I8_to_I8.vx @@ -0,0 +1,97 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + + +_viv_uniform float input1_scale; +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4; +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8; + +#define GRID_SAMPLE_I8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_char16 src, dst; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void nearest_grid_sample_I8_I8toI8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + vxc_char16 read_coord; + 
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1_scale; + fxy1 = fxy1 * input1_scale; + + GRID_SAMPLE_I8_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_U8_to_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_U8_to_U8.vx new file mode 100644 index 000000000..696c96dc9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_U8_to_U8.vx @@ -0,0 +1,160 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + +_viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp + +#define GRID_SAMPLE_U8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_uchar16 src, dst; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + while (coord_in.z < loop) \ + { \ + VXC_DP2x8(dst, src, multiplier, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + 
VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP2x8(dst, src, multiplier, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void nearest_grid_sample_U8_F32toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + GRID_SAMPLE_U8_PROCESS(); + +} + + +__kernel void nearest_grid_sample_U8_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_uchar16 read_coord; + + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_U8_PROCESS(); + +} + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void nearest_grid_sample_U8_F16toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_U8_PROCESS(); + +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx index 19873f170..438d7be12 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx @@ -22,8 +22,8 @@ __kernel void pow_##name \ \ src0_type src0; \ copy0_type data0; \ - src0_type src1; \ - copy0_type data1; \ + src1_type src1; \ + copy1_type data1; \ VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, data0, src0, 16); \ VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ @@ -94,8 +94,8 @@ __kernel 
void pow_##name##_2D \ \ src0_type src0; \ copy0_type data0; \ - src0_type src1; \ - copy0_type data1; \ + src1_type src1; \ + copy1_type data1; \ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, data0, src0, 16); \ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx index 28f3f0c0e..91e4213dd 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx @@ -28,9 +28,21 @@ _viv_uniform int zp; _viv_uniform float outputScale; __kernel void pre_process_bgra_scale_U8toU8( - __read_only image2d_array_t input, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + float gMean, + float bMean, + float r_scale, + int reverse_channel, + int trans, + float g_scale, + float b_scale + ) { int4 gidx = get_global_id(0); int gidy = get_global_id(1); @@ -86,6 +98,7 @@ __kernel void pre_process_bgra_scale_U8toU8( int4 tmp1, tmp2, result1, result2; float4 tmpDst, tmp0; float4 mean = (float4)(bMean, gMean, rMean, 0); + float4 var = (float4)(b_scale, g_scale, r_scale, 0); //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x); int tmpV = 1 << 19; vxc_short8 tmpFx; @@ -148,9 +161,21 @@ __kernel void pre_process_bgra_scale_U8toU8( } __kernel void pre_process_bgra_copy_U8toU8( - __read_only image2d_array_t input, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + float gMean, + float bMean, + float r_scale, + int reverse_channel, + int trans, + float g_scale, + float b_scale +) { int2 pos = (int2)((get_global_id(0) + (*xOffset)) << 2, get_global_id(1) + (*yOffset)); @@ -165,10 +190,10 @@ __kernel void pre_process_bgra_copy_U8toU8( VXC_DP4x4(tmpG, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGfromBgra_4x4); VXC_DP4x4(tmpR, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRfromBgra_4x4); - tmpDst = (tmpB - bMean) * var; + tmpDst = (tmpB - bMean) * b_scale; result1 = convert_int4_rte(tmpDst * outputScale + zp); - tmpDst = (tmpG - gMean) * var; + tmpDst = (tmpG - gMean) * g_scale; result2 = convert_int4_rte(tmpDst * outputScale + zp); VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); @@ -178,7 +203,7 @@ __kernel void pre_process_bgra_copy_U8toU8( dstPos.z = 1; VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); - tmpDst = (tmpR - rMean) * var; + tmpDst = (tmpR - rMean) * r_scale; result1 = convert_int4_rte(tmpDst * outputScale + zp); VXC_DP2x8(dst, result1, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx index fcc8d9c06..a20a579f6 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx @@ -3,7 +3,10 @@ _viv_uniform int bOrder; _viv_uniform int rOrder; -_viv_uniform float outputScaleVar; +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; + _viv_uniform float bMeanScaleVarZp; _viv_uniform float gMeanScaleVarZp; _viv_uniform float rMeanScaleVarZp; @@ -28,10 +31,12 @@ __kernel void pre_process_nv12_copy_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ int trans, \ - int nv_type \ + int nv_type, \ + float g_scale, \ + float b_scale \ ) \ { \ int gidx = get_global_id(0); \ @@ -65,21 +70,21 @@ __kernel void pre_process_nv12_copy_##name \ dst_type dst0; \ save_type dst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstB); \ dstPos.z = bOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstG); \ dstPos.z = 1; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstR); \ dstPos.z = rOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx index f4ac83b40..2fe9ad62f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx @@ -3,7 +3,10 @@ _viv_uniform int bOrder; _viv_uniform int rOrder; -_viv_uniform float outputScaleVar; +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; + _viv_uniform float bMeanScaleVarZp; _viv_uniform float gMeanScaleVarZp; _viv_uniform float rMeanScaleVarZp; @@ -36,10 +39,12 @@ __kernel void pre_process_nv12_scale_##name##_gq \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ int trans, \ - int nv_type \ + int nv_type, \ + float g_scale, \ + float b_scale \ ) \ { \ uint4 gidx = get_global_id(0); \ @@ -93,21 +98,21 @@ __kernel void pre_process_nv12_scale_##name##_gq \ dst_type dst0; \ save_type dst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstB); \ dstPos.z = bOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), 
uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstG); \ dstPos.z = 1; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstR); \ dstPos.z = rOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ @@ -132,10 +137,12 @@ __kernel void pre_process_nv12_scale_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ int trans, \ - int nv_type \ + int nv_type, \ + float g_scale, \ + float b_scale \ ) \ { \ uint4 gidx = get_global_id(0); \ @@ -187,21 +194,21 @@ __kernel void pre_process_nv12_scale_##name \ dst_type dst0; \ save_type dst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstB); \ dstPos.z = bOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstG); \ dstPos.z = 1; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstR); \ dstPos.z = rOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx index 536c18df0..c42f2eb6b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx @@ -29,9 +29,11 @@ __write_only image2d_array_t output, \ float rMean, \ float gMean, \ float bMean, \ - float f32Var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 ratioXY = (int2)(*xRatio, *yRatio); \ @@ -80,7 +82,7 @@ __write_only image2d_array_t output, \ \ float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \ \ - bgrMean *= f32Var; \ + bgrMean *= (float4)(b_scale, g_scale, r_scale, 0); \ \ int4 test01, temp1; \ int4 test02, temp2; \ @@ -113,7 +115,7 @@ __write_only image2d_array_t output, \ \ /*convert U8 to dst*/ \ dst_type dst; \ - tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \ + tmp_dst = tmp_dst * r_scale - bgrMean.zzzz; \ tmp_dst = tmp_dst * outputScale + outputZP; \ conv_type dst0; \ _viv_asm(CONV_RTE, dst0, tmp_dst); \ @@ -140,7 +142,7 @@ __write_only 
image2d_array_t output, \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ \ - tmp_dst = tmp_dst * f32Var - bgrMean.y; \ + tmp_dst = tmp_dst * g_scale - bgrMean.y; \ tmp_dst = tmp_dst * outputScale + outputZP; \ _viv_asm(CONV_RTE, dst0, tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -165,7 +167,7 @@ __write_only image2d_array_t output, \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ \ - tmp_dst = tmp_dst * f32Var - bgrMean.x; \ + tmp_dst = tmp_dst * b_scale - bgrMean.x; \ tmp_dst = tmp_dst * outputScale + outputZP; \ _viv_asm(CONV_RTE, dst0, tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx index 5cb3ebbe7..a008b46e2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx @@ -10,8 +10,9 @@ _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform float output_scale; _viv_uniform float output_zp; +_viv_uniform int4 rgb_order; -#define RESIZE_BILINEAR_4X1(mean, output) \ +#define RESIZE_BILINEAR_4X1(scale, mean, output, _coord) \ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ @@ -49,21 +50,19 @@ _viv_uniform float output_zp; VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ \ - tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \ _viv_asm(CONV, dst0, tmp_dst); \ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst1, 8); \ - VXC_WriteImage(output, coord_out, dst, \ + VXC_WriteImage(output, _coord, dst, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); #define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ ( \ __read_only image2d_array_t input, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -71,7 +70,11 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 ratioXY = (int2)(*xRatio, *yRatio); \ @@ -133,7 +136,8 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ int4 test02, temp2; \ int4 tt; \ vxc_uchar4 val; \ - int2 coord_out = (int2)(xPos.x, yPos); \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.yzw += rgb_order.xyz; \ \ vxc_uchar8 line1, line2; \ \ @@ -158,16 +162,16 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ conv_type dst0; \ dst_type dst1; \ copy_type dst; \ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + 
output_zp; \ _viv_asm(CONV, dst0, tmp_dst); \ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst1, 8); \ - VXC_WriteImage(output0, coord_out, dst, \ + VXC_WriteImage(output, coord_out.xy, dst, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - RESIZE_BILINEAR_4X1(gMean, output1) \ - RESIZE_BILINEAR_4X1(bMean, output2) \ + RESIZE_BILINEAR_4X1(g_scale, gMean, output, coord_out.xz) \ + RESIZE_BILINEAR_4X1(b_scale, bMean, output, coord_out.xw) \ } PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8) PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8) @@ -176,9 +180,7 @@ PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8) __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ ( \ __read_only image2d_array_t input, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -186,7 +188,11 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 ratioXY = (int2)(*xRatio, *yRatio); \ @@ -241,7 +247,8 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ \ int4 test01, temp1; \ int4 test02, temp2; \ - int2 coord_out = (int2)(xPos.x, yPos); \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.yzw += rgb_order.xyz; \ \ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniVecShift10); \ @@ -265,12 +272,12 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ \ int4 dst0; \ write_type dst; \ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ coord_in.x = coord.x; \ coord_in.z = 1; \ @@ -310,12 +317,12 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ uniExtractBytes); \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ - tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xz, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ coord_in.x = coord.x; \ coord_in.z = 2; \ @@ -355,12 +362,12 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ uniExtractBytes); \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ - tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); 
\ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16) PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx index b0714e47c..724b28ad3 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx @@ -6,14 +6,13 @@ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8; _viv_uniform float output_scale; _viv_uniform float output_zp; +_viv_uniform int4 rgb_order; #define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ ( \ __read_only image2d_array_t input, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -21,7 +20,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ @@ -38,8 +41,9 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ coord.x = coord.z + 8; \ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ - rMean * output_scale - output_zp, output_scale); \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ \ half4 paramData_f16; \ copy_type tmp_dst; \ @@ -49,33 +53,38 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + int4 coord_out = coord; \ + coord_out.yw = coord_out.ww + rgb_order.xy; \ + VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ - gMean * output_scale - output_zp, output_scale); \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData1); \ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst1, src1, paramData_f16, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \ - bMean * output_scale - output_zp, output_scale); \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData2); \ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.w = coord.w + rgb_order.z; \ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8) PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8) @@ -84,9 +93,7 @@ PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8) __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ ( \ __read_only image2d_array_t input, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -94,7 +101,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ @@ -110,8 +121,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ coord_in.z ++; \ VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ - rMean * output_scale - output_zp, output_scale); \ + int4 coord_out = coord; \ + coord_out.xyw = coord.www + rgb_order.xyz; \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ \ half4 paramData_f16; \ _viv_asm(CONV, paramData_f16, paramData0); \ @@ -120,27 +134,29 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + 
VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ - gMean * output_scale - output_zp, output_scale); \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData1); \ \ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \ - bMean * output_scale - output_zp, output_scale); \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData2); \ \ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ } PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16) PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx index 1ac60fe72..ed58fa920 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx @@ -5,13 +5,12 @@ _viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8; _viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4; _viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4; _viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4; +_viv_uniform int4 rgb_order; __kernel void pre_process_rgb888_planar_4over3_U8toU8 ( __read_only image2d_array_t input, - __write_only image2d_array_t output0, - __write_only image2d_array_t output1, - __write_only image2d_array_t output2, + __write_only image2d_array_t output, global int *xRatio, global int *yRatio, global int *xOffset, @@ -19,7 +18,11 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 float rMean, float gMean, float bMean, - float f32Var + float r_scale, + int reverse, + int height, + float g_scale, + float b_scale ) { int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); @@ -49,9 +52,11 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, 
VXC_RM_TowardZero, 0)); + int4 coord_r = coord_out; + coord_r.yzw += rgb_order.xxx; + VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); @@ -72,9 +77,11 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_g = coord_out; + coord_g.yzw += rgb_order.yyy; + VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); @@ -94,17 +101,17 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_b = coord_out; + coord_b.yzw += rgb_order.zzz; + VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); } __kernel void pre_process_rgb888_planar_half_U8toU8 ( __read_only image2d_array_t input, - __write_only image2d_array_t output0, - __write_only image2d_array_t output1, - __write_only image2d_array_t output2, + __write_only image2d_array_t output, global int *xRatio, global int *yRatio, global int *xOffset, @@ -112,7 +119,11 @@ __kernel void pre_process_rgb888_planar_half_U8toU8 float rMean, float gMean, float bMean, - float f32Var + float r_scale, + int reverse, + int height, + float g_scale, + float b_scale ) { int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); @@ -130,7 +141,9 @@ __kernel void pre_process_rgb888_planar_half_U8toU8 int2 coord = coord_in.xy >> 1; - VXC_WriteImage(output0, coord, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_rgb = coord.xyyy; + coord_rgb.yzw += rgb_order.xyz; + VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, 
coord_rgb.xz, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_0.vx new file mode 100644 index 000000000..336c4e6e1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_0.vx @@ -0,0 +1,377 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define RESIZE_BILINEAR_4X1(scale, mean) \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.w; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + coord_in.x = coord.x; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); +#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, 
*yRatio); \ + \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + int4 coord_in = (int4)(0, 0, 0, 0); \ + sx = sx + *xOffset; \ + coord = sx.xyzw; \ + coord_in.y = sy + *yOffset; \ + coord_in.x = coord.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.w; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + coord_in.x = coord.x; \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int4 tt; \ + vxc_uchar4 val; \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.x = coord_out.x * 3; \ + coord_out.z = coord_out.x + 8; \ + \ + vxc_uchar8 line1, line2; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + conv_type dst0; \ + dst_type dst1, dst2; \ + copy_type data0, data1, dst; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + RESIZE_BILINEAR_4X1(g_scale, gMean) \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + RESIZE_BILINEAR_4X1(b_scale, bMean) \ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, data0, dst1, 16); \ + _viv_asm(COPY, data1, dst2, 16); \ + VXC_DP2x8(dst, data0, data1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_0_2x8); \ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8) +PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8) + +#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + sx = sx + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + int4 coord_in = (int4)(coord.xw, 0, 0); \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int2 coord_out = (int2)(xPos.x, yPos); \ + coord_out.x = coord_out.x * 3; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + int4 dst0; \ + write_type dst1, dst; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + coord_in.x = coord.x; \ + coord_in.z = 1; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + coord_in.x = coord.x; \ + coord_in.z = 2; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_0_2x8); \ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16) +PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_1.vx new file mode 100644 index 000000000..80c603bc2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_1.vx @@ -0,0 +1,153 @@ + +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8; +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8; + +#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2)(*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + dst_type dst0, dst1; \ + \ + int4 coord_in = (int4)(coord.xy, 0, 0); \ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord; \ + coord_out.z = coord_out.z * 3; \ + coord_out.x = coord_out.z + 8; \ + float4 paramData0 = (float4)(rMean * output_scale * r_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); 
\ + \ + half4 paramData_f16; \ + copy_type data0, data1, data2, dst; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + float4 paramData1 = (float4)(gMean * output_scale * g_scale - output_zp,\ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + _viv_asm(COPY, data0, dst0, 16); \ + \ + float4 paramData2 = (float4)(bMean * output_scale * b_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + _viv_asm(COPY, data1, dst1, 16); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_0_2x8); \ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8) +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8) + +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2) (*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + write_type dst0, dst1, dst2, dst3; \ + \ + int4 coord_in = (int4)(coord.xy, 0, 0); \ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord; \ + coord_out.z = coord_out.z * 3; \ + coord_out.x = coord_out.z + 16; \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ + \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + \ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + \ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + \ + float4 paramData2 = 
(float4)(bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + \ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_0_2x8); \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_1_2x8); \ + VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_2_2x8); \ + VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16) +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_2.vx new file mode 100644 index 000000000..8d686ebd6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_2.vx @@ -0,0 +1,57 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8; + +__kernel void pre_process_rgb888_planar_half_U8toU8_nhwc + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float r_scale, + int reverse, + float g_scale, + float b_scale + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_uchar16 src0, src1, src2; + + VXC_ReadImage2DArray(src0, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; + VXC_ReadImage2DArray(src1, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; + VXC_ReadImage2DArray(src2, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int4 coord; + coord.xy = coord_in.xy >> 1; + + coord.x = coord.x * 3; + coord.z = coord.x + 16; + + vxc_uchar16 dst0, dst1; + src0.lo = src0.s02468ace; + src0.hi = src1.s02468ace; + src1.lo = src2.s02468ace; + + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_0_2x8); + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_1_2x8); + VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_2_2x8); + + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx index 107846e09..de9dbdeaf 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx @@ -10,8 +10,9 @@ _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform float output_scale; _viv_uniform float output_zp; +_viv_uniform int4 rgb_order; -#define 
RESIZE_BILINEAR_4X1(input, mean, output) \ +#define RESIZE_BILINEAR_4X1(input, scale, mean, output, _coord) \ VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ @@ -41,12 +42,12 @@ _viv_uniform float output_zp; VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ \ - tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \ + tmp_dst = tmp_dst * scale * output_scale - scale * mean * output_scale + output_zp; \ _viv_asm(CONV, dst0, tmp_dst); \ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst1, 8); \ - VXC_WriteImage(output, coord_out, dst, \ + VXC_WriteImage(output, _coord, dst, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); #define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \ @@ -55,9 +56,7 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __read_only image2d_array_t input2, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -65,7 +64,11 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 ratioXY = (int2)(*xRatio, *yRatio); \ @@ -118,7 +121,8 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ int4 test02, temp2; \ int4 tt; \ vxc_uchar4 val; \ - int2 coord_out = (int2)(xPos.x, yPos); \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.yzw += rgb_order.xyz; \ \ vxc_uchar8 line1, line2; \ \ @@ -143,16 +147,16 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ conv_type dst0; \ dst_type dst1; \ copy_type dst; \ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ _viv_asm(CONV, dst0, tmp_dst); \ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst1, 8); \ - VXC_WriteImage(output0, coord_out, dst, \ + VXC_WriteImage(output, coord_out.xy, dst, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - RESIZE_BILINEAR_4X1(input1, gMean, output1) \ - RESIZE_BILINEAR_4X1(input2, bMean, output2) \ + RESIZE_BILINEAR_4X1(input1, g_scale, gMean, output, coord_out.xz) \ + RESIZE_BILINEAR_4X1(input2, b_scale, bMean, output, coord_out.xw) \ } RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8) RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8) @@ -163,9 +167,7 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __read_only image2d_array_t input2, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -173,7 +175,11 @@ __kernel void 
pre_process_rgb888_planar_sep_scale_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 ratioXY = (int2)(*xRatio, *yRatio); \ @@ -221,7 +227,8 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ \ int4 test01, temp1; \ int4 test02, temp2; \ - int2 coord_out = (int2)(xPos.x, yPos); \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.yzw += rgb_order.xyz; \ \ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniVecShift10); \ @@ -245,12 +252,13 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ \ int4 dst0; \ write_type dst; \ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xy, dst, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ @@ -282,12 +290,13 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ uniExtractBytes); \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ - tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xz, \ + dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ @@ -319,12 +328,13 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ uniExtractBytes); \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ - tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, \ + dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16) -RGB888_PLANAR_SEP_8BITS(I8, vxc_char16) \ No newline at end of file +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx index ff55851a6..b308e65cc 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx @@ -5,6 
+5,7 @@ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8; _viv_uniform float output_scale; _viv_uniform float output_zp; +_viv_uniform int4 rgb_order; #define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ @@ -12,9 +13,7 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __read_only image2d_array_t input2, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -22,7 +21,11 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ @@ -36,8 +39,9 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ coord.x = coord.z + 8; \ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ - rMean * output_scale - output_zp, output_scale); \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ \ half4 paramData_f16; \ copy_type tmp_dst; \ @@ -47,33 +51,38 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + int4 coord_out = coord; \ + coord_out.yw = coord_out.ww + rgb_order.xy; \ + VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ - gMean * output_scale - output_zp, output_scale); \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData1); \ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - 
output_zp, \ - bMean * output_scale - output_zp, output_scale); \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData2); \ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.w = coord.w + rgb_order.z; \ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8) RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8) @@ -84,9 +93,7 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __read_only image2d_array_t input2, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -94,7 +101,11 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ @@ -107,8 +118,11 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ - rMean * output_scale - output_zp, output_scale); \ + int4 coord_out = coord; \ + coord_out.xyw += rgb_order.xyz; \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ \ half4 paramData_f16; \ _viv_asm(CONV, paramData_f16, paramData0); \ @@ -117,27 +131,29 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ - gMean * output_scale - output_zp, output_scale); \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData1); \ \ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \ - bMean * output_scale - output_zp, output_scale); \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData2); \ \ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ } PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16) PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx index bbfed6e7e..51a97f047 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx @@ -5,15 +5,14 @@ _viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8; _viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4; _viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4; _viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4; +_viv_uniform int4 rgb_order; __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 ( __read_only image2d_array_t input0, __read_only image2d_array_t input1, __read_only image2d_array_t input2, - __write_only image2d_array_t output0, - __write_only image2d_array_t output1, - __write_only image2d_array_t output2, + __write_only image2d_array_t output, global int *xRatio, global int *yRatio, global int *xOffset, @@ -21,7 +20,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 float rMean, float gMean, float bMean, - float f32Var + float r_scale, + int reverse, + int height, + float g_scale, + float b_scale ) { int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); @@ -47,9 +50,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_r = coord_out; + coord_r.yzw += rgb_order.xxx; + VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 
15, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); @@ -65,9 +70,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_g = coord_out; + coord_g.yzw += rgb_order.yyy; + VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); @@ -83,9 +90,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_b = coord_out; + coord_b.yzw += rgb_order.zzz; + VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); } __kernel void pre_process_rgb888_planar_sep_half_U8toU8 @@ -93,9 +102,7 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8 __read_only image2d_array_t input0, __read_only image2d_array_t input1, __read_only image2d_array_t input2, - __write_only image2d_array_t output0, - __write_only image2d_array_t output1, - __write_only image2d_array_t output2, + __write_only image2d_array_t output, global int *xRatio, global int *yRatio, global int *xOffset, @@ -103,7 +110,11 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8 float rMean, float gMean, float bMean, - float f32Var + float r_scale, + int reverse, + int height, + float g_scale, + float b_scale ) { int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); @@ -116,7 +127,9 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8 coord_in.zw = coord_in.xy >> 1; - VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_rgb = coord_in.zwww; + coord_rgb.yzw += rgb_order.xyz; + VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_rgb.xz, 
src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_0.vx new file mode 100644 index 000000000..a9b792599 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_0.vx @@ -0,0 +1,342 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define RESIZE_BILINEAR_4X1(input, scale, mean) \ + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); + +#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 
1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + sx = sx + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int4 tt; \ + vxc_uchar4 val; \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.x = coord_out.x * 3; \ + coord_out.z = coord_out.x + 8; \ + \ + vxc_uchar8 line1, line2; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + conv_type dst0; \ + dst_type dst1, dst2; \ + copy_type data0, data1, dst; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + RESIZE_BILINEAR_4X1(input1, g_scale, gMean) \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + RESIZE_BILINEAR_4X1(input2, b_scale, bMean) \ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, data0, dst1, 16); \ + _viv_asm(COPY, data1, dst2, 16); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_0_2x8); \ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, 
coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8) +RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8) + +#define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + sx = sx + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int2 coord_out = (int2)(xPos.x, yPos); \ + coord_out.x = coord_out.x * 3; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + int4 dst0; \ + write_type dst1, dst; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + 
VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + 
VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_0_2x8); \ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \ +} +RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16) +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_1.vx new file mode 100644 index 000000000..1ae298c22 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_1.vx @@ -0,0 +1,148 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8; + +#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2)(*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + dst_type dst0, dst1; \ + \ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord; \ + coord_out.z = coord_out.z * 3; \ + coord_out.x = coord_out.z + 8; \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ + \ + half4 paramData_f16; \ + copy_type data0, data1, data2, dst; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + _viv_asm(COPY, data0, dst0, 16); \ + \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + _viv_asm(COPY, data1, dst0, 16); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_0_2x8); \ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8) +RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8) + +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2) (*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + write_type dst0, dst1, dst2, dst3; \ + \ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord; \ + coord_out.z = coord_out.z * 3; \ + coord_out.x = coord_out.z + 16; \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ + \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + \ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + \ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + \ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_0_2x8); \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_1_2x8); \ + VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_2_2x8); \ + VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16) +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16) diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_2.vx new file mode 100644 index 000000000..d43f82587 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_2.vx @@ -0,0 +1,54 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8; + +__kernel void pre_process_rgb888_planar_sep_half_U8toU8_nhwc + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float r_scale, + int reverse, + float g_scale, + float b_scale + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_uchar16 src0, src1, src2; + + VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int4 coord; + coord.xy = coord_in.xy >> 1; + + coord.x = coord.x * 3; + coord.z = coord.x + 16; + + vxc_uchar16 dst0, dst1; + src0.lo = src0.s02468ace; + src0.hi = src1.s02468ace; + src1.lo = src2.s02468ace; + + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_0_2x8); + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_1_2x8); + VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_2_2x8); + + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx index c200019c3..5a343e708 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx @@ -17,6 +17,8 @@ _viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4; _viv_uniform VXC_512Bits uniExtractBtoF32_part2_4x4; _viv_uniform VXC_512Bits uniExtractBtoF32_part3_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform float4 param_data; +_viv_uniform float4 rgb_scale; #define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \ __kernel void pre_process_rgb_copy_U8to##dst_name \ @@ -30,9 +32,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \ @@ -46,10 +50,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - \ - f32Var *= outputScale; \ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ - bMean * f32Var - outputZP, f32Var); \ \ int4 
coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \ float4 tmp0, tmp1; \ @@ -57,8 +57,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.x; \ - tmp1 = tmp1 * paramData.w - paramData.x; \ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \ _viv_asm(CONV_RTE, result0, tmp0); \ _viv_asm(CONV_RTE, result1, tmp1); \ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -68,8 +68,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ coord_out.z = 1; \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.y; \ - tmp1 = tmp1 * paramData.w - paramData.y; \ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \ _viv_asm(CONV_RTE, result0, tmp0); \ _viv_asm(CONV_RTE, result1, tmp1); \ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -79,8 +79,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ coord_out.z = b_order; \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.z; \ - tmp1 = tmp1 * paramData.w - paramData.z; \ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \ _viv_asm(CONV_RTE, result0, tmp0); \ _viv_asm(CONV_RTE, result1, tmp1); \ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -102,9 +102,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \ @@ -119,10 +121,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ coord.x += 16; \ VXC_ReadImage(src2, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - \ - f32Var *= outputScale; \ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ - bMean * f32Var - outputZP, f32Var); \ \ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \ float4 tmp0, tmp1; \ @@ -130,15 +128,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.x; \ - tmp1 = tmp1 * paramData.w - paramData.x; \ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniExtractRtoF32_part2_4x4); \ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part3_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.x; \ - tmp1 = tmp1 * paramData.w - paramData.x; \ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -147,15 +145,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ coord_out.z = 1; \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.y; \ - tmp1 = tmp1 * paramData.w - paramData.y; \ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part2_4x4); \ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part3_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.y; \ - tmp1 = tmp1 * paramData.w - paramData.y; \ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -164,15 +162,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ coord_out.z = b_order; \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.z; \ - tmp1 = tmp1 * paramData.w - paramData.z; \ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part2_4x4); \ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part3_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.z; \ - tmp1 = tmp1 * paramData.w - paramData.z; \ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx index 25f981a11..3a91a3559 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx @@ -49,9 +49,11 @@ __kernel void pre_process_yuv420_copy_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 pos = 
(int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \ @@ -110,17 +112,23 @@ __kernel void pre_process_yuv420_copy_##name \ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ \ - var *= output_scale; \ - float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \ - rMean * var - output_zp, var); \ + float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp,\ + gMean * g_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \ half4 paramData_f16; \ _viv_asm(CONV, paramData_f16, paramData); \ \ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \ + \ + paramData.w = g_scale * output_scale; \ + _viv_asm(CONV, paramData_f16, paramData); \ \ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \ + \ + paramData.w = r_scale * output_scale; \ + _viv_asm(CONV, paramData_f16, paramData); \ \ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \ @@ -150,9 +158,11 @@ __kernel void pre_process_yuv420_copy_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \ @@ -202,18 +212,22 @@ __kernel void pre_process_yuv420_copy_##name \ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ \ - var *= output_scale; \ - float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \ - rMean * var - output_zp, var); \ + float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \ half4 paramData_f16; \ _viv_asm(CONV, paramData_f16, paramData); \ \ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \ VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \ \ + paramData.w = g_scale * output_scale; \ + _viv_asm(CONV, paramData_f16, paramData); \ VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \ VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \ \ + paramData.w = r_scale * output_scale; \ + _viv_asm(CONV, paramData_f16, paramData); \ VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \ VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \ \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx index 40db13719..99a64459e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx @@ -48,9 +48,11 @@ __kernel void pre_process_yuv420_scale_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 gidx = get_global_id(0); \ @@ -199,7 +201,7 @@ __kernel void pre_process_yuv420_scale_##name \ float4 tmpDst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - bMean) * var; \ + tmpDst = (tmpDst - bMean) * b_scale; \ dstPos.z = bOrder; \ result = convert_int4_rte(tmpDst * output_scale + output_zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -213,7 +215,7 @@ __kernel void pre_process_yuv420_scale_##name \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - gMean) * var; \ + tmpDst = (tmpDst - gMean) * g_scale; \ dstPos.z = 1; \ result = convert_int4_rte(tmpDst * output_scale + output_zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -227,7 +229,7 @@ __kernel void pre_process_yuv420_scale_##name \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - rMean) * var; \ + tmpDst = (tmpDst - rMean) * r_scale; \ dstPos.z = rOrder; \ result = convert_int4_rte(tmpDst * output_scale + output_zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx index 7bfa6d112..676a8485c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx @@ -48,9 +48,11 @@ __kernel void pre_process_yuv420_scale_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 gidx = get_global_id(0); \ @@ -201,7 +203,7 @@ __kernel void pre_process_yuv420_scale_##name \ float4 tmpDst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - bMean) * var; \ + tmpDst = (tmpDst - bMean) * b_scale; \ dstPos.z = bOrder; \ tmpDst = tmpDst * output_scale + output_zp; \ _viv_asm(CONV_RTE, tmpVal, tmpDst); \ @@ -217,7 +219,7 @@ __kernel void pre_process_yuv420_scale_##name \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - gMean) * var; \ + tmpDst = (tmpDst - gMean) * g_scale; \ dstPos.z = 1; \ tmpDst = tmpDst * output_scale + output_zp; \ _viv_asm(CONV_RTE, tmpVal, tmpDst); \ @@ -233,7 +235,7 @@ __kernel void 
pre_process_yuv420_scale_##name \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - rMean) * var; \ + tmpDst = (tmpDst - rMean) * r_scale; \ dstPos.z = rOrder; \ tmpDst = tmpDst * output_scale + output_zp; \ _viv_asm(CONV_RTE, tmpVal, tmpDst); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx index eed071587..0006e4a71 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx @@ -3,7 +3,9 @@ _viv_uniform int bOrder; _viv_uniform int rOrder; -_viv_uniform float outputScaleVar; +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; _viv_uniform float bMeanScaleVarZp; _viv_uniform float gMeanScaleVarZp; _viv_uniform float rMeanScaleVarZp; @@ -27,10 +29,12 @@ __kernel void pre_process_yuv422_copy_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ int trans, \ - int yuv422_type \ + int yuv422_type, \ + float g_scale, \ + float b_scale \ ) \ { \ int gidx = get_global_id(0); \ @@ -60,21 +64,21 @@ __kernel void pre_process_yuv422_copy_##name \ dst_type dst0; \ save_type dst; \ int4 dstPos = (int4)(gidx, gidy, 0, 0); \ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstB); \ dstPos.z = bOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstG); \ dstPos.z = 1; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstR); \ dstPos.z = rOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx index 78546d991..9fb80e504 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx @@ -3,7 +3,10 @@ _viv_uniform int bOrder; _viv_uniform int rOrder; -_viv_uniform float outputScaleVar; +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; + _viv_uniform float bMeanScaleVarZp; _viv_uniform float gMeanScaleVarZp; _viv_uniform float rMeanScaleVarZp; @@ -33,10 +36,12 @@ __kernel void pre_process_yuv422_scale_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ int trans, \ - int yuv422_type \ + int yuv422_type, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 gidx = 
get_global_id(0); \ @@ -108,21 +113,21 @@ __kernel void pre_process_yuv422_scale_##name \ dst_type dst0; \ save_type dst; \ int4 dstPos = (int4)(gidx.x, gidy, 0, 0); \ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstB); \ dstPos.z = bOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstG); \ dstPos.z = 1; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstR); \ dstPos.z = rOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx index 05f9973c3..3a6a3c50f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx @@ -46,9 +46,11 @@ __kernel void pre_process_yuv444_copy_U8toU8( float rMean, float gMean, float bMean, - float var, + float r_scale, int reverse_channel, - int trans + int trans, + float g_scale, + float b_scale ) { int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset)); @@ -107,18 +109,23 @@ __kernel void pre_process_yuv444_copy_U8toU8( VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - var *= outputScale; - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ - rMean * var - zp, var); + float4 paramData = (float4)(bMean * b_scale * outputScale - zp, gMean * g_scale * outputScale - zp,\ + rMean * r_scale * outputScale - zp, b_scale * outputScale); half4 paramData_f16; _viv_asm(CONV, paramData_f16, paramData); VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + paramData.w = g_scale * outputScale; + _viv_asm(CONV, paramData_f16, paramData); + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + paramData.w = r_scale * outputScale; + _viv_asm(CONV, paramData_f16, paramData); + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); @@ -142,9 +149,11 @@ __kernel void pre_process_yuv444_copy_U8toF16( float rMean, float gMean, float bMean, - float var, + float r_scale, int reverse_channel, - int trans + int trans, + float g_scale, + float b_scale ) { int2 
pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset)); @@ -204,17 +213,23 @@ __kernel void pre_process_yuv444_copy_U8toF16( VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - float4 paramData = (float4)(bMean * var, gMean * var,\ - rMean * var, var); + float4 paramData = (float4)(bMean * b_scale * outputScale, gMean * g_scale * outputScale,\ + rMean * r_scale * outputScale, b_scale * outputScale); half4 paramData_f16; _viv_asm(CONV, paramData_f16, paramData); VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + paramData.w = g_scale * outputScale; + _viv_asm(CONV, paramData_f16, paramData); + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + paramData.w = r_scale * outputScale; + _viv_asm(CONV, paramData_f16, paramData); + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx index a195750c4..9b4a418e2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx @@ -39,7 +39,8 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \ __read_only image2d_t y_img, __read_only image2d_t u_img, \ __read_only image2d_t v_img, __write_only image2d_array_t output, \ global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \ + float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans, \ + float g_scale, float b_scale) \ { \ int4 gidx = get_global_id(0); \ int gidy = get_global_id(1); \ @@ -151,7 +152,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \ float4 tmpDst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - bMean) * var; \ + tmpDst = (tmpDst - bMean) * b_scale; \ dstPos.z = bOrder; \ result = convert_int4_rte(tmpDst * outputScale + zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ @@ -165,7 +166,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - gMean) * var; \ + tmpDst = (tmpDst - gMean) * g_scale; \ dstPos.z = 1; \ result = convert_int4_rte(tmpDst * outputScale + zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ @@ -179,7 +180,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, 
VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - rMean) * var; \ + tmpDst = (tmpDst - rMean) * r_scale; \ dstPos.z = rOrder; \ result = convert_int4_rte(tmpDst * outputScale + zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx index c5e706d9a..99325d87d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx @@ -37,7 +37,8 @@ __kernel void pre_process_yuv444_scale_U8toF16( __read_only image2d_t y_img, __read_only image2d_t u_img, __read_only image2d_t v_img, __write_only image2d_array_t output, global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) + float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans, + float g_scale, float b_scale) { int4 gidx = get_global_id(0); int gidy = get_global_id(1); @@ -157,7 +158,7 @@ __kernel void pre_process_yuv444_scale_U8toF16( float4 tmpDst; int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; + tmpDst = (tmpDst - bMean) * b_scale; dstPos.z = bOrder; _viv_asm(CONV, hDst, tmpDst); VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); @@ -172,7 +173,7 @@ __kernel void pre_process_yuv444_scale_U8toF16( temp2 = fx * tmpData0 + tmpData1; result = fy * temp2 + (temp1 << 10); VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; + tmpDst = (tmpDst - gMean) * g_scale; dstPos.z = 1; _viv_asm(CONV, hDst, tmpDst); VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); @@ -187,7 +188,7 @@ __kernel void pre_process_yuv444_scale_U8toF16( temp2 = fx * tmpData0 + tmpData1; result = fy * temp2 + (temp1 << 10); VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - rMean) * var; + tmpDst = (tmpDst - rMean) * r_scale; dstPos.z = rOrder; _viv_asm(CONV, hDst, tmpDst); VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx index 80840646b..750eadaf1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx @@ -2,7 +2,6 @@ _viv_uniform VXC_512Bits uniExtact8Bit_2x8; _viv_uniform VXC_512Bits uniFp16toFp32_4x4; -_viv_uniform VXC_512Bits uniRightSubLeft_4x4; _viv_uniform VXC_512Bits uniExtactHalf8_2x8; _viv_uniform float scale_x; _viv_uniform int out_height; @@ -63,8 +62,10 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN _viv_asm(COPY, src_half, src, 16); - VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4); - VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4); + VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), + uniConvertFp2FP32_left_4x4); + VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertFp2FP32_right_4x4); right4 -= left4; float4 dst4 = right4 * x_lerp + left4; @@ -129,8 +130,10 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN _viv_asm(COPY, src_half, src, 16); - VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4); - VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4); + VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertFp2FP32_left_4x4); + VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertFp2FP32_right_4x4); right4 -= left4; float4 dst4 = right4 * x_lerp + left4; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_fp.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_fp.vx new file mode 100644 index 000000000..a60e9b8e9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_fp.vx @@ -0,0 +1,307 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int update_width; +_viv_uniform int output_width; +_viv_uniform int ref_stride; +_viv_uniform int output_stride; + +_viv_uniform int4 coord_stride; +_viv_uniform int4 coord_stride1; +_viv_uniform float inout_scale; +_viv_uniform float output_zp; + +_viv_uniform VXC_512Bits uniConvertFp16ToFp32_4x4; + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +inline void AtomicAdd_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal + operand; + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +__kernel void scatter_nd_update_update_F16( + __read_only image2d_t index, + __read_only image2d_t update, + image2d_t temp_buf_float, + image2d_t link_buffer0, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(update, 2); + Image img3 = create_image_from_image2d(temp_buf_float, 4); + __global int* index_ptr = (__global int*)img1.ptr; + __global short* update_ptr = (__global short*)img2.ptr; + __global float* output_ptr = (__global float*)img3.ptr; + half src; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + short tmpData = update_ptr[gidy * update_width + gidx]; + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + int loc = idx * output_width + gidx; + _viv_asm(COPY, src, tmpData, 4); + float data; + _viv_asm(CONV, data, src); + AtomicAdd_float(output_ptr + loc, data); +} + +__kernel void scatter_nd_update_update_F16_4X( + __read_only image2d_t index, + __read_only image2d_t update, + image2d_t temp_buf_float, + image2d_t link_buffer0, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(update, 2); + Image img3 = create_image_from_image2d(temp_buf_float, 4); + __global int* index_ptr = (__global int*)img1.ptr; + __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr; + __global float* output_ptr = (__global float*)img3.ptr; + vxc_half4 src; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + vxc_short4 tmpData = update_ptr[gidy * update_width + gidx]; + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3); + + _viv_asm(COPY, src, tmpData, 8); + float4 data; + VXC_DP4x4(data, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), + uniConvertFp16ToFp32_4x4); + AtomicAdd_float(output_ptr + loc.x, data.x); + AtomicAdd_float(output_ptr + loc.y, data.y); + AtomicAdd_float(output_ptr + loc.z, data.z); + AtomicAdd_float(output_ptr + loc.w, data.w); +} + +__kernel void scatter_nd_update_update_BF16( + __read_only image2d_t index, + __read_only image2d_t update, + image2d_t temp_buf_float, + image2d_t link_buffer0, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(update, 2); + Image img3 = create_image_from_image2d(temp_buf_float, 4); + __global int* index_ptr = (__global int*)img1.ptr; + __global short* update_ptr = (__global short*)img2.ptr; + __global float* output_ptr = (__global float*)img3.ptr; + float data; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + short tmpData = update_ptr[gidy * update_width + gidx]; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 src0, src1; + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + int loc = idx * output_width + gidx; + _viv_asm(COPY, src0, tmpData, 4); + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data, src1, 4); + AtomicAdd_float(output_ptr + loc, data); +} + +__kernel void scatter_nd_update_update_BF16_4X( + __read_only image2d_t index, + __read_only image2d_t update, + image2d_t temp_buf_float, + image2d_t link_buffer0, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(update, 2); + Image img3 = create_image_from_image2d(temp_buf_float, 4); + __global int* index_ptr = (__global int*)img1.ptr; + __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr; + __global float* output_ptr = (__global float*)img3.ptr; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + vxc_short4 tmpData = update_ptr[gidy * update_width + gidx]; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 src0, src1; + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3); + + _viv_asm(COPY, src0, tmpData, 8); + float4 data; + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data, src1, 16); + AtomicAdd_float(output_ptr + loc.x, data.x); + AtomicAdd_float(output_ptr + loc.y, data.y); + AtomicAdd_float(output_ptr + loc.z, data.z); + AtomicAdd_float(output_ptr + loc.w, data.w); +} + +#define SCATTER_ND_UPDATE_REF_FP16(type0, type1, ptr_type) \ +__kernel void scatter_nd_update_ref_##type0##to##type1( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + __read_only image2d_t temp_buf_int, \ + image2d_t temp_ref, \ + image2d_t link_buffer0, \ + image2d_t link_buffer1, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \ + Image img3 = create_image_from_image2d(temp_ref, 2); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \ + __global short* ref_ptr = (__global short*)img3.ptr; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_stride + gidx; \ + float4 tmpData; \ + tmpData.x = convert_float(acc_ptr[loc]) * inout_scale + output_zp; \ + half4 data; \ + short tmpDst; \ + _viv_asm(CONV, data, tmpData); \ + _viv_asm(COPY, tmpDst, data, 4); \ + ref_ptr[loc] = tmpDst; \ +} +SCATTER_ND_UPDATE_REF_FP16(I32, F16, int) +SCATTER_ND_UPDATE_REF_FP16(F32, F16, float) + +#define SCATTER_ND_UPDATE_REF_FP16_4X(type0, type1, ptr_type) \ +__kernel void scatter_nd_update_ref_##type0##to##type1##_4X( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + __read_only image2d_t temp_buf_int, \ + image2d_t temp_ref, \ + image2d_t link_buffer0, \ + image2d_t link_buffer1, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \ + Image img3 = create_image_from_image2d(temp_ref, 2); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \ + __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \ + int loc = idx * output_stride + gidx; \ + float4 tmpVal = tmpData * inout_scale + output_zp; \ + half4 data; \ + vxc_short8 tmpDst; \ + _viv_asm(CONV, data, tmpVal); \ + _viv_asm(COPY, tmpDst, data, 16); \ + ref_ptr[loc] = tmpDst.s0246; \ +} +SCATTER_ND_UPDATE_REF_FP16_4X(I32, F16, int) +SCATTER_ND_UPDATE_REF_FP16_4X(F32, F16, float) + +__kernel void scatter_nd_update_ref_F32toBF16( + __read_only image2d_t index, + __read_only image2d_t update, + __read_only image2d_t temp_buf_int, + image2d_t temp_ref, + image2d_t link_buffer0, + image2d_t link_buffer1, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(temp_buf_int, 4); + Image img3 = create_image_from_image2d(temp_ref, 2); + __global int* index_ptr = (__global int*)img1.ptr; + __global float* acc_ptr = (__global float*)img2.ptr; + __global short* ref_ptr = (__global short*)img3.ptr; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + int loc = idx * output_stride + gidx; + float tmpData; + tmpData = acc_ptr[loc]; + vxc_ushort8 src0, src2; + _viv_asm(COPY, src0, tmpData, 4); + VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + ref_ptr[loc] = src2.x; +} + +__kernel void scatter_nd_update_ref_F32toBF16_4X( + __read_only image2d_t index, + __read_only image2d_t update, + __read_only image2d_t temp_buf_int, + image2d_t temp_ref, + image2d_t link_buffer0, + image2d_t link_buffer1, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(temp_buf_int, 4); + Image img3 = create_image_from_image2d(temp_ref, 2); + __global int* index_ptr = (__global int*)img1.ptr; + __global float* acc_ptr = (__global float*)img2.ptr; + __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + float4 tmpData = vload4(gidx, acc_ptr + idx * ref_stride); + int loc = idx * output_stride + gidx; + vxc_short8 src0, src2; + _viv_asm(COPY, src0, tmpData, 16); + VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + ref_ptr[loc] = src2.s0123; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx new file mode 100644 index 000000000..2284f49ce --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx @@ -0,0 +1,263 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; +_viv_uniform int update_width; +_viv_uniform int output_width; +_viv_uniform int ref_stride; +_viv_uniform int output_stride; +_viv_uniform int2 multAndoutZP0; + +_viv_uniform int4 coord_stride; +_viv_uniform int4 coord_stride1; + +_viv_uniform float output_zp; +_viv_uniform int input_zp; +_viv_uniform float input_scale; +_viv_uniform float inout_scale; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +#define SCATTER_RESET(name0, name1, ptr0, ptr1, type0, type1, len0, len1, size0, size1, ptr2, ptr3, len3) \ +__kernel void scatter_nd_update_reset_##name0##to##name1( \ + __read_only image2d_t input_ref, \ + image2d_t temp_ref, \ + image2d_t temp_buf_int, \ + int length, int res) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(input_ref, size0); \ + Image img2 = create_image_from_image2d(temp_ref, size1); \ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \ + __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \ + __global ptr1* output_ptr = (__global ptr1*)img2.ptr; \ + __global int* tmp_update_ptr = (__global int*)img3.ptr; \ + ptr0 tmpData = input_ptr[gidx]; \ + int4 zeros = (int4)(0); \ + int loc2 = gidx * 8; \ + type0 src; \ + type1 tmpDst; \ + ptr1 dst; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + _viv_asm(COPY, src, tmpData, len0); \ + 
VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst, tmpDst, len1); \ + output_ptr[gidx] = dst; \ + vstore4(zeros, 0, tmp_update_ptr + loc2); \ + vstore4(zeros, 1, tmp_update_ptr + loc2); \ + if(gidx < res) \ + { \ + __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \ + __global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \ + ptr2 tmpData1 = input_ptr1[length + gidx]; \ + ptr3 dst1; \ + dst1 ^= dst1; \ + tmp_update_ptr[length + gidx] = 0; \ + _viv_asm(COPY, src, tmpData1, 4); \ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst1, tmpDst, len3); \ + output_ptr1[length + gidx] = dst1; \ + } \ +} +SCATTER_RESET(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, 8, 8, 1, 1, uchar, uchar, 1) +SCATTER_RESET(I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, 8, 8, 1, 1, char, char, 1) +SCATTER_RESET(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, 16, 16, 2, 2, short, short, 2) +SCATTER_RESET(F16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_half8, 16, 16, 2, 2, short, short, 2) +SCATTER_RESET(U8, F16, vxc_uchar8, vxc_short8, vxc_uchar8, vxc_half8, 8, 16, 1, 2, uchar, short, 2) +SCATTER_RESET(I8, F16, vxc_char8, vxc_short8, vxc_char8, vxc_half8, 8, 16, 1, 2, char, short, 2) +SCATTER_RESET(I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, 16, 8, 2, 1, short, short, 2) +SCATTER_RESET(F16, U8, vxc_short8, vxc_uchar8, vxc_half8, vxc_uchar8, 16, 8, 2, 1, short, uchar, 1) + +__kernel void scatter_nd_update_reset_BF16toBF16( + __read_only image2d_t input_ref, + image2d_t temp_ref, + image2d_t temp_buf_int) +{ + int gidx = get_global_id(0); + Image img1 = create_image_from_image2d(input_ref, 2); + Image img2 = create_image_from_image2d(temp_ref, 2); + Image img3 = create_image_from_image2d(temp_buf_int, 4); + __global vxc_short8* input_ptr = (__global vxc_short8*)img1.ptr; + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr; + __global float* tmp_update_ptr = (__global float*)img3.ptr; + vxc_short8 src = input_ptr[gidx]; + float4 zeros = (float4)(0, 0, 0, 0); + int loc2 = gidx * 8; + output_ptr[gidx] = src; + vstore4(zeros, 0, tmp_update_ptr + loc2); + vstore4(zeros, 1, tmp_update_ptr + loc2); +} + +#define SCATTER_ND_UPDATE_QINT(src0_type, data_type, ptr_type, element_size) \ +__kernel void scatter_nd_update_update_##src0_type( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_int, \ + image2d_t link_buffer0, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, element_size); \ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \ + __global int* output_ptr = (__global int*)img3.ptr; \ + data_type src; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_width + gidx; \ + _viv_asm(COPY, src, tmpData, 4); \ + vxc_int4 data; \ + short zp = input_zp; \ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + atomic_add(output_ptr + loc, data.x); \ +} +SCATTER_ND_UPDATE_QINT(U8, vxc_uchar8, uchar, 1) +SCATTER_ND_UPDATE_QINT(I8, vxc_char8, char, 1) +SCATTER_ND_UPDATE_QINT(I16, vxc_short8, short, 2) + +#define SCATTER_ND_UPDATE_QINT_4X(src0_type, data_type, ptr_type, element_size) \ +__kernel void scatter_nd_update_update_##src0_type##_4X( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_int, \ + image2d_t link_buffer0, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, element_size); \ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \ + __global int* output_ptr = (__global int*)img3.ptr; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + ptr_type src = update_ptr[gidy * update_width + gidx]; \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3); \ + vxc_int4 data; \ + short zp = input_zp; \ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + atomic_add(output_ptr + loc.x, data.x); \ + atomic_add(output_ptr + loc.y, data.y); \ + atomic_add(output_ptr + loc.z, data.z); \ + atomic_add(output_ptr + loc.w, data.w); \ +} +SCATTER_ND_UPDATE_QINT_4X(U8, vxc_uchar8, vxc_uchar4, 1) +SCATTER_ND_UPDATE_QINT_4X(I8, vxc_char8, vxc_char4, 1) +SCATTER_ND_UPDATE_QINT_4X(I16, vxc_short8, vxc_short4, 2) + +#define SCATTER_ND_UPDATE_REF(src0_type, dst_type, data_type, ptr_type, element_size) \ +__kernel void scatter_nd_update_ref_##src0_type##to##dst_type( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + __read_only image2d_t temp_buf_int, \ + image2d_t temp_ref, \ + image2d_t link_buffer0, \ + image2d_t link_buffer1, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \ + Image img3 = create_image_from_image2d(temp_ref, element_size); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global int* acc_ptr = (__global int*)img2.ptr; \ + __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \ + data_type dst; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_stride + gidx; \ + int tmpData = acc_ptr[loc]; \ + int4 data; \ + data.x = convert_int_rte(tmpData * inout_scale + output_zp); \ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + ref_ptr[loc] = dst.x; \ +} +SCATTER_ND_UPDATE_REF(I32, U8, vxc_uchar8, uchar, 1) +SCATTER_ND_UPDATE_REF(I32, I8, vxc_char8, char, 1) +SCATTER_ND_UPDATE_REF(I32, I16, vxc_short8, short, 2) + +#define SCATTER_ND_UPDATE_REF_4X(src0_type, dst_type, data_type, ptr_type, element_size) \ +__kernel void scatter_nd_update_ref_##src0_type##to##dst_type##_4X( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + __read_only image2d_t temp_buf_int, \ + image2d_t temp_ref, \ + image2d_t link_buffer0, \ + image2d_t link_buffer1, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \ + Image img3 = create_image_from_image2d(temp_ref, element_size); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global int* acc_ptr = (__global int*)img2.ptr; \ + __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \ + data_type dst; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \ + int loc = idx * output_stride + gidx; \ + int4 data = convert_int4_rte(tmpData * inout_scale + output_zp); \ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + ref_ptr[loc] = dst.xyzw; \ +} +SCATTER_ND_UPDATE_REF_4X(I32, U8, vxc_uchar8, vxc_uchar4, 1) +SCATTER_ND_UPDATE_REF_4X(I32, I8, vxc_char8, vxc_char4, 1) +SCATTER_ND_UPDATE_REF_4X(I32, I16, vxc_short8, vxc_short4, 2) + +#define SCATTER_ND_UPDATE_COPY(src0_type, ptr_type, element_size, ptr_type1) \ +__kernel void scatter_nd_update_copy_##src0_type( \ + __read_only image2d_t temp_ref, \ + __read_only image2d_t link_buffer1, \ + image2d_t output, \ + int length, int res) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(temp_ref, element_size); \ + Image img2 = create_image_from_image2d(output, element_size); \ + __global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \ + __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \ + output_ptr[gidx] = input_ptr[gidx]; \ + if(gidx < res) \ + { \ + __global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \ + __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \ + output_ptr1[length + gidx] = input_ptr1[length + gidx]; \ + } \ +} +SCATTER_ND_UPDATE_COPY(U8, vxc_uchar8, 1, uchar) +SCATTER_ND_UPDATE_COPY(I8, vxc_char8, 1, char) +SCATTER_ND_UPDATE_COPY(I16, vxc_short8, 2, short) +SCATTER_ND_UPDATE_COPY(F16, vxc_short8, 2, short) +SCATTER_ND_UPDATE_COPY(BF16, vxc_short8, 2, short) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx b/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx 
index 319348593..3c770f373 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx @@ -21,7 +21,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name##_2D( \ short zp = inputZP; \ VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvert1stUint8SubZpToFp32_4x4); \ - int index = convert_int_rte(tmpData.s0 * input_scale); \ + int index = convert_int_rtz(tmpData.s0 * input_scale); \ int4 data; \ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \ write_type dst; \ @@ -47,7 +47,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name( \ short zp = inputZP; \ VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvert1stUint8SubZpToFp32_4x4); \ - int index = convert_int_rte(tmpData.s0 * input_scale); \ + int index = convert_int_rtz(tmpData.s0 * input_scale); \ int4 data; \ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \ write_type dst; \ @@ -73,7 +73,7 @@ __kernel void sequence_mask_F16toF16_2D( float4 tmpData; VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); - int index = convert_int_rte(tmpData.x); + int index = convert_int_rtz(tmpData.x); float4 data; data = outIdx < index? outputVal1 : convert_float(output_ZP); vxc_short8 dst; @@ -96,7 +96,7 @@ __kernel void sequence_mask_F16toF16( float4 tmpData; VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); - int index = convert_int_rte(tmpData.x); + int index = convert_int_rtz(tmpData.x); float4 data; data = outIdx < index? outputVal1 : convert_float(output_ZP); vxc_short8 dst; @@ -119,7 +119,7 @@ __kernel void sequence_mask_F16toU8_2D( float4 tmpData; VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); - int index = convert_int_rte(tmpData.x); + int index = convert_int_rtz(tmpData.x); int4 data; data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; vxc_uchar16 dst; @@ -140,7 +140,7 @@ __kernel void sequence_mask_F16toU8( float4 tmpData; VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); - int index = convert_int_rte(tmpData.x); + int index = convert_int_rtz(tmpData.x); int4 data; data = outIdx < index? 
convert_int_rte(outputVal1) : output_ZP; vxc_uchar16 dst; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_box.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_box.vx new file mode 100644 index 000000000..6e513f126 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_box.vx @@ -0,0 +1,103 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) + +float4 sigmoid4(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} + +float4 exp4(float4 x) +{ + x *= logE; + return exp2(x); +} + +#define CONST0 (1.0499999523162842f) +#define CONST1 (0.0250000003725290f) + +_viv_uniform VXC_512Bits uniDatatoFloat32_0_4x4; +_viv_uniform VXC_512Bits uniDatatoFloat32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDataTranspose_0_2x8; +_viv_uniform VXC_512Bits uniDataTranspose_1_2x8; +_viv_uniform float input0_scale; +_viv_uniform float input0_tail; +_viv_uniform float input1_scale; +_viv_uniform float input1_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; +_viv_uniform float CONST2; +__kernel void tiny_yolov4_postprocess_box_U8_U8toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float bias_0, + float bias_1 + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0)); + + vxc_uchar16 src0, src1, src2, src3; + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(src2, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input1, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord.zw += (int2)(2, 3); + + float4 data0, data1, data2, data3, data; + VXC_DP4x4(data0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4); + data0 = data0 * input0_scale + input0_tail; + data0 = sigmoid4(data0); + data0 = data0 * CONST0 - CONST1; + + VXC_DP4x4(data, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4); + data = data * input1_scale + input1_tail; + data0 = data0 * CONST2 + data * CONST2; + + VXC_DP4x4(data1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4); + data1 = data1 * input0_scale + input0_tail; + data1 = sigmoid4(data1); + data1 = data1 * CONST0 - CONST1; + + VXC_DP4x4(data, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4); + data = data * input1_scale + input1_tail; + data1 = data1 * CONST2 + data * CONST2; + + VXC_DP4x4(data2, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4); + data2 = data2 * input0_scale + input0_tail; + data2 = exp4(data2) * bias_0; + + VXC_DP4x4(data3, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4); + data3 = data3 * input0_scale + input0_tail; + data3 = exp4(data3) * bias_1; + + data0 = data0 * output_scale + output_zp; + data1 = data1 * output_scale + output_zp; + + int4 dst0 = 
convert_int4_rte(data0); + int4 dst1 = convert_int4_rte(data1); + VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + data2 = data2 * output_scale + output_zp; + data3 = data3 * output_scale + output_zp; + dst0 = convert_int4_rte(data2); + dst1 = convert_int4_rte(data3); + VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + + VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniDataTranspose_0_2x8); + VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniDataTranspose_1_2x8); + + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_confidence.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_confidence.vx new file mode 100644 index 000000000..0a41c0e2c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_confidence.vx @@ -0,0 +1,54 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8TimesU8_0_4x4; +_viv_uniform VXC_512Bits uniU8PlusU8_trans_0_2x8; +_viv_uniform VXC_512Bits uniU8PlusU8_trans_1_2x8; +_viv_uniform VXC_512Bits uniU16TimesMultiplier_PostShift_2x8; +_viv_uniform int output_zp; + +__kernel void tiny_yolov4_postprocess_conf_U8toU8 +( + __read_only image2d_t input, + __write_only image2d_t output +) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, get_global_id(0)); + + vxc_uchar16 src0, src1, src2, src3, src4; + + VXC_ReadImage(src0, input, coord.wz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 data0, data1; + + VXC_ReadImage(src1, input, coord.wy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input, coord.wy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input, coord.wy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src4, input, coord.wy, VXC_5BITOFFSET_XY(0, 4), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + coord.zw = coord.xx + (int2)(2, 3); + + VXC_DP4x4(data0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4); + VXC_DP4x4(data0, src0, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4); + VXC_DP4x4(data1, src0, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4); + VXC_DP4x4(data1, src0, src4, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4); + + VXC_DP2x8(src1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uniU16TimesMultiplier_PostShift_2x8); + VXC_DP2x8(src1, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), + uniU16TimesMultiplier_PostShift_2x8); + + uchar zp; + _viv_asm(COPY, zp, output_zp, 2); + + VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uniU8PlusU8_trans_0_2x8); + VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), + uniU8PlusU8_trans_1_2x8); + + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, 
src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index f528ccb35..5421a5aba 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -4902,6 +4902,710 @@ __kernel void cumsum_BF16toBF16_axis0_2D(\n\ }\n\ "; /* end of cumsum_bf16_vx*/ +static const char cumsum_ex_rev_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4;\n\ +\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_ex_rev_F16toF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniSumHorzRevF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.x < width - 8;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + coord_out.x = coord.x + 1;\n\ + coord.x += 8;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.x = width - 8;\n\ + coord_out.x = width - 1;\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.x > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x = coord.x - 1;\n\ + coord.x -= 8;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniSumHorzRevF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_QINT_EX_REV_AXIS0(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \\\n\ + short zp = (short)input_zp; \\\n\ + \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); 
\\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + for(coord.x = -1; coord.x < width - 8;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.x = coord.x + 1; \\\n\ + coord.x += 8; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + for(coord.x = width - 7; coord.x > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.x = coord.x - 1; \\\n\ + coord.x -= 8; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_QINT_EX_REV_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_QINT_EX_REV_AXIS0(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_QINT_EX_REV_AXIS0(I16, I16, vxc_short8, vxc_short8)\n\ +"; /* end of cumsum_ex_rev_axis0_vx*/ + +static const char cumsum_ex_rev_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void 
cumsum_ex_rev_F16toF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + dst ^= dst;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.y < height - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + dst ^= dst;\n\ + coord.y = height - 1;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y--;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_EX_REV_AXIS1(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = 
convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y < height - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + coord.y = height - 1; \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(height - coord.y) * 
in_out_zp_scale + output_zp; \\\n\ + coord.y--; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_EX_REV_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_ex_rev_I16toI16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y < height - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.y = height - 1;\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ + coord.y--;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_ex_rev_axis1_vx*/ + +static const char cumsum_ex_rev_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int channel;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_ex_rev_F16toF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(rev && exclusive == 0)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(rev == 0 && exclusive)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z < channel - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(rev && exclusive)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + coord.z = channel - 
1;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z--;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_EX_REV_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + if(rev && exclusive == 0) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z < channel - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(rev && exclusive) \\\n\ + { \\\n\ + coord.z = channel - 1; \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\ + coord.z--; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_EX_REV_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_ex_rev_I16toI16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z < channel - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.z = channel - 1;\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ + coord.z--;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_ex_rev_axis2_vx*/ + static const char cumsum_f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ @@ -5080,6 +5784,138 @@ __kernel void 
cumsum_F16to##out_name##_axis0_2D( \\\n\ CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16)\n\ CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8)\n\ CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_F16to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z < channel - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + coord.z = channel - 1; \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z--; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_F16to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ 
+ int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y < height - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + coord.y = height - 1; \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y--; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ "; /* end of cumsum_f16_u8_vx*/ static const char custom_softmax_vx[] = "/*\n\ @@ -5509,15 +6345,13 @@ __kernel void custom_warp_affine_bilinear_U8toU8\n\ }\n\ "; /* end of custom_warp_affine_vx*/ -static const char custom_warp_perspective_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +static const char custom_warp_affine_rgb_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float4 matrix0;\n\ -_viv_uniform float4 matrix1;\n\ -_viv_uniform float4 matrix2;\n\ -_viv_uniform float4 matrix4;\n\ -__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D\n\ +_viv_uniform float2 matrix1;\n\ +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_2D\n\ (\n\ __read_only image2d_array_t input,\n\ 
__write_only image2d_array_t output,\n\ @@ -5526,53 +6360,38 @@ __kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D\n\ float _m2,\n\ float _m3,\n\ float _m4,\n\ - float _m5,\n\ - float _m6,\n\ - float _m7,\n\ - float _m8\n\ + float _m5\n\ )\n\ {\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ \n\ - float4 coord_f0 = convert_float4(coord_in);\n\ -\n\ - float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ - z0.zw = z0.zw + 2 * matrix1.z;\n\ - float4 z1 = z0 + 4 * matrix1.z;\n\ -\n\ - z0 = 1.0f / z0;\n\ - z1 = 1.0f / z1;\n\ + float4 coord_f = convert_float4(coord_in);\n\ \n\ - coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ - float4 coord_f = coord_f0 * z0.xxyy;\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ \n\ - coord_in = convert_int4(coord_f);\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ \n\ vxc_uchar16 dst;\n\ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z0.zzww;\n\ - coord_in = convert_int4(coord_f);\n\ + coord_in.x = coord_in.x + 1;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = coord_in.x + 1;\n\ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +\n\ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.xxyy;\n\ - coord_in = convert_int4(coord_f);\n\ - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z = coord_in.z + 1;\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z = coord_in.z + 1;\n\ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.zzww;\n\ - coord_in = convert_int4(coord_f);\n\ - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ -\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ +__kernel void custom_warp_affine_bilinear_U8toU8_rgb_2D\n\ (\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -5581,32 +6400,30 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ float _m2,\n\ float _m3,\n\ float _m4,\n\ - float _m5,\n\ - float _m6,\n\ - float _m7,\n\ - float _m8\n\ + float _m5\n\ )\n\ {\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ \n\ - float4 coord_f0 = convert_float4(coord_in);\n\ 
+ float4 coord_f = convert_float4(coord_in);\n\ \n\ - float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ - z0.zw = z0.zw + 2 * matrix1.z;\n\ - float4 z1 = z0 + 4 * matrix1.z;\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ \n\ - z0 = 1.0f / z0;\n\ - z1 = 1.0f / z1;\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ \n\ - coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ - float4 coord_f = coord_f0 * z0.xxyy;\n\ + vxc_uchar16 src0, src1, src_0, src_1, dst;\n\ + VXC_ReadImage(src_0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src_1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in = convert_int4(floor(coord_f));\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ \n\ - vxc_uchar16 src0, src1, dst;\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ #if (VX_VERSION==1)\n\ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ @@ -5615,21 +6432,22 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ #if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z0.zzww;\n\ - coord_in = convert_int4(floor(coord_f));\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ #if (VX_VERSION==1)\n\ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ @@ -5638,8 +6456,13 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src_0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src_1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ #if (VX_VERSION==1)\n\ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ @@ -5648,21 +6471,22 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.xxyy;\n\ - coord_in = convert_int4(floor(coord_f));\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ #if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ #if (VX_VERSION==1)\n\ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ @@ -5671,36 +6495,10 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.zzww;\n\ - coord_in = convert_int4(floor(coord_f));\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, 
src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \\\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \\\n\ - VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0));\n\ -__kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\ +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb\n\ (\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -5709,28 +6507,20 @@ __kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\ float _m2,\n\ float _m3,\n\ float _m4,\n\ - float _m5,\n\ - float _m6,\n\ - float _m7,\n\ - float _m8\n\ + float _m5\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ \n\ - float4 coord_f0 = convert_float4(coord_in);\n\ -\n\ - float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ - z0.zw = z0.zw + 2 * matrix1.z;\n\ - float4 z1 = z0 + 4 * matrix1.z;\n\ -\n\ - z0 = 1.0f / z0;\n\ - z1 = 1.0f / z1;\n\ + float4 coord_f = convert_float4(coord_in);\n\ \n\ - coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ - float4 coord_f = coord_f0 * z0.xxyy;\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ \n\ - coord_in = convert_int4(coord_f);\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ \n\ int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ int8 input_desc;\n\ @@ -5739,28 +6529,391 @@ __kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\ _viv_asm(MOV, coord_input.w, baseAddr);\n\ \n\ vxc_uchar16 dst;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 0, 0)\n\ - coord_input.xy = coord_in.zw;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 1, 1)\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z0.zzww;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 2, 2)\n\ - coord_input.xy = coord_in.zw;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 3, 3)\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.xxyy;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 4, 4)\n\ - coord_input.xy = coord_in.zw;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 5, 5)\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.zzww;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 6, 6)\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 
0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_bilinear_U8toU8_rgb\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 src0, src1, src_0, src_1, dst;\n\ + VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ +\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ +#if (VX_VERSION==1)\n\ + 
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of custom_warp_affine_rgb_vx*/ + +static const char custom_warp_perspective_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float4 matrix0;\n\ +_viv_uniform float4 matrix1;\n\ +_viv_uniform float4 matrix2;\n\ +_viv_uniform float4 matrix4;\n\ +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(floor(coord_f));\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, 
coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \\\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \\\n\ + VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0));\n\ +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + 
coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 dst;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 0, 0)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 1, 1)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 2, 2)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 3, 3)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 4, 4)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 5, 5)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 6, 6)\n\ coord_input.xy = coord_in.zw;\n\ IMAGE_LOAD_3D(dst, 0, 0, 7, 7)\n\ \n\ @@ -8432,6 +9585,7 @@ __kernel void gather_I8toI8(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_char16 src;\n\ @@ -8456,6 +9610,7 @@ __kernel void gather_U8toU8(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_uchar16 src;\n\ @@ -8479,9 +9634,9 @@ __kernel void gather_I16toI16(\n\ int gidz = get_global_id(2); // block_num\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ -\n\ \n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -8506,6 +9661,7 @@ __kernel void gather_F16toF16(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -8526,6 +9682,7 @@ __kernel void gather_I8toI8_axis0(\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 indices = read_imagei(input1, coord.xx);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ \n\ vxc_char16 src, dst;\n\ @@ -8552,6 +9709,7 @@ __kernel void gather_U8toU8_axis0(\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 indices = read_imagei(input1, coord.xx);\n\ + indices = indices >= 0 ? 
indices : indices + axis_num;\n\ int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ \n\ vxc_uchar16 src, dst;\n\ @@ -8578,6 +9736,7 @@ __kernel void gather_I16toI16_axis0(\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 indices = read_imagei(input1, coord.xx);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ \n\ vxc_short8 src, dst;\n\ @@ -8604,6 +9763,7 @@ __kernel void gather_F16toF16_axis0(\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 indices = read_imagei(input1, coord.xx);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ \n\ vxc_short8 src, dst;\n\ @@ -8640,6 +9800,7 @@ __kernel void gather_I8toI8_array(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 1);\n\ @@ -8668,6 +9829,7 @@ __kernel void gather_U8toU8_array(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 1);\n\ @@ -8695,9 +9857,9 @@ __kernel void gather_I16toI16_array(\n\ int gidz = get_global_id(2); // block_num\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ -\n\ \n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 2);\n\ @@ -8727,6 +9889,7 @@ __kernel void gather_F16toF16_array(\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ \n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 2);\n\ @@ -8764,6 +9927,7 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \\\n\ uchar* output_ptr = get_image_ptr_from_coord(img2, coord.xy); \\\n\ __global data_type* data_ptr = (__global data_type*)input_ptr; \\\n\ __global write_type* out_ptr = (__global write_type*)output_ptr; \\\n\ + indices = indices >= 0 ? indices : indices + axis_num; \\\n\ src.s0 = data_ptr[indices.x]; \\\n\ src.s1 = data_ptr[indices.y]; \\\n\ src.s2 = data_ptr[indices.z]; \\\n\ @@ -8804,6 +9968,7 @@ __kernel void gather_batch_I8toI8(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ vxc_char16 src;\n\ @@ -8834,6 +9999,7 @@ __kernel void gather_batch_U8toU8(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ vxc_uchar16 src;\n\ @@ -8864,6 +10030,7 @@ __kernel void gather_batch_I16toI16(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? 
indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -8894,6 +10061,7 @@ __kernel void gather_batch_F16toF16(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -8915,6 +10083,7 @@ __kernel void gather_batch_I8toI8_axis0(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ \n\ vxc_char16 src, dst;\n\ @@ -8943,6 +10112,7 @@ __kernel void gather_batch_U8toU8_axis0(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ \n\ vxc_uchar16 src, dst;\n\ @@ -8971,6 +10141,7 @@ __kernel void gather_batch_I16toI16_axis0(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ \n\ vxc_short8 src, dst;\n\ @@ -8999,6 +10170,7 @@ __kernel void gather_batch_F16toF16_axis0(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ \n\ vxc_short8 src, dst;\n\ @@ -9020,6 +10192,12 @@ __kernel void gather_batch_F16toF16_axis0(\n\ static const char gather_elements_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int axis_size;\n\ +_viv_uniform uint width0;\n\ +_viv_uniform uint height0;\n\ +_viv_uniform uint width1;\n\ +_viv_uniform uint height1;\n\ +_viv_uniform uint width_out;\n\ +_viv_uniform uint height_out;\n\ \n\ #define GATHER_ELEMENTS_AXIS0_2D(name, data_type) \\\n\ __kernel void gather_elements_axis0_##name##_I32to##name##_2D \\\n\ @@ -9170,6 +10348,144 @@ GATHER_ELEMENTS_AXIS2(F16, vxc_short4)\n\ GATHER_ELEMENTS_AXIS2(I16, vxc_short4)\n\ GATHER_ELEMENTS_AXIS2(I8, vxc_char4)\n\ GATHER_ELEMENTS_AXIS2(U8, vxc_uchar4)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr 
output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1)\n\ +\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + 
__write_only image2d_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + Image index_img = create_image_from_image2d(input1, 4); \\\n\ + int* index_ptr = (int*)index_img.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1]; \\\n\ + \\\n\ + Image input_img = create_image_from_image2d(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\ + data_type data = input_ptr[index + coord.y * width0]; \\\n\ + \\\n\ + Image output_img = create_image_from_image2d(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + Image index_img = create_image_from_image2d(input1, 4); \\\n\ + int* index_ptr = (int*)index_img.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1]; \\\n\ + \\\n\ + Image input_img = create_image_from_image2d(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\ + data_type data = input_ptr[coord.x + index * width0]; \\\n\ + \\\n\ + Image output_img = create_image_from_image2d(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1)\n\ +\n\ +\n\ "; /* end of gather_elements_vx*/ static const char gather_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -9198,6 +10514,7 @@ __kernel void gather_##src0_type_name##toF16( \\\n\ \\\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\ int4 indice = read_imagei(input1, coord_in.xy); \\\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\ coord_in.w = gidz * axis_num + indice.x; \\\n\ \\\n\ read_type src; \\\n\ @@ -9234,6 +10551,7 @@ __kernel void gather_F16to##src1_type_name( \\\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\ \\\n\ int4 indice = read_imagei(input1, coord_in.xy); \\\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\ coord_in.w = gidz * axis_num + indice.x; \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -9266,6 +10584,7 @@ __kernel void gather_I16toF16(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -9296,6 +10615,7 @@ __kernel void gather_##src0_type_name##toF16_axis0( \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ int4 indices = read_imagei(input1, coord.xx); \\\n\ + indices = indices >= 0 ? 
indices : indices + axis_num; \\\n\ int2 coord_in = (int2)(indices.x, get_global_id(1)); \\\n\ \\\n\ read_type src; \\\n\ @@ -9327,6 +10647,7 @@ __kernel void gather_F16to##src1_type_name##_axis0( \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ int4 indices = read_imagei(input1, coord.xx); \\\n\ + indices = indices >= 0 ? indices : indices + axis_num; \\\n\ int2 coord_in = (int2)(indices.x, get_global_id(1)); \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -9358,6 +10679,7 @@ __kernel void gather_I16toF16_axis0(\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 indices = read_imagei(input1, coord.xx);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ \n\ vxc_short8 src;\n\ @@ -9414,6 +10736,7 @@ __kernel void gather_batch_##src0_type_name##toF16( \\\n\ { \\\n\ int4 indice = read_imagei(input1, coord_idx); \\\n\ coord_idx.y++; \\\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\ coord_in.y = gidz * axis_num + indice.x; \\\n\ \\\n\ read_type src; \\\n\ @@ -9459,6 +10782,7 @@ __kernel void gather_batch_F16to##src1_type_name( \\\n\ { \\\n\ int4 indice = read_imagei(input1, coord_idx); \\\n\ coord_idx.y++; \\\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\ coord_in.y = gidz * axis_num + indice.x; \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -9501,6 +10825,7 @@ __kernel void gather_batch_I16toF16(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -9526,6 +10851,7 @@ __kernel void gather_batch_##src0_type_name##toF16_axis0( \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ int4 indices = read_imagei(input1, coord.xz); \\\n\ + indices = indices >= 0 ? indices : indices + axis_num; \\\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \\\n\ \\\n\ read_type src; \\\n\ @@ -9560,6 +10886,7 @@ __kernel void gather_batch_F16to##src1_type_name##_axis0( \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ int4 indices = read_imagei(input1, coord.xz); \\\n\ + indices = indices >= 0 ? indices : indices + axis_num; \\\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -9594,6 +10921,7 @@ __kernel void gather_batch_I16toF16_axis0(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? 
indices : indices + axis_num;\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ \n\ vxc_short8 src, dst;\n\ @@ -10083,95 +11411,98 @@ static const char gather_nd_batch_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void gather_nd_batch_I8toI8_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ -\n\ - coord.z = indice.x * block_size + gidx;\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ vxc_char16 src;\n\ - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void gather_nd_batch_U8toU8_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.z = indice.x * block_size + gidx;\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ vxc_uchar16 src;\n\ - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void gather_nd_batch_I16toI16_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = 
get_image_ptr_from_coord(img, coord.wy);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.z = indice.x * block_size + gidx;\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ vxc_short8 src;\n\ - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void gather_nd_batch_F16toF16_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.z = indice.x * block_size + gidx;\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ vxc_short8 src;\n\ - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of gather_nd_batch_vx*/ @@ -10179,18 +11510,19 @@ static const char gather_nd_batch_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void gather_nd_batch_I8toI8_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, 0, gidy, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ @@ -10199,23 +11531,24 @@ __kernel void gather_nd_batch_I8toI8_2D(\n\ vxc_char16 src;\n\ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ 
\n\ __kernel void gather_nd_U8toU8_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, 0, gidy, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ @@ -10223,23 +11556,24 @@ __kernel void gather_nd_U8toU8_2D(\n\ \n\ vxc_uchar16 src;\n\ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void gather_nd_I16toI16_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, 0, gidy, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ @@ -10247,23 +11581,24 @@ __kernel void gather_nd_I16toI16_2D(\n\ \n\ vxc_short8 src;\n\ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void gather_nd_F16toF16_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, 0, gidy, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ @@ -10271,7 +11606,7 @@ __kernel void gather_nd_F16toF16_2D(\n\ \n\ vxc_short8 src;\n\ 
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of gather_nd_batch_2d_vx*/ @@ -10733,12 +12068,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ src_type src0; \\\n\ dst_type dst; \\\n\ vxc_short8 src1; \\\n\ @@ -10784,7 +12120,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -10834,12 +12170,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ src_type src0; \\\n\ dst_type dst; \\\n\ float scale_vari, bias_val; \\\n\ @@ -10880,7 +12217,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -10938,12 +12275,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 
coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ src_type src0; \\\n\ vxc_short8 src1, outval; \\\n\ vxc_half8 scale_h, dst; \\\n\ @@ -10996,7 +12334,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -11053,12 +12391,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ src_type src0; \\\n\ vxc_short8 outval; \\\n\ vxc_half8 dst; \\\n\ @@ -11107,7 +12446,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -11294,12 +12633,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx* rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ vxc_short8 src0; \\\n\ vxc_short8 src1; \\\n\ vxc_half8 scale_h; \\\n\ @@ -11351,7 +12691,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -11406,12 +12746,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + 
float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ vxc_short8 src0; \\\n\ src_type in_h; \\\n\ float scale_vari, bias_val; \\\n\ @@ -11458,7 +12799,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -12731,8 +14072,8 @@ _viv_uniform VXC_512Bits uniConvertF16_0_4x4;\n\ _viv_uniform VXC_512Bits uniConvertF16_1_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ \n\ -#define GRUCELL_F16_F16TOF16(act_name, act_func) \\\n\ -__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \\\n\ +#define GRUCELL_F16_F16TOF16(act_name, act_func, rec_act_name, rec_act_func) \\\n\ +__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name##_##rec_act_name( \\\n\ __read_only image2d_t hstate_in, \\\n\ __read_only image2d_t input_z_conv, \\\n\ __read_only image2d_t input_r_conv, \\\n\ @@ -12764,15 +14105,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \\\n\ \\\n\ float4 r; \\\n\ VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ - r = act_func(r); \\\n\ + r = rec_act_func(r); \\\n\ float4 h0, h1; \\\n\ VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ float4 h = h0 + r * h1; \\\n\ float4 z; \\\n\ VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ - z = act_func(z); \\\n\ - h = tanh_func(h); \\\n\ + z = rec_act_func(z); \\\n\ + h = act_func(h); \\\n\ float4 h_tm; \\\n\ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ float4 result = (1 - z) * h + z * h_tm; \\\n\ @@ -12785,14 +14126,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(TANH, tanh_func, SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func)\n\ \n\ _viv_uniform float hstate_in_scale;\n\ _viv_uniform float hstate_in_tail;\n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ -#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \\\n\ -__kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name( \\\n\ +#define GRUCELL_QNT_F16TO_QNT(name, act_func, rec_act_func, src0_type, dst_type) \\\n\ +__kernel void grucell_reset_after_activation_##name( \\\n\ __read_only image2d_t hstate_in, \\\n\ __read_only image2d_t 
input_z_conv, \\\n\ __read_only image2d_t input_r_conv, \\\n\ @@ -12824,15 +14166,15 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name \\\n\ float4 r; \\\n\ VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ - r = act_func(r); \\\n\ + r = rec_act_func(r); \\\n\ float4 h0, h1; \\\n\ VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ float4 h = h0 + r * h1; \\\n\ float4 z; \\\n\ VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ - z = act_func(z); \\\n\ - h = tanh_func(h); \\\n\ + z = rec_act_func(z); \\\n\ + h = act_func(h); \\\n\ float4 h_tm; \\\n\ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ h_tm = h_tm * hstate_in_scale + hstate_in_tail; \\\n\ @@ -12845,9 +14187,12 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ -GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ -GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\n\ +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_short8, vxc_short8)\n\ +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8)\n\ "; /* end of grucell_reset_after_activation_vx*/ static const char hswish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -21791,6 +23136,432 @@ __kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA,\n\ }\n\ "; /* end of matrixmul_bf16_vx*/ +static const char matrixmul_cross_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float mulKIn0In1Zp;\n\ +_viv_uniform float inOutScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\ +\n\ +#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \\\n\ + int axis_size, int inner_size, int outer_size, int axis_size0, \\\n\ + int inner_size0, int outer_size0, int axis_size1, int inner_size1, \\\n\ + int outer_size1, int axis_size2, int inner_size2, int outer_size2) \\\n\ +{ \\\n\ + read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \\\n\ + vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \\\n\ + int gidz = get_global_id(2); 
\\\n\ + for(int j = 0; j < outer_size; j++) \\\n\ + { \\\n\ + for(int i = 0; i < inner_size; i++) \\\n\ + { \\\n\ + vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \\\n\ + int4 coord_a = (int4)(0, get_global_id(1), gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + sum0 += tempA0 + tempB0; \\\n\ + sum1 += tempA1 + tempB1; \\\n\ + sum2 += tempA2 + tempB2; \\\n\ + sum3 += tempA3 + tempB3; \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = get_global_id(1); \\\n\ + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, 
coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +GEMM_QINT_TO_QINT_CROSS(U8, vxc_uchar16)\n\ +GEMM_QINT_TO_QINT_CROSS(I8, vxc_char16)\n\ +\n\ +__kernel void gemm_F16F16toF16_cross(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N,\n\ + int axis_size, int inner_size, int outer_size, int axis_size0,\n\ + int inner_size0, int outer_size0, int axis_size1, int inner_size1,\n\ + int outer_size1, int axis_size2, int inner_size2, int outer_size2)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ + uint gidz = get_global_id(2);\n\ + for(int j = 0; j < outer_size; j++)\n\ + {\n\ + for(int i = 0; i < inner_size; i++)\n\ + {\n\ + int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0);\n\ +\n\ + half4 valC;\n\ + vxc_short8 srcA0, srcA1, srcA2, srcA3, outC;\n\ + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3;\n\ + vxc_short16 srcB;\n\ + vxc_half16 tmpB;\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, 
VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4; coord_b.y += 4;\n\ + _viv_asm(COPY, tmpA0, srcA0, 16);\n\ + _viv_asm(COPY, tmpA1, srcA1, 16);\n\ + _viv_asm(COPY, tmpA2, srcA2, 16);\n\ + _viv_asm(COPY, tmpA3, srcA3, 16);\n\ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16);\n\ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16);\n\ + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + sum0 += (tempA0);\n\ + sum1 += (tempA1);\n\ + sum2 += (tempA2);\n\ + sum3 += (tempA3);\n\ + }\n\ + coord_b.y = gidy;\n\ + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(CONV, valC, sum0);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum1);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum2);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum3);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +"; /* end of matrixmul_cross_vx*/ + +static const char matrixmul_cross_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input0_ZP;\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +_viv_uniform int outer;\n\ +\n\ +#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + short in0_zp, in1_zp; \\\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \\\n\ + for(int i = 0; i < outer; i++) \\\n\ + { \\\n\ + read_type srcA, srcB, outC; \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 
i : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \\\n\ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \\\n\ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \\\n\ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2) + i * 
get_global_size(2); \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +GEMM_QINT_TO_QINT_MERGE(I16, vxc_short8)\n\ +\n\ +#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \\\n\ + int axis_size, int inner_size, int outer_size, int axis_size0, \\\n\ + int inner_size0, int outer_size0, int axis_size1, int inner_size1, \\\n\ + int outer_size1, int axis_size2, int inner_size2, int outer_size2) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + uint gidz = get_global_id(2); \\\n\ + short in0_zp, in1_zp; \\\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \\\n\ + for(int j = 0; j < outer_size; j++) \\\n\ + { \\\n\ + for(int i = 0; i < inner_size; i++) \\\n\ + { \\\n\ + read_type srcA, srcB, outC; \\\n\ + int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \\\n\ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \\\n\ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \\\n\ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, 
outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +GEMM_QINT_TO_QINT_CROSS(I16, vxc_short8)\n\ +"; /* end of matrixmul_cross_i16_vx*/ + static const char matrixmul_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ @@ -23001,6 +24772,302 @@ __kernel void gemm_transb_I16I16toI16(image2d_array_t inputA,\n\ }\n\ "; /* end of matrixmul_i16_vx*/ +static const char matrixmul_merge_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float mulKIn0In1Zp;\n\ +_viv_uniform float inOutScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +_viv_uniform int outer;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Hi_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Hi_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\ +\n\ +#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \\\n\ + vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \\\n\ + for(int i = 0; i < outer; i++) \\\n\ + { \\\n\ + vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \\\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0); \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + sum0 += tempA0 + tempB0; \\\n\ + sum1 += tempA1 + tempB1; \\\n\ + sum2 += tempA2 + tempB2; \\\n\ + sum3 += tempA3 + tempB3; \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = get_global_id(1); \\\n\ + coord_b.z = get_global_id(2) + i * get_global_size(2); \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 
0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +GEMM_QINT_TO_QINT_MERGE(U8, vxc_uchar16)\n\ +GEMM_QINT_TO_QINT_MERGE(I8, vxc_char16)\n\ +\n\ +#if (VX_VERSION==2)\n\ +__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ + for(int i = 0; i < outer; i++)\n\ + {\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0);\n\ +\n\ + half4 valC;\n\ + vxc_short8 srcA0, srcA1, srcA2, srcA3, outC;\n\ + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3;\n\ + vxc_short16 srcB;\n\ + vxc_half16 tmpB;\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4; coord_b.y += 4;\n\ + _viv_asm(COPY, tmpA0, srcA0, 16);\n\ + _viv_asm(COPY, tmpA1, srcA1, 16);\n\ + _viv_asm(COPY, tmpA2, srcA2, 16);\n\ + _viv_asm(COPY, tmpA3, srcA3, 16);\n\ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16);\n\ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16);\n\ + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + 
uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + sum0 += (tempA0);\n\ + sum1 += (tempA1);\n\ + sum2 += (tempA2);\n\ + sum3 += (tempA3);\n\ + }\n\ + coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2) + i * get_global_size(2);\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(CONV, valC, sum0);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum1);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum2);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum3);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +#else\n\ +__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ + for(int i = 0; i < outer; i++)\n\ + {\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0);\n\ +\n\ + half4 valC;\n\ + vxc_short8 srcA0, srcB0, srcA1, srcB1, outC;\n\ + vxc_half8 tmpA0, tmpB0, tmpA1, tmpB1;\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4; coord_b.y += 4;\n\ + _viv_asm(COPY, tmpA0, srcA0, 16);\n\ + _viv_asm(COPY, tmpB0, srcB0, 16);\n\ + _viv_asm(COPY, tmpA1, srcA1, 16);\n\ + _viv_asm(COPY, tmpB1, srcB1, 16);\n\ +\n\ + VXC_DP4x4(tempA0, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row0Lo_4x4);\n\ + VXC_DP4x4(tempB0, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row0Hi_4x4);\n\ + VXC_DP4x4(tempA1, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row1Lo_4x4);\n\ + VXC_DP4x4(tempB1, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row1Hi_4x4);\n\ + VXC_DP4x4(tempA2, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row0Lo_4x4);\n\ + VXC_DP4x4(tempB2, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row0Hi_4x4);\n\ + VXC_DP4x4(tempA3, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row1Lo_4x4);\n\ + VXC_DP4x4(tempB3, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row1Hi_4x4);\n\ + sum0 += (tempA0 + tempB0);\n\ + sum1 += (tempA1 + tempB1);\n\ + sum2 += (tempA2 + tempB2);\n\ + sum3 += (tempA3 + tempB3);\n\ + }\n\ + coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2) + i * get_global_size(2);\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(CONV, valC, sum0);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, 
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum1);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum2);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum3);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +#endif\n\ +"; /* end of matrixmul_merge_vx*/ + static const char matrixmul_transA_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int input0_ZP;\n\ @@ -27977,6 +30044,791 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_BF1 }\n\ }"; /* end of moments_u8_axis012_vx*/ +static const char nearest_grid_sample_BF16_to_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8;\n\ +_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8;\n\ +\n\ +#define GRID_SAMPLE_BF16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 src; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x 
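The gemm_F16F16toF16_merge kernels above tile a batched fp16 matrix multiply: each work-item walks K in steps of 4 and accumulates a 4x4 block of C in fp32 before converting back to fp16, with ac2zero/bc2zero selecting which input is broadcast across the outer batch loop. A minimal scalar sketch of that computation, assuming row-major float buffers and using a_broadcast/b_broadcast as stand-ins for the kernel's batch handling (names are illustrative, not the driver's host API):

/* Scalar reference for the batched GEMM that gemm_F16F16toF16_merge tiles in
 * 4x4 blocks along K. Hypothetical helper for illustration only; the real
 * kernel works on fp16 image tiles and folds the batch broadcast
 * (ac2zero/bc2zero) into the z coordinate of the image loads. */
#include <stddef.h>

static void gemm_ref(const float *A, const float *B, float *C,
                     size_t M, size_t K, size_t N,
                     size_t batches, int a_broadcast, int b_broadcast)
{
    for (size_t b = 0; b < batches; ++b) {
        const float *Ab = A + (a_broadcast ? 0 : b * M * K);   /* shared A when broadcast */
        const float *Bb = B + (b_broadcast ? 0 : b * K * N);   /* shared B when broadcast */
        float       *Cb = C + b * M * N;
        for (size_t m = 0; m < M; ++m)
            for (size_t n = 0; n < N; ++n) {
                float acc = 0.0f;                              /* fp32 accumulator, like sum0..sum3 */
                for (size_t k = 0; k < K; ++k)
                    acc += Ab[m * K + k] * Bb[k * N + n];
                Cb[m * N + n] = acc;
            }
    }
}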
= x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_BF16_BF16toBF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + vxc_short8 read_src;\n\ + VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8);\n\ + _viv_asm(COPY, fxy0, read_src, 16);\n\ + VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8);\n\ + _viv_asm(COPY, fxy1, read_src, 16);\n\ +\n\ +\n\ +\n\ + GRID_SAMPLE_BF16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of nearest_grid_sample_BF16_to_BF16_vx*/ + +static const char nearest_grid_sample_F16_to_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;\n\ +_viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ +\n\ +#define GRID_SAMPLE_F16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 src; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ 
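The nearest_grid_sample_* kernels above first map the normalized grid values into pixel space with fxy * half_input0_wh + add_float_value and then truncate with convert_int4; the resulting (x, y) pair is gathered once per channel in the depth loop. A scalar sketch of that index math, assuming the two uniforms are host-folded from the usual align_corners formulas (the driver may additionally fold a rounding offset into add_float_value; that detail is not inferred here):

/* Scalar sketch of the nearest-neighbour grid_sample index math.
 * Assumed host folding per axis:
 *   align_corners: half_extent = (size - 1) * 0.5f, add_value = (size - 1) * 0.5f
 *   otherwise:     half_extent =  size      * 0.5f, add_value =  size * 0.5f - 0.5f
 * so a grid value g in [-1, 1] lands in pixel space. Illustration only. */
static int nearest_index(float g, float half_extent, float add_value)
{
    float pixel = g * half_extent + add_value;  /* normalized -> pixel space */
    return (int)pixel;                          /* truncate toward zero, as convert_int4 does */
}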
+ { \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_F16_F32toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +__kernel void nearest_grid_sample_F16_U8toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_uchar16 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void nearest_grid_sample_F16_F16toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + 
_viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of nearest_grid_sample_F16_to_F16_vx*/ + +static const char nearest_grid_sample_F16_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +#define GRID_SAMPLE_F16_to_U8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 s0; \\\n\ + vxc_uchar16 result; \\\n\ + vxc_half8 src; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, s0, 16); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + float4 dst4; \\\n\ + int4 dst; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = 
x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, s0, 16); \\\n\ + } \\\n\ + VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_F16_F32toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_F16_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_uchar16 read_coord;\n\ +\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_F16_F16toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +"; /* end of 
nearest_grid_sample_F16_to_U8_vx*/ + +static const char nearest_grid_sample_I16_to_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;\n\ +\n\ +\n\ +#define GRID_SAMPLE_I16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 src, dst; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_I16_I16toI16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t 
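The F16-to-U8 grid-sample kernels above store each gathered value as dst4 * uint8Scale + output_ZP, round with convert_int4_rte and saturate to 8 bits in the final VXC_DP2x8 extract. The same step in scalar C, as a sketch rather than the driver's code:

#include <math.h>
#include <stdint.h>

/* Scalar sketch of the F16 -> asymmetric U8 store used by the *_toU8 kernels:
 * out = clamp(round(v * uint8Scale + output_ZP)). uint8Scale stands for the
 * reciprocal output scale folded on the host; the clamp models the saturating
 * extract. Illustrative only (link with -lm for lrintf). */
static uint8_t quantize_u8(float v, float uint8Scale, float output_ZP)
{
    float q = v * uint8Scale + output_ZP;
    long  r = lrintf(q);                  /* round to nearest, like convert_int4_rte */
    if (r < 0)   r = 0;
    if (r > 255) r = 255;
    return (uint8_t)r;
}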
input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_short8 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1_scale;\n\ + fxy1 = fxy1 * input1_scale;\n\ +\n\ + GRID_SAMPLE_I16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of nearest_grid_sample_I16_to_I16_vx*/ + +static const char nearest_grid_sample_I8_to_I8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;\n\ +\n\ +#define GRID_SAMPLE_I8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_char16 src, dst; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), 
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_I8_I8toI8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_char16 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1_scale;\n\ + fxy1 = fxy1 * input1_scale;\n\ +\n\ + GRID_SAMPLE_I8_PROCESS();\n\ +\n\ +}\n\ +"; /* end of nearest_grid_sample_I8_to_I8_vx*/ + +static const char nearest_grid_sample_U8_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +#define GRID_SAMPLE_U8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_uchar16 src, dst; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, 
output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP2x8(dst, src, multiplier, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP2x8(dst, src, multiplier, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_U8_F32toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_U8_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_uchar16 read_coord;\n\ +\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void nearest_grid_sample_U8_F16toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ 
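The U8-to-U8 path above rescales with uniMultiplyAndPostShift_2x8 and the packed multAndoutZP constant instead of going through float. A common scalar form of that multiply-and-post-shift requantization is sketched below; mult, shift and the zero points are hypothetical host-side parameters, since the exact packing is decided where the uniform is built:

#include <stdint.h>

/* Scalar sketch of a multiply-and-post-shift requantization: a U8 input is
 * moved to the output quantization with an integer multiplier, a rounding
 * right shift (shift >= 1 assumed) and an output zero point, then saturated.
 * Parameter names are illustrative, not the driver's packing. */
static uint8_t requantize_u8(uint8_t in, int in_zp,
                             int32_t mult, int shift, int out_zp)
{
    int32_t v = ((int32_t)in - in_zp) * mult;
    v = (v + (1 << (shift - 1))) >> shift;   /* rounding right shift */
    v += out_zp;
    if (v < 0)   v = 0;
    if (v > 255) v = 255;
    return (uint8_t)v;
}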
+ int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +"; /* end of nearest_grid_sample_U8_to_U8_vx*/ + static const char one_hot_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataConvert_0_4x4;\n\ @@ -29077,8 +31929,8 @@ __kernel void pow_##name \\\n\ \\\n\ src0_type src0; \\\n\ copy0_type data0; \\\n\ - src0_type src1; \\\n\ - copy0_type data1; \\\n\ + src1_type src1; \\\n\ + copy1_type data1; \\\n\ VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, data0, src0, 16); \\\n\ VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -29149,8 +32001,8 @@ __kernel void pow_##name##_2D \\\n\ \\\n\ src0_type src0; \\\n\ copy0_type data0; \\\n\ - src0_type src1; \\\n\ - copy0_type data1; \\\n\ + src1_type src1; \\\n\ + copy1_type data1; \\\n\ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, data0, src0, 16); \\\n\ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -29331,9 +32183,21 @@ _viv_uniform int zp;\n\ _viv_uniform float outputScale;\n\ \n\ __kernel void pre_process_bgra_scale_U8toU8(\n\ - __read_only image2d_array_t input, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float r_scale,\n\ + int reverse_channel,\n\ + int trans,\n\ + float g_scale,\n\ + float b_scale\n\ + )\n\ {\n\ int4 gidx = get_global_id(0);\n\ int gidy = get_global_id(1);\n\ @@ -29389,6 +32253,7 @@ __kernel void pre_process_bgra_scale_U8toU8(\n\ int4 tmp1, tmp2, result1, result2;\n\ float4 tmpDst, tmp0;\n\ float4 mean = (float4)(bMean, gMean, rMean, 0);\n\ + float4 var = (float4)(b_scale, g_scale, r_scale, 0);\n\ //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x);\n\ int tmpV = 1 << 19;\n\ vxc_short8 tmpFx;\n\ @@ -29451,9 +32316,21 @@ __kernel void pre_process_bgra_scale_U8toU8(\n\ }\n\ \n\ __kernel void pre_process_bgra_copy_U8toU8(\n\ - __read_only image2d_array_t input, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + 
float bMean,\n\ + float r_scale,\n\ + int reverse_channel,\n\ + int trans,\n\ + float g_scale,\n\ + float b_scale\n\ +)\n\ {\n\ int2 pos = (int2)((get_global_id(0) + (*xOffset)) << 2, get_global_id(1) + (*yOffset));\n\ \n\ @@ -29468,10 +32345,10 @@ __kernel void pre_process_bgra_copy_U8toU8(\n\ VXC_DP4x4(tmpG, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGfromBgra_4x4);\n\ VXC_DP4x4(tmpR, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRfromBgra_4x4);\n\ \n\ - tmpDst = (tmpB - bMean) * var;\n\ + tmpDst = (tmpB - bMean) * b_scale;\n\ result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ \n\ - tmpDst = (tmpG - gMean) * var;\n\ + tmpDst = (tmpG - gMean) * g_scale;\n\ result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\ VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ \n\ @@ -29481,7 +32358,7 @@ __kernel void pre_process_bgra_copy_U8toU8(\n\ dstPos.z = 1;\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - tmpDst = (tmpR - rMean) * var;\n\ + tmpDst = (tmpR - rMean) * r_scale;\n\ result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ VXC_DP2x8(dst, result1, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ \n\ @@ -30016,7 +32893,10 @@ static const char pre_process_nv12_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform int bOrder;\n\ _viv_uniform int rOrder;\n\ \n\ -_viv_uniform float outputScaleVar;\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ +\n\ _viv_uniform float bMeanScaleVarZp;\n\ _viv_uniform float gMeanScaleVarZp;\n\ _viv_uniform float rMeanScaleVarZp;\n\ @@ -30041,10 +32921,12 @@ __kernel void pre_process_nv12_copy_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ int trans, \\\n\ - int nv_type \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -30078,21 +32960,21 @@ __kernel void pre_process_nv12_copy_##name \\\n\ dst_type dst0; \\\n\ save_type dst; \\\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ dstPos.z = bOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ dstPos.z = 1; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ dstPos.z = rOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -30110,7 +32992,10 @@ static 
const char pre_process_nv12_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform int bOrder;\n\ _viv_uniform int rOrder;\n\ \n\ -_viv_uniform float outputScaleVar;\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ +\n\ _viv_uniform float bMeanScaleVarZp;\n\ _viv_uniform float gMeanScaleVarZp;\n\ _viv_uniform float rMeanScaleVarZp;\n\ @@ -30143,10 +33028,12 @@ __kernel void pre_process_nv12_scale_##name##_gq \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ int trans, \\\n\ - int nv_type \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ uint4 gidx = get_global_id(0); \\\n\ @@ -30200,21 +33087,21 @@ __kernel void pre_process_nv12_scale_##name##_gq \\\n\ dst_type dst0; \\\n\ save_type dst; \\\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ dstPos.z = bOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ dstPos.z = 1; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ dstPos.z = rOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ @@ -30239,10 +33126,12 @@ __kernel void pre_process_nv12_scale_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ int trans, \\\n\ - int nv_type \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ uint4 gidx = get_global_id(0); \\\n\ @@ -30268,102 +33157,445 @@ __kernel void pre_process_nv12_scale_##name \\\n\ coord.x = sx.w; \\\n\ VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_uv.x = uvX.y; \\\n\ - VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_uv.x = uvX.z; \\\n\ - VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_uv.x = uvX.w; \\\n\ - VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.y; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.z; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.w; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 
0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (nv_type == 1) \\\n\ + { \\\n\ + UV.s01234567 = UV.s10325476; \\\n\ + } \\\n\ + \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ +NV12_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +"; /* end of pre_process_nv12_scale_vx*/ + +static const char pre_process_rgb_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +_viv_uniform VXC_512Bits uniUnpackToR;\n\ +_viv_uniform VXC_512Bits uniUnpackToG;\n\ +_viv_uniform VXC_512Bits uniUnpackToB;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform int r_order;\n\ +_viv_uniform int b_order;\n\ +\n\ +#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\ +\n\ +#define IMAGE_PRE_PROCESS(dst_name, conv_type, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb_scale_U8to##dst_name \\\n\ + ( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); 
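With var split into r_scale/g_scale/b_scale, the NV12 pre-process kernels above fold each channel's normalization into two constants, outputScaleVar_c and cMeanScaleVarZp, so the per-pixel work stays a single multiply-add. The algebra behind that fold, written out as a sketch (the driver's host code is assumed to compute the same two values):

/* Per-channel fold behind outputScaleVar_{b,g,r} and {b,g,r}MeanScaleVarZp:
 *   out = (pix - mean) * chan_scale * output_scale + output_zp
 *       =  pix * fold_scale(...)    + fold_bias(...)
 * Sketch of the assumed host-side computation, not the driver's code. */
static float fold_scale(float chan_scale, float output_scale)
{
    return chan_scale * output_scale;                      /* outputScaleVar_c   */
}

static float fold_bias(float mean, float chan_scale,
                       float output_scale, float output_zp)
{
    return output_zp - mean * chan_scale * output_scale;   /* cMeanScaleVarZp    */
}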
\\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + /*x*/ \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ + /*y*/ \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0RGB1, line0RGB2; \\\n\ + vxc_uchar16 line1RGB3, line1RGB4; \\\n\ + int4 coord; \\\n\ + sx = (sx + (*xOffset)) * 3; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ + int2 coord1 = (int2)(sx.w, coord.w); \\\n\ + VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \\\n\ + \\\n\ + bgrMean *= (float4)(b_scale, g_scale, r_scale, 0); \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 tt; \\\n\ + vxc_uchar4 val; \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, r_order, 0); \\\n\ + \\\n\ + vxc_uchar8 line1, line2; \\\n\ + \\\n\ + /*R*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ + \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + /*convert U8 to dst*/ \\\n\ + dst_type dst; \\\n\ + tmp_dst = tmp_dst * r_scale - bgrMean.zzzz; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + conv_type dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), 
uniExtract8Data_2x8); \\\n\ + copy_type result; \\\n\ + _viv_asm(COPY, result, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + /*G*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ + \\\n\ + coord_out.z = 1; \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * g_scale - bgrMean.y; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, result, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + /*B*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ + \\\n\ + coord_out.z = b_order; \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * b_scale - bgrMean.x; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, result, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +IMAGE_PRE_PROCESS(U8, uint4, vxc_uchar16, vxc_uchar16)\n\ +IMAGE_PRE_PROCESS(I8, int4, vxc_char16, vxc_char16)\n\ +IMAGE_PRE_PROCESS(I16, int4, vxc_short8, vxc_short8)\n\ +IMAGE_PRE_PROCESS(F16, half4, vxc_half8, vxc_short8)\n\ +"; /* end of pre_process_rgb_vx*/ + +static const char pre_process_rgb888_planar_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits 
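The pre_process_rgb scale kernels above resize with fixed-point bilinear filtering before the per-channel normalization: the x/y ratios behave like Q15 step sizes, the per-pixel fractions are reduced to Q10 (uniAddRShift), the horizontal blend stays in Q10, the vertical blend lands in Q20, and DESCALE rounds back with (v + (1 << 19)) >> 20. A scalar sketch of that pipeline; the Q-format reading is an assumption reconstructed from the shift constants, and edge clamping plus the RGB de-interleave are omitted:

#include <stdint.h>

/* Scalar sketch of the assumed Q15/Q10 fixed-point bilinear resample.
 * x_ratio_q15/y_ratio_q15 are taken to be (in_size << 15) / out_size as
 * prepared by the host; src points at a single-channel plane and the caller
 * guarantees the sampled 2x2 window is in-bounds. Illustration only. */
static uint8_t bilinear_fixed(const uint8_t *src, int stride,
                              int x_ratio_q15, int y_ratio_q15,
                              int dst_x, int dst_y)
{
    int fx0 = dst_x * x_ratio_q15 + (x_ratio_q15 >> 1) - (1 << 14);
    int fy0 = dst_y * y_ratio_q15 + (y_ratio_q15 >> 1) - (1 << 14);
    int sx  = (int)(fx0 & 0xffff8000);            /* integer part, still in Q15 */
    int sy  = (int)(fy0 & 0xffff8000);
    int fx  = ((fx0 - sx) + (1 << 4)) >> 5;       /* Q15 -> Q10 fraction, rounded */
    int fy  = ((fy0 - sy) + (1 << 4)) >> 5;
    sx >>= 15;
    sy >>= 15;

    const uint8_t *p0 = src + sy * stride + sx;
    const uint8_t *p1 = p0 + stride;
    int top = (p0[0] << 10) + fx * (p0[1] - p0[0]);   /* Q10 horizontal blend */
    int bot = (p1[0] << 10) + fx * (p1[1] - p1[0]);
    int v   = (top << 10) + fy * (bot - top);         /* Q20 vertical blend   */
    return (uint8_t)((v + (1 << 19)) >> 20);          /* DESCALE rounding     */
}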
uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int4 rgb_order;\n\ +\n\ +#define RESIZE_BILINEAR_4X1(scale, mean, output, _coord) \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.w; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + coord_in.x = coord.x; \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, _coord, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; 
\\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAddRShift); \\\n\ + \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ + int4 coord; \\\n\ + int4 coord_in = (int4)(0, 0, 0, 0); \\\n\ + sx = sx + *xOffset; \\\n\ + coord = sx.xyzw; \\\n\ + coord_in.y = sy + *yOffset; \\\n\ + coord_in.x = coord.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.w; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + coord_in.x = coord.x; \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 tt; \\\n\ + vxc_uchar4 val; \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\ + coord_out.yzw += rgb_order.xyz; \\\n\ \\\n\ - if (nv_type == 1) \\\n\ - { \\\n\ - UV.s01234567 = UV.s10325476; \\\n\ - } \\\n\ + vxc_uchar8 line1, line2; \\\n\ \\\n\ - vxc_char16 tmpUV; \\\n\ - short tmpVal = 128; \\\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ \\\n\ - float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ \\\n\ - conv_type result; \\\n\ - dst_type dst0; \\\n\ - save_type dst; \\\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ - _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ - dstPos.z = bOrder; \\\n\ - VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ - 
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ \\\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ - _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ - dstPos.z = 1; \\\n\ - VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + conv_type dst0; \\\n\ + dst_type dst1; \\\n\ + copy_type dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ - _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ - dstPos.z = rOrder; \\\n\ - VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + RESIZE_BILINEAR_4X1(g_scale, gMean, output, coord_out.xz) \\\n\ + RESIZE_BILINEAR_4X1(b_scale, bMean, output, coord_out.xw) \\\n\ }\n\ -NV12_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ -NV12_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ -NV12_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ -NV12_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ -"; /* end of pre_process_nv12_scale_vx*/ - -static const char pre_process_rgb_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniVecShift10;\n\ -_viv_uniform VXC_512Bits uniAddRShift;\n\ -_viv_uniform VXC_512Bits uniGetTempVal;\n\ -_viv_uniform VXC_512Bits uniExtractBytes;\n\ -_viv_uniform VXC_512Bits uniUnpackToR;\n\ -_viv_uniform VXC_512Bits uniUnpackToG;\n\ -_viv_uniform VXC_512Bits uniUnpackToB;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ -_viv_uniform float outputZP;\n\ -_viv_uniform int r_order;\n\ -_viv_uniform int b_order;\n\ -\n\ -#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\ +PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ +PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ \n\ -#define IMAGE_PRE_PROCESS(dst_name, conv_type, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb_scale_U8to##dst_name \\\n\ +#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ ( \\\n\ -__read_only image2d_array_t input, \\\n\ -__write_only image2d_array_t output, \\\n\ - global int *xRatio, \\\n\ - global int *yRatio, \\\n\ - global int *xOffset, \\\n\ - global int *yOffset, \\\n\ - float rMean, \\\n\ - float gMean, \\\n\ - float bMean, \\\n\ - float f32Var, \\\n\ - int reverse_channel, \\\n\ - int trans \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t 
output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ - int4 xPos = get_global_id(0); \\\n\ - int yPos = get_global_id(1); \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ xPos += (int4)(0, 1, 2, 3); \\\n\ \\\n\ - /*x*/ \\\n\ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ int4 sx = fx0 & 0xffff8000; \\\n\ fx0 -= sx; \\\n\ @@ -30371,137 +33603,485 @@ __write_only image2d_array_t output, \\\n\ \\\n\ vxc_short4 fx; \\\n\ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ - /*y*/ \\\n\ + \\\n\ int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ int sy = fy & 0xffff8000; \\\n\ \\\n\ fy -= sy; \\\n\ sy = sy >> 15; \\\n\ - \\\n\ fy = (fy + (1<< 4)) >> 5; \\\n\ \\\n\ - vxc_uchar16 line0RGB1, line0RGB2; \\\n\ - vxc_uchar16 line1RGB3, line1RGB4; \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ int4 coord; \\\n\ - sx = (sx + (*xOffset)) * 3; \\\n\ - coord.xyz = sx.xyz; \\\n\ - coord.w = sy + *yOffset; \\\n\ + sx = sx + *xOffset; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ int2 coord1 = (int2)(sx.w, coord.w); \\\n\ - VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \\\n\ - \\\n\ - bgrMean *= f32Var; \\\n\ + int4 coord_in = (int4)(coord.xw, 0, 0); \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, 
VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ int4 test01, temp1; \\\n\ int4 test02, temp2; \\\n\ - int4 tt; \\\n\ - vxc_uchar4 val; \\\n\ - int4 coord_out = (int4)(xPos.x, yPos, r_order, 0); \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\ + coord_out.yzw += rgb_order.xyz; \\\n\ \\\n\ - vxc_uchar8 line1, line2; \\\n\ - \\\n\ - /*R*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ - \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp1 = temp1 + test01; \\\n\ \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp2 = temp2 + test02; \\\n\ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ \\\n\ vxc_float4 tmp_dst; \\\n\ vxc_uchar4 u8_dst; \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ - /*convert U8 to dst*/ \\\n\ - dst_type dst; \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - conv_type dst0; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - copy_type result; \\\n\ - _viv_asm(COPY, result, dst, 16); \\\n\ - VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int4 dst0; \\\n\ + write_type dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ \\\n\ - /*G*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - coord_out.z = 1; \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + coord_in.x = coord.x; \\\n\ + coord_in.z = 1; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, 
VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp1 = temp1 + test01; \\\n\ \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp2 = temp2 + test02; \\\n\ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ - \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ - uniConvertIntergetoF32_4x4); \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.y; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, result, dst, 16); \\\n\ - VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xz, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - /*B*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ + coord_in.x = coord.x; \\\n\ + coord_in.z = 2; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + 
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - coord_out.z = b_order; \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp1 = temp1 + test01; \\\n\ \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp2 = temp2 + test02; \\\n\ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ - \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.x; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, result, dst, 16); \\\n\ - VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_0_vx*/ + +static const char pre_process_rgb888_planar_1_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int4 rgb_order;\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, 
\\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2)(*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + dst_type dst0, dst1; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord.x = coord.z + 8; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + copy_type tmp_dst; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.yw = coord_out.ww + rgb_order.xy; \\\n\ + VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + coord_out.w = coord.w + rgb_order.z; \\\n\ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, 
vxc_short8)\n\ +\n\ +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + write_type dst; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.xyw = coord.www + rgb_order.xyz; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + \\\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + \\\n\ + VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb888_planar_1_vx*/ + +static const char pre_process_rgb888_planar_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ 
+_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ +_viv_uniform int4 rgb_order;\n\ +\n\ +__kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float r_scale,\n\ + int reverse,\n\ + int height,\n\ + float g_scale,\n\ + float b_scale\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_out;\n\ +\n\ + vxc_uchar16 src0, src1, src2, src3;\n\ + vxc_uchar16 dst0, dst1, dst2;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + coord_out.xy = (coord_in.xy >> 2) * 3;\n\ + coord_out.zw = coord_in.yy + (int2)(1, 2);\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + int4 coord_r = coord_out;\n\ + coord_r.yzw += rgb_order.xxx;\n\ + VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, 
VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + int4 coord_g = coord_out;\n\ + coord_g.yzw += rgb_order.yyy;\n\ + VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + int4 coord_b = coord_out;\n\ + coord_b.yzw += rgb_order.zzz;\n\ + VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -IMAGE_PRE_PROCESS(U8, uint4, vxc_uchar16, vxc_uchar16)\n\ -IMAGE_PRE_PROCESS(I8, int4, vxc_char16, vxc_char16)\n\ -IMAGE_PRE_PROCESS(I16, int4, vxc_short8, vxc_short8)\n\ -IMAGE_PRE_PROCESS(F16, half4, vxc_half8, vxc_short8)\n\ -"; /* end of pre_process_rgb_vx*/ +\n\ +__kernel void pre_process_rgb888_planar_half_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float r_scale,\n\ + int reverse,\n\ + int height,\n\ + float g_scale,\n\ + float b_scale\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_uchar16 src0, src1, src2;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + 
VXC_ReadImage2DArray(src1, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int2 coord = coord_in.xy >> 1;\n\ +\n\ + int4 coord_rgb = coord.xyyy;\n\ + coord_rgb.yzw += rgb_order.xyz;\n\ + VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_rgb.xz, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_rgb888_planar_2_vx*/ -static const char pre_process_rgb888_planar_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_nhwc_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniVecShift10;\n\ _viv_uniform VXC_512Bits uniAddRShift;\n\ @@ -30510,11 +34090,15 @@ _viv_uniform VXC_512Bits uniExtractBytes;\n\ \n\ _viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ \n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -#define RESIZE_BILINEAR_4X1(mean, output) \\\n\ +#define RESIZE_BILINEAR_4X1(scale, mean) \\\n\ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ @@ -30552,21 +34136,13 @@ _viv_uniform float output_zp;\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ - tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \\\n\ - _viv_asm(CONV, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst1, 8); \\\n\ - VXC_WriteImage(output, coord_out, dst, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst);\n\ #define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -30574,7 +34150,10 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ @@ -30636,7 +34215,9 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ int4 test02, temp2; \\\n\ int4 tt; \\\n\ vxc_uchar4 val; \\\n\ - int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, 
yPos); \\\n\ + coord_out.x = coord_out.x * 3; \\\n\ + coord_out.z = coord_out.x + 8; \\\n\ \\\n\ vxc_uchar8 line1, line2; \\\n\ \\\n\ @@ -30659,29 +34240,36 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ conv_type dst0; \\\n\ - dst_type dst1; \\\n\ - copy_type dst; \\\n\ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + dst_type dst1, dst2; \\\n\ + copy_type data0, data1, dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ _viv_asm(CONV, dst0, tmp_dst); \\\n\ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst1, 8); \\\n\ - VXC_WriteImage(output0, coord_out, dst, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - RESIZE_BILINEAR_4X1(gMean, output1) \\\n\ - RESIZE_BILINEAR_4X1(bMean, output2) \\\n\ + RESIZE_BILINEAR_4X1(g_scale, gMean) \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + RESIZE_BILINEAR_4X1(b_scale, bMean) \\\n\ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, data0, dst1, 16); \\\n\ + _viv_asm(COPY, data1, dst2, 16); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_0_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ \n\ #define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -30689,7 +34277,10 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ @@ -30745,6 +34336,7 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ int4 test01, temp1; \\\n\ int4 test02, temp2; \\\n\ int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + coord_out.x = coord_out.x * 3; \\\n\ \\\n\ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniVecShift10); \\\n\ @@ -30767,13 +34359,11 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ int4 dst0; \\\n\ - write_type dst; \\\n\ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + write_type dst1, dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ - 
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - \\\n\ - VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_in.x = coord.x; \\\n\ coord_in.z = 1; \\\n\ @@ -30813,12 +34403,10 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ - tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \\\n\ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - \\\n\ - VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_in.x = coord.x; \\\n\ coord_in.z = 2; \\\n\ @@ -30858,32 +34446,591 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ - tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \\\n\ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_0_2x8); \\\n\ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_nhwc_0_vx*/ + +static const char pre_process_rgb888_planar_nhwc_1_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2)(*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + 
dst_type dst0, dst1; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.z = coord_out.z * 3; \\\n\ + coord_out.x = coord_out.z + 8; \\\n\ + float4 paramData0 = (float4)(rMean * output_scale * r_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + copy_type data0, data1, data2, dst; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + float4 paramData1 = (float4)(gMean * output_scale * g_scale - output_zp,\\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + _viv_asm(COPY, data0, dst0, 16); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * output_scale * b_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + _viv_asm(COPY, data1, dst1, 16); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_0_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ +\n\ +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + write_type dst0, dst1, dst2, dst3; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; 
\\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.z = coord_out.z * 3; \\\n\ + coord_out.x = coord_out.z + 16; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + \\\n\ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_0_2x8); \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_1_2x8); \\\n\ + VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_2_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb888_planar_nhwc_1_vx*/ + +static const char pre_process_rgb888_planar_nhwc_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\ +\n\ +__kernel void pre_process_rgb888_planar_half_U8toU8_nhwc\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float r_scale,\n\ + int reverse,\n\ + float g_scale,\n\ + float b_scale\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_uchar16 src0, src1, src2;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int4 coord;\n\ + coord.xy = coord_in.xy >> 1;\n\ +\n\ + coord.x = coord.x * 3;\n\ + coord.z = coord.x + 16;\n\ +\n\ + vxc_uchar16 dst0, dst1;\n\ + src0.lo = src0.s02468ace;\n\ + src0.hi = src1.s02468ace;\n\ + 
src1.lo = src2.s02468ace;\n\ +\n\ + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_0_2x8);\n\ + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_1_2x8);\n\ + VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_2_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_rgb888_planar_nhwc_2_vx*/ + +static const char pre_process_rgb888_planar_sep_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int4 rgb_order;\n\ +\n\ +#define RESIZE_BILINEAR_4X1(input, scale, mean, output, _coord) \\\n\ + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * scale * output_scale - scale * mean * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, _coord, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __read_only 
image2d_array_t input2, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAddRShift); \\\n\ + \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ + int4 coord; \\\n\ + sx = sx + *xOffset; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ + int2 coord1 = (int2)(sx.w, coord.w); \\\n\ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 tt; \\\n\ + vxc_uchar4 val; \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\ + coord_out.yzw += rgb_order.xyz; \\\n\ + \\\n\ + vxc_uchar8 line1, line2; \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + conv_type dst0; \\\n\ + dst_type dst1; \\\n\ + copy_type dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + RESIZE_BILINEAR_4X1(input1, g_scale, gMean, output, coord_out.xz) \\\n\ + RESIZE_BILINEAR_4X1(input2, b_scale, bMean, output, coord_out.xw) \\\n\ +}\n\ +RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ +RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ +\n\ +#define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __read_only image2d_array_t input2, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ + \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ + int4 coord; \\\n\ + sx = sx + *xOffset; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ + int2 coord1 = (int2)(sx.w, coord.w); \\\n\ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\ + coord_out.yzw += rgb_order.xyz; \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + 
temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + int4 dst0; \\\n\ + write_type dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord_out.xz, \\\n\ + dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + 
VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ \\\n\ - VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, \\\n\ + dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\ -PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_0_vx*/ +RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16)\n\ +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb888_planar_sep_0_vx*/ -static const char pre_process_rgb888_planar_1_vx[] = "\n\ -#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ \n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ +_viv_uniform int4 rgb_order;\n\ \n\ -#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ +#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ ( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __read_only image2d_array_t input2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -30891,7 +35038,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ @@ -30900,16 +35051,14 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ vxc_uchar16 src0, src1, src2; \\\n\ dst_type dst0, dst1; \\\n\ \\\n\ - int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ - 
VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_in.z ++; \\\n\ - VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_in.z ++; \\\n\ - VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord.x = coord.z + 8; \\\n\ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\ - rMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ \\\n\ half4 paramData_f16; \\\n\ copy_type tmp_dst; \\\n\ @@ -30919,44 +35068,49 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.yw = coord_out.ww + rgb_order.xy; \\\n\ + VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ - gMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData1); \\\n\ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ - bMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData2); \\\n\ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ 
uniDataMeanStddevLo_2x8); \\\n\ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.w = coord.w + rgb_order.z; \\\n\ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ -PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ +RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ +RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ \n\ #define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ ( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __read_only image2d_array_t input2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -30964,7 +35118,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ @@ -30973,15 +35131,15 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ vxc_uchar16 src0, src1, src2; \\\n\ write_type dst; \\\n\ \\\n\ - int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ - VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_in.z ++; \\\n\ - VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_in.z ++; \\\n\ - VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\ - rMean * output_scale - output_zp, output_scale); \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.xyw += rgb_order.xyz; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ \\\n\ half4 paramData_f16; \\\n\ _viv_asm(CONV, paramData_f16, paramData0); \\\n\ @@ -30990,46 +35148,49 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ uniDataMeanStddevLo_2x8); \\\n\ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, 
VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ - gMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData1); \\\n\ \\\n\ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ - bMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData2); \\\n\ \\\n\ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ -"; /* end of pre_process_rgb888_planar_1_vx*/ +"; /* end of pre_process_rgb888_planar_sep_1_vx*/ -static const char pre_process_rgb888_planar_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ +_viv_uniform int4 rgb_order;\n\ \n\ -__kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ +__kernel void pre_process_rgb888_planar_sep_4over3_U8toU8\n\ (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output0,\n\ - __write_only image2d_array_t output1,\n\ - __write_only image2d_array_t output2,\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output,\n\ global int *xRatio,\n\ global int *yRatio,\n\ global int *xOffset,\n\ @@ -31037,24 +35198,24 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ float rMean,\n\ float gMean,\n\ float bMean,\n\ - float f32Var\n\ + float r_scale,\n\ + int reverse,\n\ + int height,\n\ + float g_scale,\n\ + float b_scale\n\ )\n\ {\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 
coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ int4 coord_out;\n\ \n\ vxc_uchar16 src0, src1, src2, src3;\n\ vxc_uchar16 dst0, dst1, dst2;\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ + VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ coord_out.xy = (coord_in.xy >> 2) * 3;\n\ coord_out.zw = coord_in.yy + (int2)(1, 2);\n\ \n\ @@ -31067,19 +35228,16 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ \n\ - VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord_r = coord_out;\n\ + coord_r.yzw += rgb_order.xxx;\n\ + VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ + VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ @@ -31090,18 +35248,16 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, 
VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ \n\ - VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord_g = coord_out;\n\ + coord_g.yzw += rgb_order.yyy;\n\ + VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ @@ -31112,17 +35268,19 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ \n\ - VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord_b = coord_out;\n\ + coord_b.yzw += rgb_order.zzz;\n\ + VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void pre_process_rgb888_planar_half_U8toU8\n\ +__kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\ (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output0,\n\ - __write_only image2d_array_t output1,\n\ - __write_only image2d_array_t output2,\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output,\n\ global int *xRatio,\n\ global int *yRatio,\n\ global int *xOffset,\n\ @@ -31130,31 +35288,32 @@ __kernel void pre_process_rgb888_planar_half_U8toU8\n\ float 
rMean,\n\ float gMean,\n\ float bMean,\n\ - float f32Var\n\ + float r_scale,\n\ + int reverse,\n\ + int height,\n\ + float g_scale,\n\ + float b_scale\n\ )\n\ {\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ \n\ vxc_uchar16 src0, src1, src2;\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ - VXC_ReadImage2DArray(src1, input, coord_in, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ - int2 coord = coord_in.xy >> 1;\n\ + coord_in.zw = coord_in.xy >> 1;\n\ \n\ - VXC_WriteImage(output0, coord, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord_rgb = coord_in.zwww;\n\ + coord_rgb.yzw += rgb_order.xyz;\n\ + VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_rgb.xz, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of pre_process_rgb888_planar_2_vx*/ +"; /* end of pre_process_rgb888_planar_sep_2_vx*/ -static const char pre_process_rgb888_planar_sep_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_nhwc_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniVecShift10;\n\ _viv_uniform VXC_512Bits uniAddRShift;\n\ @@ -31163,11 +35322,15 @@ _viv_uniform VXC_512Bits uniExtractBytes;\n\ \n\ _viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ \n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -#define RESIZE_BILINEAR_4X1(input, mean, output) \\\n\ +#define RESIZE_BILINEAR_4X1(input, scale, mean) \\\n\ VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -31197,23 +35360,16 @@ _viv_uniform float output_zp;\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ - tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \\\n\ - _viv_asm(CONV, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst1, 8); \\\n\ - VXC_WriteImage(output, coord_out, dst, \\\n\ - 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst);\n\ \n\ #define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __read_only image2d_array_t input2, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -31221,7 +35377,10 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ @@ -31274,7 +35433,9 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ int4 test02, temp2; \\\n\ int4 tt; \\\n\ vxc_uchar4 val; \\\n\ - int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\ + coord_out.x = coord_out.x * 3; \\\n\ + coord_out.z = coord_out.x + 8; \\\n\ \\\n\ vxc_uchar8 line1, line2; \\\n\ \\\n\ @@ -31297,31 +35458,38 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ conv_type dst0; \\\n\ - dst_type dst1; \\\n\ - copy_type dst; \\\n\ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + dst_type dst1, dst2; \\\n\ + copy_type data0, data1, dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ _viv_asm(CONV, dst0, tmp_dst); \\\n\ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst1, 8); \\\n\ - VXC_WriteImage(output0, coord_out, dst, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + RESIZE_BILINEAR_4X1(input1, g_scale, gMean) \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ \\\n\ - RESIZE_BILINEAR_4X1(input1, gMean, output1) \\\n\ - RESIZE_BILINEAR_4X1(input2, bMean, output2) \\\n\ + RESIZE_BILINEAR_4X1(input2, b_scale, bMean) \\\n\ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, data0, dst1, 16); \\\n\ + _viv_asm(COPY, data1, dst2, 16); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_0_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ \n\ #define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ +__kernel void 
pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __read_only image2d_array_t input2, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -31329,7 +35497,10 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ @@ -31378,6 +35549,7 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ int4 test01, temp1; \\\n\ int4 test02, temp2; \\\n\ int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + coord_out.x = coord_out.x * 3; \\\n\ \\\n\ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniVecShift10); \\\n\ @@ -31400,13 +35572,11 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ int4 dst0; \\\n\ - write_type dst; \\\n\ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + write_type dst1, dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - \\\n\ - VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -31438,12 +35608,10 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ - tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \\\n\ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - \\\n\ - VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -31475,33 +35643,39 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ - tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \\\n\ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, 
dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - \\\n\ - VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_0_2x8); \\\n\ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16)\n\ -RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_sep_0_vx*/ +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb888_planar_sep_nhwc_0_vx*/ -static const char pre_process_rgb888_planar_sep_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_nhwc_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ -_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ \n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\ \n\ #define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __read_only image2d_array_t input2, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -31509,7 +35683,10 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ @@ -31522,58 +35699,50 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - coord.x = coord.z + 8; \\\n\ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\ - rMean * output_scale - output_zp, output_scale); \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.z = coord_out.z * 3; \\\n\ + coord_out.x = coord_out.z + 8; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ \\\n\ half4 paramData_f16; \\\n\ - copy_type tmp_dst; \\\n\ + copy_type data0, data1, data2, dst; \\\n\ _viv_asm(CONV, paramData_f16, paramData0); \\\n\ - VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ - gMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData1); \\\n\ - VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data0, dst0, 16); \\\n\ \\\n\ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ - bMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData2); \\\n\ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data1, dst0, 16); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_0_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ \n\ #define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __read_only image2d_array_t input2, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ 
- __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -31581,153 +35750,75 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ coord.xy += (int2) (*xOffset, *yOffset); \\\n\ vxc_uchar16 src0, src1, src2; \\\n\ - write_type dst; \\\n\ + write_type dst0, dst1, dst2, dst3; \\\n\ \\\n\ VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\ - rMean * output_scale - output_zp, output_scale); \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.z = coord_out.z * 3; \\\n\ + coord_out.x = coord_out.z + 16; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ \\\n\ half4 paramData_f16; \\\n\ _viv_asm(CONV, paramData_f16, paramData0); \\\n\ \\\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ - gMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData1); \\\n\ \\\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ - bMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData2); \\\n\ \\\n\ - VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst, src2, 
paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_0_2x8); \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_1_2x8); \\\n\ + VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_2_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ -"; /* end of pre_process_rgb888_planar_sep_1_vx*/ +"; /* end of pre_process_rgb888_planar_sep_nhwc_1_vx*/ -static const char pre_process_rgb888_planar_sep_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_nhwc_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ -_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ -_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\ \n\ -__kernel void pre_process_rgb888_planar_sep_4over3_U8toU8\n\ +__kernel void pre_process_rgb888_planar_sep_half_U8toU8_nhwc\n\ (\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __read_only image2d_array_t input2,\n\ - __write_only image2d_array_t output0,\n\ - __write_only image2d_array_t output1,\n\ - __write_only image2d_array_t output2,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ - int4 coord_out;\n\ -\n\ - vxc_uchar16 src0, src1, src2, src3;\n\ - vxc_uchar16 dst0, dst1, dst2;\n\ -\n\ - VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_out.xy = (coord_in.xy >> 2) * 3;\n\ - coord_out.zw = coord_in.yy + (int2)(1, 2);\n\ -\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, 
VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ -\n\ - VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ -\n\ - VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, 
VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ -\n\ - VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\ - (\n\ - __read_only image2d_array_t input0,\n\ - __read_only image2d_array_t input1,\n\ - __read_only image2d_array_t input2,\n\ - __write_only image2d_array_t output0,\n\ - __write_only image2d_array_t output1,\n\ - __write_only image2d_array_t output2,\n\ + __write_only image2d_array_t output,\n\ global int *xRatio,\n\ global int *yRatio,\n\ global int *xOffset,\n\ @@ -31735,7 +35826,10 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\ float rMean,\n\ float gMean,\n\ float bMean,\n\ - float f32Var\n\ + float r_scale,\n\ + int reverse,\n\ + float g_scale,\n\ + float b_scale\n\ )\n\ {\n\ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ @@ -31746,13 +35840,28 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\ VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.zw = coord_in.xy >> 1;\n\ + int4 coord;\n\ + coord.xy = coord_in.xy >> 1;\n\ +\n\ + coord.x = coord.x * 3;\n\ + coord.z = coord.x + 16;\n\ +\n\ + vxc_uchar16 dst0, dst1;\n\ + src0.lo = src0.s02468ace;\n\ + src0.hi = src1.s02468ace;\n\ + src1.lo = src2.s02468ace;\n\ \n\ - VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_0_2x8);\n\ + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_1_2x8);\n\ + VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_2_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of pre_process_rgb888_planar_sep_2_vx*/ +"; /* end of pre_process_rgb888_planar_sep_nhwc_2_vx*/ static const char pre_process_rgb_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -31773,6 +35882,8 @@ _viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4;\n\ _viv_uniform VXC_512Bits uniExtractBtoF32_part2_4x4;\n\ _viv_uniform VXC_512Bits uniExtractBtoF32_part3_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float4 param_data;\n\ +_viv_uniform float4 rgb_scale;\n\ \n\ #define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \\\n\ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ @@ -31786,9 +35897,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0) * 3, 
get_global_id(1)); \\\n\ @@ -31802,10 +35915,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - f32Var *= outputScale; \\\n\ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ - bMean * f32Var - outputZP, f32Var); \\\n\ \\\n\ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \\\n\ float4 tmp0, tmp1; \\\n\ @@ -31813,8 +35922,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \\\n\ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \\\n\ _viv_asm(CONV_RTE, result0, tmp0); \\\n\ _viv_asm(CONV_RTE, result1, tmp1); \\\n\ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31824,8 +35933,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ coord_out.z = 1; \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.y; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \\\n\ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \\\n\ _viv_asm(CONV_RTE, result0, tmp0); \\\n\ _viv_asm(CONV_RTE, result1, tmp1); \\\n\ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31835,8 +35944,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ coord_out.z = b_order; \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \\\n\ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \\\n\ _viv_asm(CONV_RTE, result0, tmp0); \\\n\ _viv_asm(CONV_RTE, result1, tmp1); \\\n\ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31858,9 +35967,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \\\n\ @@ -31875,10 +35986,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ coord.x += 16; \\\n\ VXC_ReadImage(src2, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - f32Var *= outputScale; \\\n\ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ - bMean * f32Var - outputZP, f32Var); \\\n\ \\\n\ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \\\n\ float4 tmp0, tmp1; 
\\\n\ @@ -31886,15 +35993,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \\\n\ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part2_4x4); \\\n\ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part3_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \\\n\ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31903,15 +36010,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ coord_out.z = 1; \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.y; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \\\n\ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part2_4x4); \\\n\ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part3_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.y; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \\\n\ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31920,15 +36027,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ coord_out.z = b_order; \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \\\n\ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part2_4x4); \\\n\ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part3_4x4); \\\n\ - tmp0 = 
tmp0 * paramData.w - paramData.z; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \\\n\ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31989,9 +36096,11 @@ __kernel void pre_process_yuv420_copy_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \\\n\ @@ -32050,17 +36159,23 @@ __kernel void pre_process_yuv420_copy_##name \\\n\ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ \\\n\ - var *= output_scale; \\\n\ - float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \\\n\ - rMean * var - output_zp, var); \\\n\ + float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp,\\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ half4 paramData_f16; \\\n\ _viv_asm(CONV, paramData_f16, paramData); \\\n\ \\\n\ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \\\n\ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \\\n\ + \\\n\ + paramData.w = g_scale * output_scale; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ \\\n\ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \\\n\ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \\\n\ + \\\n\ + paramData.w = r_scale * output_scale; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ \\\n\ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \\\n\ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \\\n\ @@ -32090,9 +36205,11 @@ __kernel void pre_process_yuv420_copy_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \\\n\ @@ -32142,18 +36259,22 @@ __kernel void pre_process_yuv420_copy_##name \\\n\ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ \\\n\ - var *= output_scale; \\\n\ - float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \\\n\ - rMean * var - output_zp, var); \\\n\ + float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ half4 paramData_f16; \\\n\ _viv_asm(CONV, paramData_f16, paramData); \\\n\ \\\n\ VXC_DP2x8(dst0, B, paramData_f16, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \\\n\ VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \\\n\ \\\n\ + paramData.w = g_scale * output_scale; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \\\n\ VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \\\n\ \\\n\ + paramData.w = r_scale * output_scale; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \\\n\ VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \\\n\ \\\n\ @@ -32228,9 +36349,11 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 gidx = get_global_id(0); \\\n\ @@ -32379,7 +36502,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ float4 tmpDst; \\\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - bMean) * var; \\\n\ + tmpDst = (tmpDst - bMean) * b_scale; \\\n\ dstPos.z = bOrder; \\\n\ result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -32393,7 +36516,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - gMean) * var; \\\n\ + tmpDst = (tmpDst - gMean) * g_scale; \\\n\ dstPos.z = 1; \\\n\ result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -32407,7 +36530,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - rMean) * var; \\\n\ + tmpDst = (tmpDst - rMean) * r_scale; \\\n\ dstPos.z = rOrder; \\\n\ result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -32467,9 +36590,11 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 gidx = get_global_id(0); \\\n\ @@ -32620,7 +36745,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ float4 tmpDst; \\\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - bMean) * var; \\\n\ + tmpDst = (tmpDst - bMean) * b_scale; \\\n\ dstPos.z = bOrder; \\\n\ tmpDst = tmpDst * 
output_scale + output_zp; \\\n\ _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ @@ -32636,7 +36761,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - gMean) * var; \\\n\ + tmpDst = (tmpDst - gMean) * g_scale; \\\n\ dstPos.z = 1; \\\n\ tmpDst = tmpDst * output_scale + output_zp; \\\n\ _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ @@ -32652,7 +36777,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - rMean) * var; \\\n\ + tmpDst = (tmpDst - rMean) * r_scale; \\\n\ dstPos.z = rOrder; \\\n\ tmpDst = tmpDst * output_scale + output_zp; \\\n\ _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ @@ -32669,7 +36794,9 @@ static const char pre_process_yuv422_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n _viv_uniform int bOrder;\n\ _viv_uniform int rOrder;\n\ \n\ -_viv_uniform float outputScaleVar;\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ _viv_uniform float bMeanScaleVarZp;\n\ _viv_uniform float gMeanScaleVarZp;\n\ _viv_uniform float rMeanScaleVarZp;\n\ @@ -32693,10 +36820,12 @@ __kernel void pre_process_yuv422_copy_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ int trans, \\\n\ - int yuv422_type \\\n\ + int yuv422_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -32726,21 +36855,21 @@ __kernel void pre_process_yuv422_copy_##name \\\n\ dst_type dst0; \\\n\ save_type dst; \\\n\ int4 dstPos = (int4)(gidx, gidy, 0, 0); \\\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ dstPos.z = bOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ dstPos.z = 1; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ dstPos.z = rOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ @@ -32758,7 +36887,10 @@ static const char pre_process_yuv422_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\ _viv_uniform int bOrder;\n\ _viv_uniform int rOrder;\n\ \n\ -_viv_uniform float outputScaleVar;\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ +\n\ _viv_uniform float bMeanScaleVarZp;\n\ _viv_uniform 
float gMeanScaleVarZp;\n\ _viv_uniform float rMeanScaleVarZp;\n\ @@ -32788,10 +36920,12 @@ __kernel void pre_process_yuv422_scale_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ int trans, \\\n\ - int yuv422_type \\\n\ + int yuv422_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 gidx = get_global_id(0); \\\n\ @@ -32863,21 +36997,21 @@ __kernel void pre_process_yuv422_scale_##name \\\n\ dst_type dst0; \\\n\ save_type dst; \\\n\ int4 dstPos = (int4)(gidx.x, gidy, 0, 0); \\\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ dstPos.z = bOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ dstPos.z = 1; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ dstPos.z = rOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ @@ -32939,9 +37073,11 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\ float rMean,\n\ float gMean,\n\ float bMean,\n\ - float var,\n\ + float r_scale,\n\ int reverse_channel,\n\ - int trans\n\ + int trans,\n\ + float g_scale,\n\ + float b_scale\n\ )\n\ {\n\ int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));\n\ @@ -33000,17 +37136,22 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ \n\ - var *= outputScale;\n\ - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ - rMean * var - zp, var);\n\ + float4 paramData = (float4)(bMean * b_scale * outputScale - zp, gMean * g_scale * outputScale - zp,\\\n\ + rMean * r_scale * outputScale - zp, b_scale * outputScale);\n\ half4 paramData_f16;\n\ _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + paramData.w = g_scale * outputScale;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + paramData.w = r_scale * outputScale;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ VXC_DP2x8(dst2, R, 
paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ @@ -33035,9 +37176,11 @@ __kernel void pre_process_yuv444_copy_U8toF16(\n\ float rMean,\n\ float gMean,\n\ float bMean,\n\ - float var,\n\ + float r_scale,\n\ int reverse_channel,\n\ - int trans\n\ + int trans,\n\ + float g_scale,\n\ + float b_scale\n\ )\n\ {\n\ int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));\n\ @@ -33097,16 +37240,22 @@ __kernel void pre_process_yuv444_copy_U8toF16(\n\ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ \n\ - float4 paramData = (float4)(bMean * var, gMean * var,\\\n\ - rMean * var, var);\n\ + float4 paramData = (float4)(bMean * b_scale * outputScale, gMean * g_scale * outputScale,\\\n\ + rMean * r_scale * outputScale, b_scale * outputScale);\n\ half4 paramData_f16;\n\ _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + paramData.w = g_scale * outputScale;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + paramData.w = r_scale * outputScale;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ @@ -33171,7 +37320,8 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\ __read_only image2d_t y_img, __read_only image2d_t u_img, \\\n\ __read_only image2d_t v_img, __write_only image2d_array_t output, \\\n\ global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \\\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \\\n\ + float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans, \\\n\ + float g_scale, float b_scale) \\\n\ { \\\n\ int4 gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ @@ -33283,7 +37433,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\ float4 tmpDst; \\\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - bMean) * var; \\\n\ + tmpDst = (tmpDst - bMean) * b_scale; \\\n\ dstPos.z = bOrder; \\\n\ result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ @@ -33297,7 +37447,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - gMean) * var; \\\n\ + tmpDst = (tmpDst - gMean) * g_scale; \\\n\ dstPos.z = 1; \\\n\ result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ @@ -33311,7 +37461,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - rMean) * var; \\\n\ + tmpDst = (tmpDst - rMean) * r_scale; \\\n\ dstPos.z = rOrder; \\\n\ result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ @@ -33360,7 +37510,8 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\ __read_only image2d_t y_img, __read_only image2d_t u_img,\n\ __read_only image2d_t v_img, __write_only image2d_array_t output,\n\ global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ + float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans,\n\ + float g_scale, float b_scale)\n\ {\n\ int4 gidx = get_global_id(0);\n\ int gidy = get_global_id(1);\n\ @@ -33480,7 +37631,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\ float4 tmpDst;\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ + tmpDst = (tmpDst - bMean) * b_scale;\n\ dstPos.z = bOrder;\n\ _viv_asm(CONV, hDst, tmpDst);\n\ VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ @@ -33495,7 +37646,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\ temp2 = fx * tmpData0 + tmpData1;\n\ result = fy * temp2 + (temp1 << 10);\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ + tmpDst = (tmpDst - gMean) * g_scale;\n\ dstPos.z = 1;\n\ _viv_asm(CONV, hDst, tmpDst);\n\ VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ @@ -33510,7 +37661,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\ temp2 = fx * tmpData0 + tmpData1;\n\ result = fy * temp2 + (temp1 << 10);\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ + tmpDst = (tmpDst - rMean) * r_scale;\n\ dstPos.z = rOrder;\n\ _viv_asm(CONV, hDst, tmpDst);\n\ VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ @@ -37154,7 +41305,6 @@ static const char resize_1d_bilinear_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ _viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ _viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ _viv_uniform float scale_x;\n\ _viv_uniform int out_height;\n\ @@ -37215,8 +41365,10 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN\n\ \n\ _viv_asm(COPY, src_half, src, 16);\n\ \n\ - VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);\n\ - VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);\n\ + VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertFp2FP32_left_4x4);\n\ + VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0),\n\ + uniConvertFp2FP32_right_4x4);\n\ right4 -= left4;\n\ float4 dst4 = right4 * x_lerp + left4;\n\ \n\ @@ -37281,8 +41433,10 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN\n\ \n\ _viv_asm(COPY, src_half, src, 16);\n\ \n\ - VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);\n\ - VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);\n\ + VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertFp2FP32_left_4x4);\n\ + VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertFp2FP32_right_4x4);\n\ right4 -= left4;\n\ float4 dst4 = right4 * x_lerp + left4;\n\ \n\ @@ -41782,6 +45936,580 @@ __kernel void scatter_nd_update_F16F16toU8_big(\n\ }\n\ "; /* end of scatter_nd_update_big_vx*/ +static const char scatter_nd_update_fp_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int update_width;\n\ +_viv_uniform int output_width;\n\ +_viv_uniform int ref_stride;\n\ +_viv_uniform int output_stride;\n\ +\n\ +_viv_uniform int4 coord_stride;\n\ +_viv_uniform int4 coord_stride1;\n\ +_viv_uniform float inout_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFp16ToFp32_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +inline void AtomicAdd_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = prevVal.floatVal + operand;\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_update_F16(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + image2d_t temp_buf_float,\n\ + image2d_t link_buffer0,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(update, 2);\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global short* update_ptr = (__global short*)img2.ptr;\n\ + __global float* output_ptr = (__global float*)img3.ptr;\n\ + half src;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + short tmpData = update_ptr[gidy * update_width + gidx];\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + int loc = idx * output_width + gidx;\n\ + _viv_asm(COPY, src, tmpData, 4);\n\ + float data;\n\ + _viv_asm(CONV, data, src);\n\ + AtomicAdd_float(output_ptr + loc, data);\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_update_F16_4X(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + image2d_t temp_buf_float,\n\ + image2d_t link_buffer0,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(update, 2);\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr;\n\ + __global float* output_ptr = (__global float*)img3.ptr;\n\ + vxc_half4 src;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + vxc_short4 tmpData = update_ptr[gidy * update_width + gidx];\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3);\n\ +\n\ + _viv_asm(COPY, src, tmpData, 8);\n\ + float4 data;\n\ + VXC_DP4x4(data, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertFp16ToFp32_4x4);\n\ + AtomicAdd_float(output_ptr + loc.x, data.x);\n\ + AtomicAdd_float(output_ptr + loc.y, data.y);\n\ + AtomicAdd_float(output_ptr + loc.z, data.z);\n\ + AtomicAdd_float(output_ptr + loc.w, data.w);\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_update_BF16(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + image2d_t temp_buf_float,\n\ + image2d_t link_buffer0,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(update, 2);\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global short* update_ptr = (__global short*)img2.ptr;\n\ + __global float* output_ptr = (__global float*)img3.ptr;\n\ + float data;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + short tmpData = update_ptr[gidy * update_width + gidx];\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 src0, src1;\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + int loc = idx * output_width + gidx;\n\ + _viv_asm(COPY, src0, tmpData, 4);\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data, src1, 4);\n\ + AtomicAdd_float(output_ptr + loc, data);\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_update_BF16_4X(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + image2d_t temp_buf_float,\n\ + image2d_t link_buffer0,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(update, 2);\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr;\n\ + __global float* output_ptr = (__global float*)img3.ptr;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + vxc_short4 tmpData = update_ptr[gidy * update_width + gidx];\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 src0, src1;\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3);\n\ +\n\ + _viv_asm(COPY, src0, tmpData, 8);\n\ + float4 data;\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data, src1, 16);\n\ + AtomicAdd_float(output_ptr + loc.x, data.x);\n\ + AtomicAdd_float(output_ptr + loc.y, data.y);\n\ + AtomicAdd_float(output_ptr + loc.z, data.z);\n\ + AtomicAdd_float(output_ptr + loc.w, data.w);\n\ +}\n\ +\n\ +#define SCATTER_ND_UPDATE_REF_FP16(type0, type1, ptr_type) \\\n\ +__kernel void scatter_nd_update_ref_##type0##to##type1( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + __read_only image2d_t temp_buf_int, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t link_buffer0, \\\n\ + image2d_t link_buffer1, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + Image img3 = create_image_from_image2d(temp_ref, 2); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global short* ref_ptr = (__global short*)img3.ptr; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_stride + gidx; \\\n\ + float4 tmpData; \\\n\ + tmpData.x = convert_float(acc_ptr[loc]) * inout_scale + output_zp; \\\n\ + half4 data; \\\n\ + short tmpDst; \\\n\ + _viv_asm(CONV, data, tmpData); \\\n\ + _viv_asm(COPY, tmpDst, data, 4); \\\n\ + ref_ptr[loc] = tmpDst; \\\n\ +}\n\ +SCATTER_ND_UPDATE_REF_FP16(I32, F16, int)\n\ +SCATTER_ND_UPDATE_REF_FP16(F32, F16, float)\n\ +\n\ +#define SCATTER_ND_UPDATE_REF_FP16_4X(type0, type1, ptr_type) \\\n\ +__kernel void scatter_nd_update_ref_##type0##to##type1##_4X( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + __read_only image2d_t temp_buf_int, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t link_buffer0, \\\n\ + image2d_t link_buffer1, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + Image img3 = create_image_from_image2d(temp_ref, 2); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \\\n\ + int loc = idx * output_stride + gidx; \\\n\ + float4 tmpVal = tmpData * inout_scale + output_zp; \\\n\ + half4 data; \\\n\ + vxc_short8 tmpDst; \\\n\ + _viv_asm(CONV, data, tmpVal); \\\n\ + _viv_asm(COPY, tmpDst, data, 16); \\\n\ + ref_ptr[loc] = tmpDst.s0246; \\\n\ +}\n\ +SCATTER_ND_UPDATE_REF_FP16_4X(I32, F16, int)\n\ +SCATTER_ND_UPDATE_REF_FP16_4X(F32, F16, float)\n\ +\n\ +__kernel void scatter_nd_update_ref_F32toBF16(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + __read_only image2d_t temp_buf_int,\n\ + image2d_t temp_ref,\n\ + image2d_t link_buffer0,\n\ + image2d_t link_buffer1,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4);\n\ + Image img3 = create_image_from_image2d(temp_ref, 2);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global float* acc_ptr = (__global float*)img2.ptr;\n\ + __global short* ref_ptr = (__global short*)img3.ptr;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + int loc = idx * output_stride + gidx;\n\ + float tmpData;\n\ + tmpData = acc_ptr[loc];\n\ + vxc_ushort8 src0, src2;\n\ + _viv_asm(COPY, src0, tmpData, 4);\n\ + VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + ref_ptr[loc] = src2.x;\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_ref_F32toBF16_4X(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + __read_only image2d_t temp_buf_int,\n\ + image2d_t temp_ref,\n\ + image2d_t link_buffer0,\n\ + image2d_t link_buffer1,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4);\n\ + Image img3 = create_image_from_image2d(temp_ref, 2);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global float* acc_ptr = (__global float*)img2.ptr;\n\ + __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + float4 tmpData = vload4(gidx, acc_ptr + idx * ref_stride);\n\ + int loc = idx * output_stride + gidx;\n\ + vxc_short8 src0, src2;\n\ + _viv_asm(COPY, src0, tmpData, 16);\n\ + VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + ref_ptr[loc] = src2.s0123;\n\ +}\n\ +"; /* end of scatter_nd_update_fp_vx*/ + +static const char scatter_nd_update_qint_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +_viv_uniform int update_width;\n\ +_viv_uniform int output_width;\n\ +_viv_uniform int ref_stride;\n\ +_viv_uniform int output_stride;\n\ +_viv_uniform int2 multAndoutZP0;\n\ +\n\ +_viv_uniform int4 coord_stride;\n\ +_viv_uniform int4 coord_stride1;\n\ +\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float inout_scale;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +#define SCATTER_RESET(name0, name1, ptr0, ptr1, type0, type1, len0, len1, size0, size1, ptr2, ptr3, len3) \\\n\ +__kernel void scatter_nd_update_reset_##name0##to##name1( \\\n\ + __read_only image2d_t input_ref, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t temp_buf_int, \\\n\ + int length, int res) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(input_ref, size0); \\\n\ + Image img2 = create_image_from_image2d(temp_ref, size1); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \\\n\ + __global ptr1* output_ptr = (__global ptr1*)img2.ptr; \\\n\ + __global int* tmp_update_ptr = (__global int*)img3.ptr; \\\n\ + ptr0 tmpData = input_ptr[gidx]; \\\n\ + int4 zeros = (int4)(0); \\\n\ + int loc2 = gidx * 8; \\\n\ + type0 src; \\\n\ + type1 tmpDst; \\\n\ + ptr1 dst; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, 
multAndoutZP0, 16); \\\n\ + _viv_asm(COPY, src, tmpData, len0); \\\n\ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst, tmpDst, len1); \\\n\ + output_ptr[gidx] = dst; \\\n\ + vstore4(zeros, 0, tmp_update_ptr + loc2); \\\n\ + vstore4(zeros, 1, tmp_update_ptr + loc2); \\\n\ + if(gidx < res) \\\n\ + { \\\n\ + __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \\\n\ + __global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \\\n\ + ptr2 tmpData1 = input_ptr1[length + gidx]; \\\n\ + ptr3 dst1; \\\n\ + dst1 ^= dst1; \\\n\ + tmp_update_ptr[length + gidx] = 0; \\\n\ + _viv_asm(COPY, src, tmpData1, 4); \\\n\ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst1, tmpDst, len3); \\\n\ + output_ptr1[length + gidx] = dst1; \\\n\ + } \\\n\ +}\n\ +SCATTER_RESET(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, 8, 8, 1, 1, uchar, uchar, 1)\n\ +SCATTER_RESET(I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, 8, 8, 1, 1, char, char, 1)\n\ +SCATTER_RESET(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, 16, 16, 2, 2, short, short, 2)\n\ +SCATTER_RESET(F16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_half8, 16, 16, 2, 2, short, short, 2)\n\ +SCATTER_RESET(U8, F16, vxc_uchar8, vxc_short8, vxc_uchar8, vxc_half8, 8, 16, 1, 2, uchar, short, 2)\n\ +SCATTER_RESET(I8, F16, vxc_char8, vxc_short8, vxc_char8, vxc_half8, 8, 16, 1, 2, char, short, 2)\n\ +SCATTER_RESET(I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, 16, 8, 2, 1, short, short, 2)\n\ +SCATTER_RESET(F16, U8, vxc_short8, vxc_uchar8, vxc_half8, vxc_uchar8, 16, 8, 2, 1, short, uchar, 1)\n\ +\n\ +__kernel void scatter_nd_update_reset_BF16toBF16(\n\ + __read_only image2d_t input_ref,\n\ + image2d_t temp_ref,\n\ + image2d_t temp_buf_int)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + Image img1 = create_image_from_image2d(input_ref, 2);\n\ + Image img2 = create_image_from_image2d(temp_ref, 2);\n\ + Image img3 = create_image_from_image2d(temp_buf_int, 4);\n\ + __global vxc_short8* input_ptr = (__global vxc_short8*)img1.ptr;\n\ + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr;\n\ + __global float* tmp_update_ptr = (__global float*)img3.ptr;\n\ + vxc_short8 src = input_ptr[gidx];\n\ + float4 zeros = (float4)(0, 0, 0, 0);\n\ + int loc2 = gidx * 8;\n\ + output_ptr[gidx] = src;\n\ + vstore4(zeros, 0, tmp_update_ptr + loc2);\n\ + vstore4(zeros, 1, tmp_update_ptr + loc2);\n\ +}\n\ +\n\ +#define SCATTER_ND_UPDATE_QINT(src0_type, data_type, ptr_type, element_size) \\\n\ +__kernel void scatter_nd_update_update_##src0_type( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_int, \\\n\ + image2d_t link_buffer0, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, element_size); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global int* output_ptr = (__global int*)img3.ptr; \\\n\ + data_type src; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + vxc_int4 data; \\\n\ + short zp = input_zp; \\\n\ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + atomic_add(output_ptr + loc, data.x); \\\n\ +}\n\ +SCATTER_ND_UPDATE_QINT(U8, vxc_uchar8, uchar, 1)\n\ +SCATTER_ND_UPDATE_QINT(I8, vxc_char8, char, 1)\n\ +SCATTER_ND_UPDATE_QINT(I16, vxc_short8, short, 2)\n\ +\n\ +#define SCATTER_ND_UPDATE_QINT_4X(src0_type, data_type, ptr_type, element_size) \\\n\ +__kernel void scatter_nd_update_update_##src0_type##_4X( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_int, \\\n\ + image2d_t link_buffer0, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, element_size); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global int* output_ptr = (__global int*)img3.ptr; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + ptr_type src = update_ptr[gidy * update_width + gidx]; \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3); \\\n\ + vxc_int4 data; \\\n\ + short zp = input_zp; \\\n\ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + atomic_add(output_ptr + loc.x, data.x); \\\n\ + atomic_add(output_ptr + loc.y, data.y); \\\n\ + atomic_add(output_ptr + loc.z, data.z); \\\n\ + atomic_add(output_ptr + loc.w, data.w); \\\n\ +}\n\ +SCATTER_ND_UPDATE_QINT_4X(U8, vxc_uchar8, vxc_uchar4, 1)\n\ +SCATTER_ND_UPDATE_QINT_4X(I8, vxc_char8, vxc_char4, 1)\n\ +SCATTER_ND_UPDATE_QINT_4X(I16, vxc_short8, vxc_short4, 2)\n\ +\n\ +#define SCATTER_ND_UPDATE_REF(src0_type, dst_type, data_type, ptr_type, element_size) \\\n\ +__kernel void scatter_nd_update_ref_##src0_type##to##dst_type( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + __read_only image2d_t temp_buf_int, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t link_buffer0, \\\n\ + image2d_t link_buffer1, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + Image img3 = create_image_from_image2d(temp_ref, element_size); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global int* acc_ptr = (__global int*)img2.ptr; \\\n\ + __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \\\n\ + data_type 
dst; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_stride + gidx; \\\n\ + int tmpData = acc_ptr[loc]; \\\n\ + int4 data; \\\n\ + data.x = convert_int_rte(tmpData * inout_scale + output_zp); \\\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + ref_ptr[loc] = dst.x; \\\n\ +}\n\ +SCATTER_ND_UPDATE_REF(I32, U8, vxc_uchar8, uchar, 1)\n\ +SCATTER_ND_UPDATE_REF(I32, I8, vxc_char8, char, 1)\n\ +SCATTER_ND_UPDATE_REF(I32, I16, vxc_short8, short, 2)\n\ +\n\ +#define SCATTER_ND_UPDATE_REF_4X(src0_type, dst_type, data_type, ptr_type, element_size) \\\n\ +__kernel void scatter_nd_update_ref_##src0_type##to##dst_type##_4X( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + __read_only image2d_t temp_buf_int, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t link_buffer0, \\\n\ + image2d_t link_buffer1, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + Image img3 = create_image_from_image2d(temp_ref, element_size); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global int* acc_ptr = (__global int*)img2.ptr; \\\n\ + __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \\\n\ + data_type dst; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \\\n\ + int loc = idx * output_stride + gidx; \\\n\ + int4 data = convert_int4_rte(tmpData * inout_scale + output_zp); \\\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + ref_ptr[loc] = dst.xyzw; \\\n\ +}\n\ +SCATTER_ND_UPDATE_REF_4X(I32, U8, vxc_uchar8, vxc_uchar4, 1)\n\ +SCATTER_ND_UPDATE_REF_4X(I32, I8, vxc_char8, vxc_char4, 1)\n\ +SCATTER_ND_UPDATE_REF_4X(I32, I16, vxc_short8, vxc_short4, 2)\n\ +\n\ +#define SCATTER_ND_UPDATE_COPY(src0_type, ptr_type, element_size, ptr_type1) \\\n\ +__kernel void scatter_nd_update_copy_##src0_type( \\\n\ + __read_only image2d_t temp_ref, \\\n\ + __read_only image2d_t link_buffer1, \\\n\ + image2d_t output, \\\n\ + int length, int res) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(temp_ref, element_size); \\\n\ + Image img2 = create_image_from_image2d(output, element_size); \\\n\ + __global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \\\n\ + __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \\\n\ + output_ptr[gidx] = input_ptr[gidx]; \\\n\ + if(gidx < res) \\\n\ + { \\\n\ + __global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \\\n\ + __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \\\n\ + output_ptr1[length + gidx] = input_ptr1[length + gidx]; \\\n\ + } \\\n\ +}\n\ +SCATTER_ND_UPDATE_COPY(U8, vxc_uchar8, 1, uchar)\n\ +SCATTER_ND_UPDATE_COPY(I8, vxc_char8, 1, char)\n\ +SCATTER_ND_UPDATE_COPY(I16, vxc_short8, 2, short)\n\ +SCATTER_ND_UPDATE_COPY(F16, vxc_short8, 2, short)\n\ +SCATTER_ND_UPDATE_COPY(BF16, vxc_short8, 2, short)\n\ +"; /* end of scatter_nd_update_qint_vx*/ + static const char scatter_nd_update_special_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ @@ -42120,7 +46848,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name##_2D( \\\n\ short zp = inputZP; \\\n\ VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvert1stUint8SubZpToFp32_4x4); \\\n\ - int index = convert_int_rte(tmpData.s0 * input_scale); \\\n\ + int index = convert_int_rtz(tmpData.s0 * input_scale); \\\n\ int4 data; \\\n\ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \\\n\ write_type dst; \\\n\ @@ -42146,7 +46874,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name( \\\n\ short zp = inputZP; \\\n\ VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvert1stUint8SubZpToFp32_4x4); \\\n\ - int index = convert_int_rte(tmpData.s0 * input_scale); \\\n\ + int index = convert_int_rtz(tmpData.s0 * input_scale); \\\n\ int4 data; \\\n\ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \\\n\ write_type dst; \\\n\ @@ -42172,7 +46900,7 @@ __kernel void sequence_mask_F16toF16_2D(\n\ float4 tmpData;\n\ VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ UniFP16toFP32Lo4_dp4x4);\n\ - int index = convert_int_rte(tmpData.x);\n\ + int index = convert_int_rtz(tmpData.x);\n\ float4 data;\n\ data = outIdx < index? 
outputVal1 : convert_float(output_ZP);\n\ vxc_short8 dst;\n\ @@ -42195,7 +46923,7 @@ __kernel void sequence_mask_F16toF16(\n\ float4 tmpData;\n\ VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ UniFP16toFP32Lo4_dp4x4);\n\ - int index = convert_int_rte(tmpData.x);\n\ + int index = convert_int_rtz(tmpData.x);\n\ float4 data;\n\ data = outIdx < index? outputVal1 : convert_float(output_ZP);\n\ vxc_short8 dst;\n\ @@ -42218,7 +46946,7 @@ __kernel void sequence_mask_F16toU8_2D(\n\ float4 tmpData;\n\ VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ UniFP16toFP32Lo4_dp4x4);\n\ - int index = convert_int_rte(tmpData.x);\n\ + int index = convert_int_rtz(tmpData.x);\n\ int4 data;\n\ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP;\n\ vxc_uchar16 dst;\n\ @@ -42239,7 +46967,7 @@ __kernel void sequence_mask_F16toU8(\n\ float4 tmpData;\n\ VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ UniFP16toFP32Lo4_dp4x4);\n\ - int index = convert_int_rte(tmpData.x);\n\ + int index = convert_int_rtz(tmpData.x);\n\ int4 data;\n\ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP;\n\ vxc_uchar16 dst;\n\ @@ -43211,6 +47939,167 @@ TILE_2D_MIX(U8, F16, 7, 6, vxc_uchar8, vxc_short8)\n\ TILE_2D_MIX(U8, F16, 0, 7, vxc_uchar8, vxc_short8)\n\ "; /* end of tile_mix_vx*/ +static const char tiny_yolov4_postprocess_box_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +\n\ +float4 sigmoid4(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +\n\ +float4 exp4(float4 x)\n\ +{\n\ + x *= logE;\n\ + return exp2(x);\n\ +}\n\ +\n\ +#define CONST0 (1.0499999523162842f)\n\ +#define CONST1 (0.0250000003725290f)\n\ +\n\ +_viv_uniform VXC_512Bits uniDatatoFloat32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDatatoFloat32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDataTranspose_0_2x8;\n\ +_viv_uniform VXC_512Bits uniDataTranspose_1_2x8;\n\ +_viv_uniform float input0_scale;\n\ +_viv_uniform float input0_tail;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float input1_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float CONST2;\n\ +__kernel void tiny_yolov4_postprocess_box_U8_U8toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float bias_0,\n\ + float bias_1\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0));\n\ +\n\ + vxc_uchar16 src0, src1, src2, src3;\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(src2, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input1, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.zw += (int2)(2, 3);\n\ +\n\ + float4 data0, data1, data2, data3, data;\n\ + VXC_DP4x4(data0, src0, src0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\ + data0 = data0 * input0_scale + input0_tail;\n\ + data0 = sigmoid4(data0);\n\ + data0 = data0 * CONST0 - CONST1;\n\ +\n\ + VXC_DP4x4(data, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\ + data = data * input1_scale + input1_tail;\n\ + data0 = data0 * CONST2 + data * CONST2;\n\ +\n\ + VXC_DP4x4(data1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4);\n\ + data1 = data1 * input0_scale + input0_tail;\n\ + data1 = sigmoid4(data1);\n\ + data1 = data1 * CONST0 - CONST1;\n\ +\n\ + VXC_DP4x4(data, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\ + data = data * input1_scale + input1_tail;\n\ + data1 = data1 * CONST2 + data * CONST2;\n\ +\n\ + VXC_DP4x4(data2, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\ + data2 = data2 * input0_scale + input0_tail;\n\ + data2 = exp4(data2) * bias_0;\n\ +\n\ + VXC_DP4x4(data3, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4);\n\ + data3 = data3 * input0_scale + input0_tail;\n\ + data3 = exp4(data3) * bias_1;\n\ +\n\ + data0 = data0 * output_scale + output_zp;\n\ + data1 = data1 * output_scale + output_zp;\n\ +\n\ + int4 dst0 = convert_int4_rte(data0);\n\ + int4 dst1 = convert_int4_rte(data1);\n\ + VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + data2 = data2 * output_scale + output_zp;\n\ + data3 = data3 * output_scale + output_zp;\n\ + dst0 = convert_int4_rte(data2);\n\ + dst1 = convert_int4_rte(data3);\n\ + VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ +\n\ + VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniDataTranspose_0_2x8);\n\ + VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniDataTranspose_1_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of tiny_yolov4_postprocess_box_vx*/ + +static const char tiny_yolov4_postprocess_confidence_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8TimesU8_0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8PlusU8_trans_0_2x8;\n\ +_viv_uniform VXC_512Bits uniU8PlusU8_trans_1_2x8;\n\ +_viv_uniform VXC_512Bits uniU16TimesMultiplier_PostShift_2x8;\n\ +_viv_uniform int output_zp;\n\ +\n\ +__kernel void tiny_yolov4_postprocess_conf_U8toU8\n\ +(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, get_global_id(0));\n\ +\n\ + vxc_uchar16 src0, src1, src2, src3, src4;\n\ +\n\ + VXC_ReadImage(src0, input, coord.wz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 data0, data1;\n\ +\n\ + VXC_ReadImage(src1, input, coord.wy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input, coord.wy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input, coord.wy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src4, input, coord.wy, VXC_5BITOFFSET_XY(0, 4), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.zw = coord.xx + (int2)(2, 3);\n\ +\n\ + VXC_DP4x4(data0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\ + VXC_DP4x4(data0, src0, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\ + VXC_DP4x4(data1, src0, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\ + VXC_DP4x4(data1, src0, src4, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\ +\n\ + VXC_DP2x8(src1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uniU16TimesMultiplier_PostShift_2x8);\n\ + VXC_DP2x8(src1, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\ + uniU16TimesMultiplier_PostShift_2x8);\n\ +\n\ + uchar zp;\n\ + _viv_asm(COPY, zp, output_zp, 2);\n\ +\n\ + VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uniU8PlusU8_trans_0_2x8);\n\ + VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\ + uniU8PlusU8_trans_1_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of tiny_yolov4_postprocess_confidence_vx*/ + static const char upsample_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniF16MulMultipiler_PostShft_2x8;\n\ @@ -49204,6 +54093,8 @@ static const char gather_cl[] = "__kernel void gather_U8toU8(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ +\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ uint4 data = read_imageui(input0, coord_in.zw);\n\ @@ -49229,6 +54120,8 @@ __kernel void gather_F16toF16(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ +\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ float4 data = read_imagef(input0, coord_in.zw);\n\ @@ -49254,6 +54147,8 @@ __kernel void gather_I32toI32(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ +\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ int4 data = read_imagei(input0, coord_in.zw);\n\ @@ -49279,6 +54174,8 @@ __kernel void gather_F32toF32(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ +\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ float4 data = read_imagef(input0, coord_in.zw);\n\ @@ -49305,6 +54202,7 @@ static const char gather_array_cl[] = "__kernel void gather_array_U8toU8(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? 
indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 1);\n\ @@ -49333,6 +54231,7 @@ __kernel void gather_array_F16toF16(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 2);\n\ @@ -49361,6 +54260,7 @@ __kernel void gather_array_I32toI32(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 4);\n\ @@ -49389,6 +54289,7 @@ __kernel void gather_array_F32toF32(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 4);\n\ @@ -49423,6 +54324,7 @@ static const char gather_batch_cl[] = "__kernel void gather_batch_U8toU8(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ uint4 data = read_imageui(input0, coord_in);\n\ @@ -49454,6 +54356,7 @@ __kernel void gather_batch_F16toF16(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ float4 data = read_imagef(input0, coord_in);\n\ @@ -49485,6 +54388,7 @@ __kernel void gather_batch_I32toI32(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ int4 data = read_imagei(input0, coord_in);\n\ @@ -49516,6 +54420,7 @@ __kernel void gather_batch_F32toF32(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? 
indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ float4 data = read_imagef(input0, coord_in);\n\ @@ -49526,7 +54431,15 @@ __kernel void gather_batch_F32toF32(\n\ }\n\ "; /* end of gather_batch_cl*/ -static const char gather_elements_cl[] = "\n\ +static const char gather_elements_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +_viv_uniform uint width0;\n\ +_viv_uniform uint height0;\n\ +_viv_uniform uint width1;\n\ +_viv_uniform uint height1;\n\ +_viv_uniform uint width_out;\n\ +_viv_uniform uint height_out;\n\ +\n\ #define GATHER_ELEMENTS_AXIS0_2D(name, data_type, read_func, write_func, conv_func) \\\n\ __kernel void gather_elements_axis0_##name##_I32to##name##_2D \\\n\ ( \\\n\ @@ -49661,6 +54574,162 @@ __kernel void gather_elements_axis2_##name##_I32to##name \\\n\ GATHER_ELEMENTS_AXIS2(F32, float4, read_imagef, write_imagef, convert_float4)\n\ GATHER_ELEMENTS_AXIS2(I32, int4, read_imagei, write_imagei, convert_int4_rte)\n\ GATHER_ELEMENTS_AXIS2(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + int axis_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F32, float, float*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I32, int, int*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + int axis_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = 
(data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F32, float, float*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I32, int, int*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + int axis_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F32, float, float*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I32, int, int*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1)\n\ +\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + int axis_size \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + Image index_img = create_image_from_image2d(input1, 4); \\\n\ + int* index_ptr = (int*)index_img.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1]; \\\n\ + \\\n\ + Image input_img = create_image_from_image2d(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\ + data_type data = input_ptr[index + coord.y * width0]; \\\n\ + \\\n\ + Image output_img = create_image_from_image2d(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F32, float, float*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I32, 
int, int*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + int axis_size \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + Image index_img = create_image_from_image2d(input1, 4); \\\n\ + int* index_ptr = (int*)index_img.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1]; \\\n\ + \\\n\ + Image input_img = create_image_from_image2d(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\ + data_type data = input_ptr[coord.x + index * width0]; \\\n\ + \\\n\ + Image output_img = create_image_from_image2d(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F32, float, float*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I32, int, int*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1)\n\ "; /* end of gather_elements_cl*/ static const char gather_nd_cl[] = "__kernel void gather_nd_U8toU8_1D(\n\ @@ -49919,127 +54988,136 @@ __kernel void gather_nd_F32toF32_3D(\n\ static const char gather_nd_batch_cl[] = "__kernel void gather_nd_batch_U8toU8_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - int4 indice = read_imagei(input1, coord.wy);\n\ - coord.z = indice.x * block_size + gidx;\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ - uint4 data = read_imageui(input0, coord.zy);\n\ - write_imageui(output, coord.xy, data);\n\ + uint4 data = read_imageui(input0, coord0);\n\ + write_imageui(output, coord, data);\n\ }\n\ \n\ __kernel void gather_nd_batch_F16toF16_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - int4 indice = read_imagei(input1, coord.wy);\n\ - coord.z = indice.x * block_size + gidx;\n\ + int4 coord = (int4)(gidx, 
gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ - float4 data = read_imagef(input0, coord.zy);\n\ - write_imagef(output, coord.xy, data);\n\ + float4 data = read_imagef(input0, coord0);\n\ + write_imagef(output, coord, data);\n\ }\n\ \n\ __kernel void gather_nd_batch_I8toI8_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - int4 indice = read_imagei(input1, coord.wy);\n\ - coord.z = indice.x * block_size + gidx;\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ - int4 data = read_imagei(input0, coord.zy);\n\ - write_imagei(output, coord.xy, data);\n\ + int4 data = read_imagei(input0, coord0);\n\ + write_imagei(output, coord, data);\n\ }\n\ \n\ //2D\n\ __kernel void gather_nd_batch_U8toU8_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(0, gidy, gidx, 1);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ - int4 indice1 = read_imagei(input1, coord.wy);\n\ + int4 coord = (int4)(1, gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int4 indice1 = read_imagei(input1, coord.xyzw);\n\ indice.x = indice.x * block_size + gidx;\n\ indice.y = indice1.x;\n\ - indice.zw = coord.yx;\n\ + indice.zw = coord.zw;\n\ \n\ uint4 data = read_imageui(input0, indice);\n\ - write_imageui(output, coord.zy, data);\n\ + coord.x = gidx;\n\ + write_imageui(output, coord, data);\n\ }\n\ \n\ __kernel void gather_nd_batch_F16toF16_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(0, gidy, gidx, 1);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ - int4 indice1 = read_imagei(input1, coord.wy);\n\ + int4 coord = (int4)(1, gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int4 indice1 = read_imagei(input1, coord.xyzw);\n\ indice.x = indice.x * block_size + gidx;\n\ indice.y = indice1.x;\n\ - indice.zw = coord.yx;\n\ + indice.zw = coord.zw;\n\ \n\ float4 data = read_imagef(input0, indice);\n\ - write_imagef(output, coord.zy, data);\n\ + coord.x = gidx;\n\ + write_imagef(output, coord, data);\n\ }\n\ \n\ __kernel void gather_nd_batch_I8toI8_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t 
output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(0, gidy, gidx, 1);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ - int4 indice1 = read_imagei(input1, coord.wy);\n\ + int4 coord = (int4)(1, gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int4 indice1 = read_imagei(input1, coord.xyzw);\n\ indice.x = indice.x * block_size + gidx;\n\ indice.y = indice1.x;\n\ indice.y = indice1.x;\n\ - indice.zw = coord.yx;\n\ + indice.zw = coord.zw;\n\ \n\ int4 data = read_imagei(input0, indice);\n\ - write_imagei(output, coord.zy, data);\n\ + coord.x = gidx;\n\ + write_imagei(output, coord, data);\n\ }\n\ "; /* end of gather_nd_batch_cl*/ @@ -57045,6 +62123,103 @@ GEMM_TRANSB_3D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\ \n\ "; /* end of matrixmul_cl*/ +static const char matrixmul_cross_cl[] = "__kernel void gemm_F32F32toF32_merge(\n\ + __read_only image2d_array_t inputA,\n\ + __read_only image2d_array_t inputB,\n\ + __write_only image2d_array_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out,\n\ + int outer)\n\ +{\n\ + for(int i = 0; i < outer; i++)\n\ + {\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0);\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord_a.x < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord_a);\n\ + tempB0 = read_imagef(inputB, coord_b);\n\ + coord_a.x++;\n\ + coord_b.y++;\n\ +\n\ + sum = sum + tempA0 * tempB0;\n\ + }\n\ +\n\ + coord_b.y = get_global_id(1);\n\ + coord_b.z = get_global_id(2) + i * get_global_size(2);\n\ + write_imagef(output, coord_b, sum);\n\ + }\n\ +}\n\ +\n\ +#define GEMM_MERGE(name, dst_type, read_image_type, convert_type, write_image_type) \\\n\ +__kernel void gemm_##name##_merge( \\\n\ + __read_only image2d_array_t inputA, \\\n\ + __read_only image2d_array_t inputB, \\\n\ + __write_only image2d_array_t output, \\\n\ + int M, \\\n\ + int K, \\\n\ + int N, \\\n\ + int ac2zero, \\\n\ + int bc2zero, \\\n\ + float scale_a, \\\n\ + float zp_a, \\\n\ + float scale_b, \\\n\ + float zp_b, \\\n\ + float scale_out, \\\n\ + float zp_out, \\\n\ + int outer) \\\n\ +{ \\\n\ + for(int i = 0; i < outer; i++) \\\n\ + { \\\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0); \\\n\ + float4 sum = (float4)(0); \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(; coord_a.x < K;) \\\n\ + { \\\n\ + float4 tempA0; \\\n\ + float4 tempB0; \\\n\ + \\\n\ + tempA0 = convert_float4(read_image_type(inputA, coord_a)); \\\n\ + tempB0 = convert_float4(read_image_type(inputB, coord_b)); \\\n\ + tempA0.x = (tempA0.x - zp_a) * scale_a; \\\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b; \\\n\ + \\\n\ + coord_a.x++; \\\n\ + coord_b.y++; \\\n\ + \\\n\ + sum = sum + tempA0 * tempB0; \\\n\ + } \\\n\ + sum.x = sum.x * scale_out + zp_out; \\\n\ + dst = convert_type(sum); \\\n\ + \\\n\ + coord_b.y = get_global_id(1); \\\n\ + coord_b.z = get_global_id(2) + i * get_global_size(2); \\\n\ + write_image_type(output, coord_b, dst); \\\n\ + } \\\n\ +}\n\ +GEMM_MERGE(I8I8toI8,int4,read_imagei,convert_int4,write_imagei);\n\ +GEMM_MERGE(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui);\n\ +GEMM_MERGE(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\ +\n\ +"; /* end of matrixmul_cross_cl*/ + static const char matrixmul_transA_cl[] = "__kernel void gemm_transa_F32F32toF32_2D(\n\ __read_only image2d_t inputA,\n\ __read_only image2d_t inputB,\n\ @@ -59324,6 +64499,85 @@ __kernel void moments_axis2_BF16toF32(\n\ }\n\ "; /* end of moments_axis2_cl*/ +static const char nearest_grid_sample_cl[] = "__kernel void nearest_grid_sample_F32_F32toF32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float half_input0_w,\n\ + float half_input0_h,\n\ + float add_float_value_w,\n\ + float add_float_value_h,\n\ + int depth\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\ +\n\ + float fx = read_imagef(input1, coord_in1).x;\n\ + coord_in1.x = coord_in1.x + 1;\n\ + float fy = read_imagef(input1, coord_in1).x;\n\ +\n\ + fx = fx * half_input0_w + add_float_value_w;\n\ + fy = fy * half_input0_h + add_float_value_h;\n\ + int x_index = convert_int(fx);\n\ + int y_index = convert_int(fy);\n\ + int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\ +\n\ + float4 dst;\n\ +\n\ + while (coord_in.z < depth){\n\ + dst = read_imagef(input0, coord_in);\n\ + write_imagef(output, coord_out, dst);\n\ + coord_in.z++;\n\ + coord_out.z++;\n\ + }\n\ +}\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_U8_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float half_input0_w,\n\ + float half_input0_h,\n\ + float add_float_value_w,\n\ + float add_float_value_h,\n\ + int depth,\n\ + float in0_scale,\n\ + float in0_tail,\n\ + float in1_scale,\n\ + float in1_tail,\n\ + float out_scale,\n\ + float out_tail\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\ +\n\ + float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\ + coord_in1.x = coord_in1.x + 1;\n\ + float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\ +\n\ + fx = fx * half_input0_w + add_float_value_w;\n\ + fy = fy * half_input0_h + add_float_value_h;\n\ + int x_index = convert_int(fx);\n\ + int y_index = convert_int(fy);\n\ + int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\ +\n\ + float4 val;\n\ + uint4 dst;\n\ +\n\ + while (coord_in.z < depth){\n\ + val = convert_float4(read_imageui(input0, coord_in)) * in0_scale + 
in0_tail;\n\ + dst = convert_uint4_rte(val * out_scale + out_tail);\n\ + write_imageui(output, coord_out, dst);\n\ + coord_in.z++;\n\ + coord_out.z++;\n\ + }\n\ +\n\ +}\n\ +"; /* end of nearest_grid_sample_cl*/ + static const char one_hot_cl[] = "__kernel void one_hot_F32toF32\n\ (\n\ __read_only image2d_t input,\n\ @@ -62168,6 +67422,290 @@ __kernel void resize_1d_nearest_U8toU8(\n\ }\n\ "; /* end of resize_1d_nearest_cl*/ +static const char resize_3d_bilinear_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ +\n\ +#define RESIZE_3D(in_name, out_name, read_image_type, dst_type, convert_type, write_image_type) \\\n\ +__kernel void resize_3d_bilinear_##in_name##to##out_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float scale_x, \\\n\ + float scale_y, \\\n\ + float scale_z, \\\n\ + float half_pixel_value, \\\n\ + uint in_width, \\\n\ + uint in_height, \\\n\ + uint in_depth, \\\n\ + float in_scale, \\\n\ + float in_tail, \\\n\ + float out_scale, \\\n\ + float out_tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; \\\n\ + float left_x_f = fmax(floor(in_x), 0); \\\n\ + float x_lerp = in_x - left_x_f; \\\n\ + int left_x_idx = convert_int(left_x_f); \\\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; \\\n\ + float top_y_f = fmax(floor(in_y), 0); \\\n\ + float y_lerp = in_y - top_y_f; \\\n\ + int top_y_idx = convert_int(top_y_f); \\\n\ + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value; \\\n\ + float front_z_f = fmax(floor(in_z), 0); \\\n\ + float z_lerp = in_z - front_z_f; \\\n\ + int front_z_idx = convert_int(front_z_f); \\\n\ + int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0); \\\n\ + float4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111; \\\n\ + dst_type dst; \\\n\ + \\\n\ + int dx, dy, dz; \\\n\ + dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0); \\\n\ + dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0); \\\n\ + dz = in_z < 0 ? 0 : (front_z_idx < in_depth - 1 ? 
1 : 0); \\\n\ + \\\n\ + data_000 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.y = coord_in.y + dy; \\\n\ + data_010 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.x = coord_in.x + dx; \\\n\ + data_110 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.y = coord_in.y - dy; \\\n\ + data_100 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.z = coord_in.z + dz; \\\n\ + data_101 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.y = coord_in.y + dy; \\\n\ + data_111 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.x = coord_in.x - dx; \\\n\ + data_011 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.y = coord_in.y - dy; \\\n\ + data_001 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + \\\n\ + data_000 = data_000 + (data_100 - data_000) * x_lerp; \\\n\ + data_010 = data_010 + (data_110 - data_010) * x_lerp; \\\n\ + data_000 = data_000 + (data_010 - data_000) * y_lerp; \\\n\ + \\\n\ + data_001 = data_001 + (data_101 - data_001) * x_lerp; \\\n\ + data_011 = data_011 + (data_111 - data_011) * x_lerp; \\\n\ + data_001 = data_001 + (data_011 - data_001) * y_lerp; \\\n\ + data_000 = data_000 + (data_001 - data_000) * z_lerp; \\\n\ + \\\n\ + dst = convert_type(data_000 * out_scale + out_tail); \\\n\ + \\\n\ + write_image_type(output, coord_out, dst); \\\n\ +}\n\ +RESIZE_3D(F32, F32, read_imagef, float4, convert_float4, write_imagef)\n\ +RESIZE_3D(F32, U8, read_imagef, uint4, convert_uint4, write_imageui)\n\ +RESIZE_3D(U8, F32, read_imageui, float4, convert_float4, write_imagef)\n\ +RESIZE_3D(U8, U8, read_imageui, uint4, convert_uint4, write_imageui)\n\ +RESIZE_3D(I8, I8, read_imagei, int4, convert_int4, write_imagei)\n\ +\n\ +__kernel void resize_3d_bilinear_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + uint in_width,\n\ + uint in_height,\n\ + uint in_depth,\n\ + float in_scale,\n\ + float in_tail,\n\ + float out_scale,\n\ + float out_tail\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float left_x_f = fmax(floor(in_x), 0);\n\ + float x_lerp = in_x - left_x_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value;\n\ + float top_y_f = fmax(floor(in_y), 0);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value;\n\ + float front_z_f = fmax(floor(in_z), 0);\n\ + float z_lerp = in_z - front_z_f;\n\ + int front_z_idx = convert_int(front_z_f);\n\ + int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0);\n\ + uint4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111;\n\ + float4 data_000_f, data_100_f, data_010_f, data_110_f, data_001_f, data_011_f, data_101_f, data_111_f;\n\ + uint4 dst;\n\ +\n\ + int dx, dy, dz;\n\ + dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0);\n\ + dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0);\n\ + dz = in_z < 0 ? 
0 : (front_z_idx < in_depth - 1 ? 1 : 0);\n\ +\n\ + data_000 = read_imageui(input, coord_in);\n\ + data_000 = data_000 << 16;\n\ + coord_in.y = coord_in.y + dy;\n\ + data_010 = read_imageui(input, coord_in);\n\ + data_010 = data_010 << 16;\n\ + coord_in.x = coord_in.x + dx;\n\ + data_110 = read_imageui(input, coord_in);\n\ + data_110 = data_110 << 16;\n\ + coord_in.y = coord_in.y - dy;\n\ + data_100 = read_imageui(input, coord_in);\n\ + data_100 = data_100 << 16;\n\ + coord_in.z = coord_in.z + dz;\n\ + data_101 = read_imageui(input, coord_in);\n\ + data_101 = data_101 << 16;\n\ + coord_in.y = coord_in.y + dy;\n\ + data_111 = read_imageui(input, coord_in);\n\ + data_111 = data_111 << 16;\n\ + coord_in.x = coord_in.x - dx;\n\ + data_011 = read_imageui(input, coord_in);\n\ + data_011 = data_011 << 16;\n\ + coord_in.y = coord_in.y - dy;\n\ + data_001 = read_imageui(input, coord_in);\n\ + data_001 = data_001 << 16;\n\ +\n\ + _viv_asm(COPY, data_000_f, data_000, 16);\n\ + _viv_asm(COPY, data_010_f, data_010, 16);\n\ + _viv_asm(COPY, data_110_f, data_110, 16);\n\ + _viv_asm(COPY, data_100_f, data_100, 16);\n\ + _viv_asm(COPY, data_101_f, data_101, 16);\n\ + _viv_asm(COPY, data_111_f, data_111, 16);\n\ + _viv_asm(COPY, data_011_f, data_011, 16);\n\ + _viv_asm(COPY, data_001_f, data_001, 16);\n\ +\n\ + data_000_f = data_000_f + (data_100_f - data_000_f) * x_lerp;\n\ + data_010_f = data_010_f + (data_110_f - data_010_f) * x_lerp;\n\ + data_000_f = data_000_f + (data_010_f - data_000_f) * y_lerp;\n\ +\n\ + data_001_f = data_001_f + (data_101_f - data_001_f) * x_lerp;\n\ + data_011_f = data_011_f + (data_111_f - data_011_f) * x_lerp;\n\ + data_001_f = data_001_f + (data_011_f - data_001_f) * y_lerp;\n\ + data_000_f = data_000_f + (data_001_f - data_000_f) * z_lerp;\n\ +\n\ + _viv_asm(COPY, dst, data_000_f, 16);\n\ + dst = dst >> 16;\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +"; /* end of resize_3d_bilinear_cl*/ + +static const char resize_3d_nearest_cl[] = "\n\ +#define NEAREST_INDEX_PROCESS() \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x + round_value; \\\n\ + int in_x_idx = convert_int(in_x); \\\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y + round_value; \\\n\ + int in_y_idx = convert_int(in_y); \\\n\ + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z + round_value; \\\n\ + int in_z_idx = convert_int(in_z); \\\n\ +\n\ +__kernel void resize_3d_nearest_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + float4 dst;\n\ + dst = read_imagef(input, coord_in);\n\ + write_imagef(output, coord_out, dst);\n\ +}\n\ +\n\ +\n\ +__kernel void resize_3d_nearest_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + uint4 dst;\n\ + dst = convert_uint4(convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail);\n\ + 
write_imageui(output, coord_out, dst);\n\ +}\n\ +\n\ +__kernel void resize_3d_nearest_U8toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + float4 dst;\n\ + dst = convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail;\n\ + write_imagef(output, coord_out, dst);\n\ +}\n\ +\n\ +__kernel void resize_3d_nearest_F32toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + uint4 dst;\n\ + dst = convert_uint4(read_imagef(input, coord_in) * output_scale + output_tail);\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +\n\ +__kernel void resize_3d_nearest_I8toI8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + int4 dst;\n\ + dst = convert_int4(convert_float4(read_imagei(input, coord_in)) * output_scale);\n\ + write_imagei(output, coord_out, dst);\n\ +}\n\ +\n\ +__kernel void resize_3d_nearest_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + uint4 dst;\n\ + dst = read_imageui(input, coord_in);\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +\n\ +"; /* end of resize_3d_nearest_cl*/ + static const char resize_bilinear_cl[] = "__kernel void resize_bilinear_F32toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -64556,7 +70094,7 @@ __kernel void swish_F32toU8_2D(\n\ }"; /* end of swish_cl*/ static const char tile_cl[] = "\n\ -#define TILE_3D(name0, name1, data_type, read_image_func, write_image_func) \\\n\ +#define TILE_3D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \\\n\ __kernel void tile_##name0##to##name1 \\\n\ ( \\\n\ __read_only image2d_array_t input, \\\n\ @@ -64567,7 +70105,9 @@ __kernel void tile_##name0##to##name1 \\\n\ int multiples_0, \\\n\ int multiples_1, \\\n\ int multiples_2, \\\n\ - int multiples_3 \\\n\ + int multiples_3, \\\n\ + float inoutscale, \\\n\ + float inouttail \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -64575,7 +70115,9 @@ __kernel void tile_##name0##to##name1 \\\n\ int width = get_image_width(input); \\\n\ int height = get_image_height(input); \\\n\ \\\n\ - data_type src; \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + \\\n\ read_image_func(src, input, coord); \\\n\ \\\n\ int batch_id = (short)coord.z / (short)depthIn; \\\n\ @@ -64597,17 +70139,19 @@ __kernel void tile_##name0##to##name1 \\\n\ for (int x = 0; x < 
multiples_0; x++) \\\n\ { \\\n\ coord_out.x = coord.x + x * width; \\\n\ - write_image_func(output, coord_out.xyzw, src); \\\n\ + dst = conv_type(convert_float4(src) * inoutscale + inouttail); \\\n\ + write_image_func(output, coord_out.xyzw, dst); \\\n\ } \\\n\ } \\\n\ } \\\n\ } \\\n\ }\n\ -TILE_3D(I32, I32, int4, READ_IMAGEI_2DARRAY, write_imagei)\n\ -TILE_3D(U32, U32, uint4, READ_IMAGEUI_2DARRAY, write_imageui)\n\ -TILE_3D(F32, F32, float4, READ_IMAGEF_2DARRAY, write_imagef)\n\ +TILE_3D(I32, I32, int4, int4, convert_int4_rte, READ_IMAGEI_2DARRAY, write_imagei)\n\ +TILE_3D(U32, U32, uint4, uint4, convert_uint4_rte, READ_IMAGEUI_2DARRAY, write_imageui)\n\ +TILE_3D(F32, F32, float4, float4,convert_float4_rte,READ_IMAGEF_2DARRAY, write_imagef)\n\ +TILE_3D(F32, U32, float4, uint4, convert_uint4_rte, READ_IMAGEF_2DARRAY, write_imageui)\n\ \n\ -#define TILE_2D(name0, name1, data_type, read_image_func, write_image_func) \\\n\ +#define TILE_2D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \\\n\ __kernel void tile_##name0##to##name1##_2D \\\n\ ( \\\n\ __read_only image2d_t input, \\\n\ @@ -64618,7 +70162,9 @@ __kernel void tile_##name0##to##name1##_2D \\\n\ int multiples_0, \\\n\ int multiples_1, \\\n\ int multiples_2, \\\n\ - int multiples_3 \\\n\ + int multiples_3, \\\n\ + float inoutscale, \\\n\ + float inouttail \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -64627,22 +70173,25 @@ __kernel void tile_##name0##to##name1##_2D \\\n\ int output_width = get_image_width(output); \\\n\ int output_height = get_image_height(output); \\\n\ \\\n\ - data_type src = read_image_func(input, coord); \\\n\ + src_type src = read_image_func(input, coord); \\\n\ + dst_type dst; \\\n\ \\\n\ do \\\n\ { \\\n\ do \\\n\ { \\\n\ - write_image_func(output, coord, src); \\\n\ + dst = conv_type(convert_float4(src) * inoutscale + inouttail); \\\n\ + write_image_func(output, coord, dst); \\\n\ coord.x += width; \\\n\ } while (coord.x < output_width); \\\n\ coord.x = get_global_id(0); \\\n\ coord.y += height; \\\n\ } while (coord.y < output_height); \\\n\ }\n\ -TILE_2D(I32, I32, int4, read_imagei, write_imagei)\n\ -TILE_2D(U32, U32, uint4, read_imageui, write_imageui)\n\ -TILE_2D(F32, F32, float4, read_imagef, write_imagef)\n\ +TILE_2D(I32, I32, int4, int4, convert_int4_rte, read_imagei, write_imagei)\n\ +TILE_2D(U32, U32, uint4, uint4, convert_uint4_rte, read_imageui, write_imageui)\n\ +TILE_2D(F32, F32, float4, float4,convert_float4_rte,read_imagef, write_imagef)\n\ +TILE_2D(F32, U32, float4, uint4, convert_uint4_rte, read_imagef, write_imageui)\n\ \n\ \n\ \n\ @@ -65903,9 +71452,13 @@ static const source_map_t evis_resource[] = {"cumsum_vx", cumsum_vx}, {"cumsum_2d_vx", cumsum_2d_vx}, {"cumsum_bf16_vx", cumsum_bf16_vx}, + {"cumsum_ex_rev_axis0_vx", cumsum_ex_rev_axis0_vx}, + {"cumsum_ex_rev_axis1_vx", cumsum_ex_rev_axis1_vx}, + {"cumsum_ex_rev_axis2_vx", cumsum_ex_rev_axis2_vx}, {"cumsum_f16_u8_vx", cumsum_f16_u8_vx}, {"custom_softmax_vx", custom_softmax_vx}, {"custom_warp_affine_vx", custom_warp_affine_vx}, + {"custom_warp_affine_rgb_vx", custom_warp_affine_rgb_vx}, {"custom_warp_perspective_vx", custom_warp_perspective_vx}, {"depth2space_crd_vx", depth2space_crd_vx}, {"depthwise_conv1d_src0_vx", depthwise_conv1d_src0_vx}, @@ -65988,12 +71541,15 @@ static const source_map_t evis_resource[] = {"lstmunit_activation_S_F16_vx", lstmunit_activation_S_F16_vx}, {"lstmunit_activation_S_U8_vx", lstmunit_activation_S_U8_vx}, {"matrixmul_bf16_vx", 
matrixmul_bf16_vx}, + {"matrixmul_cross_vx", matrixmul_cross_vx}, + {"matrixmul_cross_i16_vx", matrixmul_cross_i16_vx}, {"matrixmul_f16_vx", matrixmul_f16_vx}, {"matrixmul_f16f16_u8_vx", matrixmul_f16f16_u8_vx}, {"matrixmul_f16i16_i16_vx", matrixmul_f16i16_i16_vx}, {"matrixmul_f16u8_f16_vx", matrixmul_f16u8_f16_vx}, {"matrixmul_f16u8_u8_vx", matrixmul_f16u8_u8_vx}, {"matrixmul_i16_vx", matrixmul_i16_vx}, + {"matrixmul_merge_vx", matrixmul_merge_vx}, {"matrixmul_transA_vx", matrixmul_transA_vx}, {"matrixmul_transB_f16_vx", matrixmul_transB_f16_vx}, {"matrixmul_transB_f16_mix_vx", matrixmul_transB_f16_mix_vx}, @@ -66015,6 +71571,12 @@ static const source_map_t evis_resource[] = {"moments_axis2_vx", moments_axis2_vx}, {"moments_u8_vx", moments_u8_vx}, {"moments_u8_axis012_vx", moments_u8_axis012_vx}, + {"nearest_grid_sample_BF16_to_BF16_vx", nearest_grid_sample_BF16_to_BF16_vx}, + {"nearest_grid_sample_F16_to_F16_vx", nearest_grid_sample_F16_to_F16_vx}, + {"nearest_grid_sample_F16_to_U8_vx", nearest_grid_sample_F16_to_U8_vx}, + {"nearest_grid_sample_I16_to_I16_vx", nearest_grid_sample_I16_to_I16_vx}, + {"nearest_grid_sample_I8_to_I8_vx", nearest_grid_sample_I8_to_I8_vx}, + {"nearest_grid_sample_U8_to_U8_vx", nearest_grid_sample_U8_to_U8_vx}, {"one_hot_vx", one_hot_vx}, {"poolwithargmax_F16_vx", poolwithargmax_F16_vx}, {"poolwithargmax_I16_vx", poolwithargmax_I16_vx}, @@ -66031,9 +71593,15 @@ static const source_map_t evis_resource[] = {"pre_process_rgb888_planar_0_vx", pre_process_rgb888_planar_0_vx}, {"pre_process_rgb888_planar_1_vx", pre_process_rgb888_planar_1_vx}, {"pre_process_rgb888_planar_2_vx", pre_process_rgb888_planar_2_vx}, + {"pre_process_rgb888_planar_nhwc_0_vx", pre_process_rgb888_planar_nhwc_0_vx}, + {"pre_process_rgb888_planar_nhwc_1_vx", pre_process_rgb888_planar_nhwc_1_vx}, + {"pre_process_rgb888_planar_nhwc_2_vx", pre_process_rgb888_planar_nhwc_2_vx}, {"pre_process_rgb888_planar_sep_0_vx", pre_process_rgb888_planar_sep_0_vx}, {"pre_process_rgb888_planar_sep_1_vx", pre_process_rgb888_planar_sep_1_vx}, {"pre_process_rgb888_planar_sep_2_vx", pre_process_rgb888_planar_sep_2_vx}, + {"pre_process_rgb888_planar_sep_nhwc_0_vx", pre_process_rgb888_planar_sep_nhwc_0_vx}, + {"pre_process_rgb888_planar_sep_nhwc_1_vx", pre_process_rgb888_planar_sep_nhwc_1_vx}, + {"pre_process_rgb888_planar_sep_nhwc_2_vx", pre_process_rgb888_planar_sep_nhwc_2_vx}, {"pre_process_rgb_copy_vx", pre_process_rgb_copy_vx}, {"pre_process_yuv420_copy_vx", pre_process_yuv420_copy_vx}, {"pre_process_yuv420_scale_0_vx", pre_process_yuv420_scale_0_vx}, @@ -66092,6 +71660,8 @@ static const source_map_t evis_resource[] = {"scatter_nd_update_vx", scatter_nd_update_vx}, {"scatter_nd_update_atom_vx", scatter_nd_update_atom_vx}, {"scatter_nd_update_big_vx", scatter_nd_update_big_vx}, + {"scatter_nd_update_fp_vx", scatter_nd_update_fp_vx}, + {"scatter_nd_update_qint_vx", scatter_nd_update_qint_vx}, {"scatter_nd_update_special_vx", scatter_nd_update_special_vx}, {"select_vx", select_vx}, {"sequence_mask_vx", sequence_mask_vx}, @@ -66102,6 +71672,8 @@ static const source_map_t evis_resource[] = {"tensorstackconcat_vx", tensorstackconcat_vx}, {"tile_vx", tile_vx}, {"tile_mix_vx", tile_mix_vx}, + {"tiny_yolov4_postprocess_box_vx", tiny_yolov4_postprocess_box_vx}, + {"tiny_yolov4_postprocess_confidence_vx", tiny_yolov4_postprocess_confidence_vx}, {"upsample_F16_vx", upsample_F16_vx}, {"upsample_I16_vx", upsample_I16_vx}, {"upsample_I8_vx", upsample_I8_vx}, @@ -66192,6 +71764,7 @@ static const source_map_t cl_resource[] = 
{"lstmunit_activation_S_F32_cl", lstmunit_activation_S_F32_cl}, {"lstmunit_activation_S_U8_cl", lstmunit_activation_S_U8_cl}, {"matrixmul_cl", matrixmul_cl}, + {"matrixmul_cross_cl", matrixmul_cross_cl}, {"matrixmul_transA_cl", matrixmul_transA_cl}, {"maximum_cl", maximum_cl}, {"maxpoolwithargmax_cl", maxpoolwithargmax_cl}, @@ -66204,6 +71777,7 @@ static const source_map_t cl_resource[] = {"moments_axis012_cl", moments_axis012_cl}, {"moments_axis1_cl", moments_axis1_cl}, {"moments_axis2_cl", moments_axis2_cl}, + {"nearest_grid_sample_cl", nearest_grid_sample_cl}, {"one_hot_cl", one_hot_cl}, {"poolwithargmax_cl", poolwithargmax_cl}, {"pow_cl", pow_cl}, @@ -66229,6 +71803,8 @@ static const source_map_t cl_resource[] = {"repeat_cl", repeat_cl}, {"resize_1d_bilinear_cl", resize_1d_bilinear_cl}, {"resize_1d_nearest_cl", resize_1d_nearest_cl}, + {"resize_3d_bilinear_cl", resize_3d_bilinear_cl}, + {"resize_3d_nearest_cl", resize_3d_nearest_cl}, {"resize_bilinear_cl", resize_bilinear_cl}, {"resize_nearest_cl", resize_nearest_cl}, {"reversesequence_cl", reversesequence_cl}, diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c index 8462aad82..2c63c1e5e 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c @@ -33,6 +33,7 @@ #include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vsi_nn_libnnext_resource.h" +#include "vsi_nn_error.h" static char s_vx_resource_path[VSI_NN_MAX_PATH] = "VX"; @@ -63,6 +64,11 @@ uint8_t * vsi_nn_LoadBinarySource fseek( fp, 0, SEEK_SET ); buf = (uint8_t *)malloc( len + 1 ); + if (buf == NULL) + { + fclose( fp ); + return NULL; + } n = (int32_t)fread( buf, 1, len, fp ); fclose( fp ); @@ -208,7 +214,10 @@ static vsi_status vsi_nn_RegisterVXKernel evis = context->config.evis.ver; program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *)); + CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final ); program_len = (vx_size*)malloc(kernel_info->resource_num * sizeof(vx_size)); + CHECK_PTR_FAIL_GOTO( program_len, "Create buffer fail.", final ); + for (i = 0; i < kernel_info->resource_num; i++) { program_src[i] = vsi_nn_resource_load_source_code( @@ -228,7 +237,7 @@ static vsi_status vsi_nn_RegisterVXKernel { VSILOGE("[%s : %d] vxCreateProgramWithSource() Error!\n", __FILE__, __LINE__); status = VSI_FAILURE; - goto OnError; + goto final; } if(evis == VSI_NN_HW_EVIS_NONE) @@ -267,16 +276,17 @@ static vsi_status vsi_nn_RegisterVXKernel { VSILOGE( "Add kernel %s fail.", kernel->name ); } -OnError: +final: for (i = 0; i < kernel_info->resource_num; i++) { - if (program_src[i] && load_from_file) + if (load_from_file && program_src[i]) { free((char *)program_src[i]); } } if(program_src) free((char**)program_src); if(program_len) free(program_len); + return status; } @@ -286,7 +296,7 @@ static vsi_status vsi_nn_RegisterBinKernel vsi_nn_kernel_info_t * kernel_info ) { - vsi_status status; + vsi_status status = VSI_FAILURE; vx_kernel obj; vx_program program = NULL; vx_size program_len = 0; @@ -308,6 +318,11 @@ static vsi_status vsi_nn_RegisterBinKernel program_ptr = vsi_nn_VxBinResourceGetResource( kernel_info->resource_name[kernel_info->resource_num - 1], &program_len); + if (program_ptr == NULL) + { + VSILOGE("[%s : %d] vsi_nn_VxBinResourceGetResource() Error!\n", __FILE__, __LINE__); + return status; + } program = vxCreateProgramWithBinary(ctx, (const vx_uint8 *)program_ptr, program_len); status = 
vxGetStatus((vx_reference)program); @@ -396,10 +411,19 @@ vx_node vsi_nn_RegisterClientKernelAndNewNode ) { vsi_status status; - vx_context ctx; - vx_kernel obj; - vx_node node; - vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; + vx_context ctx = NULL; + vx_kernel obj = NULL; + vx_node node = NULL; + vx_kernel_description_t * kernel = NULL; + + if (kernel_info->kernel) + { + kernel = kernel_info->kernel[kernel_info->kernel_index]; + } + else + { + goto final; + } ctx = vxGetContext( (vx_reference)graph->g ); @@ -444,6 +468,8 @@ vx_node vsi_nn_RegisterClientKernelAndNewNode kernel->name, status ); return NULL; } + +final: return node; } /* vsi_nn_RegisterClientKernelAndNewNode() */ @@ -501,6 +527,10 @@ vsi_status VX_CALLBACK vsi_nn_KernelValidator vx_meta_format metas[] ) { + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(parameters); + VSI_UNREFERENCED(num); + VSI_UNREFERENCED(metas); return VSI_SUCCESS; } /* vsi_nn_KernelValidator() */ @@ -511,6 +541,9 @@ vsi_status VX_CALLBACK vsi_nn_KernelInitializer uint32_t paraNum ) { + VSI_UNREFERENCED(nodObj); + VSI_UNREFERENCED(paramObj); + VSI_UNREFERENCED(paraNum); return VSI_SUCCESS; } /* vsi_nn_KernelInitializer() */ @@ -521,6 +554,9 @@ vsi_status VX_CALLBACK vsi_nn_KernelDeinitializer uint32_t paraNum ) { + VSI_UNREFERENCED(nodObj); + VSI_UNREFERENCED(paraObj); + VSI_UNREFERENCED(paraNum); return VSI_SUCCESS; } /* vsi_nn_KernelDeinitializer() */ @@ -543,6 +579,8 @@ const uint8_t * vsi_nn_VxBinResourceGetResource vx_size *len ) { + VSI_UNREFERENCED(name); + VSI_UNREFERENCED(len); return NULL; } /* vsi_nn_VxResourceGetBinResource() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c b/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c index 1f371d471..97da8bd51 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c @@ -39,6 +39,7 @@ #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -78,6 +79,7 @@ static vsi_bool op_check attr.vtl = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; a_times_b[0] = vsi_nn_CreateTensor(self->graph, &attr); + CHECK_PTR_FAIL_GOTO(a_times_b[0], "Create tensor failed", final); ret = vsi_nn_OpCheck(VSI_NN_OP_MULTIPLY, self, inputs, a_times_b); if (!ret) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c index 078d708a7..b248d9054 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c @@ -34,6 +34,7 @@ #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" #include "vsi_nn_kernel_prv.h" +#include "vsi_nn_error.h" static int32_t _get_input_num ( @@ -91,6 +92,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -101,6 +104,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -112,6 +118,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -122,7 +130,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool 
ret = TRUE; + vsi_bool ret = FALSE; uint32_t i; vsi_nn_tensor_attr_t attr; vsi_nn_internal_node_t* curr = NULL; @@ -134,6 +142,12 @@ static vsi_bool op_setup input_num = _get_input_num(self, inputs); + if (input_num < 2) + { + VSILOGE( "Wrong input tensor number = %u.", input_num ); + return FALSE; + } + is_sp_supported = vsi_nn_is_sp_supported_broadcast(self->graph, inputs, input_num, outputs[0]); for(i = 0; i < input_num -1; i++) @@ -142,6 +156,7 @@ static vsi_bool op_setup /* setup input for each add */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if(i == 0) { curr->inputs[0] = inputs[i]; @@ -174,6 +189,7 @@ static vsi_bool op_setup } temp_output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(temp_output_tensor, curr, "Create internal tensor failed", final); curr->outputs[0] = temp_output_tensor->t; } @@ -182,8 +198,10 @@ static vsi_bool op_setup curr->outputs[0] = outputs[0]; } - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } + +final: return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c index 23248759e..6252e4d52 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c @@ -70,6 +70,9 @@ static vsi_bool op_check ) { /*TODO: Check tensor shapes. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -80,6 +83,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.size[0] = inputs[1]->attr.size[0]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c index 56889cbed..0e6fa13e5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c @@ -248,6 +248,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); /* TODO: Add code to comput outputs' shape. 
*/ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c index a969fa6b5..7afa231b4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c @@ -78,6 +78,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); inputs[BI_LSTM_FW_INPUT_H_STATE] = output_tensor->t; } @@ -91,6 +92,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); inputs[BI_LSTM_BW_INPUT_H_STATE] = output_tensor->t; } @@ -119,6 +121,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -128,6 +132,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -139,6 +145,9 @@ static vsi_bool op_check ) { /*TODO: Check tensor shapes. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -150,6 +159,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -183,6 +194,9 @@ static vsi_bool op_setup vsi_size_t batch_size = 0; uint32_t time_step = 0; vsi_size_t i = 0; + vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t** merge_tensors = NULL; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -207,6 +221,7 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[BI_LSTM_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); input_tensor = output_tensor->t; } @@ -219,6 +234,7 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[BI_LSTM_AUX_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); aux_input_tensor = output_tensor->t; } } @@ -231,10 +247,12 @@ static vsi_bool op_setup CHECK_PTR_FAIL_GOTO( reshape_output_tensors, "Create buffer fail.", final ); memset( reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); /* split aux input tensor */ if(has_aux_input) @@ -246,10 +264,12 @@ static vsi_bool op_setup CHECK_PTR_FAIL_GOTO( aux_reshape_output_tensors, "Create buffer fail.", final ); memset( aux_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, aux_input_tensor, + status = 
vsi_nn_rnn_split_input_tensor(self, aux_input_tensor, aux_split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); } /* prepare output tensor */ @@ -267,6 +287,7 @@ static vsi_bool op_setup /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); reshape_output_tensors[i] = output_tensor->t; if (has_aux_input) @@ -274,6 +295,7 @@ static vsi_bool op_setup /* reshape for aux split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, aux_split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); aux_reshape_output_tensors[i] = output_tensor->t; } } @@ -291,21 +313,25 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out0 = output_tensor->t; /* lstmcell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out1 = output_tensor->t; /* lstmcell output c_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out2 = output_tensor->t; curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation; curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip; curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias; @@ -373,6 +399,7 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, lstmcell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_reshape_output_tensors_fw[i] = output_tensor->t; } @@ -391,21 +418,25 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out0 = output_tensor->t; /* lstmcell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out1 = output_tensor->t; /* lstmcell output c_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); 
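/* Editor's sketch, not part of the patch: nearly every hunk in this change applies the same
 * two defensive patterns -- VSI_UNREFERENCED(x) to silence unused-parameter warnings in stub
 * callbacks, and CHECK_PTR_FAIL_GOTO(ptr, msg, label) to jump to a shared `final:` cleanup
 * label when an internal node/tensor allocation returns NULL, with `ret` defaulting to FALSE
 * and only set to TRUE on the success path. The standalone example below illustrates that
 * control flow only; the macro definitions, op_setup_sketch() and the malloc() calls are
 * stand-ins assumed for illustration and only approximate the real helpers declared in
 * vsi_nn_error.h -- they are not the driver's actual definitions. */
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>

#define VSI_UNREFERENCED(x) ((void)(x))
#define CHECK_PTR_FAIL_GOTO(ptr, msg, lbl) \
    do { if ((ptr) == NULL) { fprintf(stderr, "%s\n", (msg)); goto lbl; } } while (0)

static bool op_setup_sketch(void *self)
{
    bool  ret    = false;   /* pessimistic default, set to true only on success */
    void *node   = NULL;
    void *tensor = NULL;

    VSI_UNREFERENCED(self); /* silences -Wunused-parameter for stub arguments */

    node = malloc(16);      /* stands in for vsi_nn_internal_new_node() */
    CHECK_PTR_FAIL_GOTO(node, "Create internal node failed", final);

    tensor = malloc(16);    /* stands in for vsi_nn_internal_new_tensor() */
    CHECK_PTR_FAIL_GOTO(tensor, "Create internal tensor fail.", final);

    ret = true;
final:
    free(tensor);           /* shared cleanup runs on both success and failure */
    free(node);
    return ret;
}

int main(void) { return op_setup_sketch(NULL) ? 0 : 1; }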
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out2 = output_tensor->t; curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation; curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip; curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias; @@ -473,12 +504,12 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, lstmcell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_reshape_output_tensors_bw[i] = output_tensor->t; } if(curr_param->merge_outputs) { - vsi_nn_tensor_t** merge_tensors = NULL; merge_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); CHECK_PTR_FAIL_GOTO( merge_tensors, "Create buffer fail.", final ); memset( merge_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); @@ -489,6 +520,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } @@ -499,8 +531,10 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 2, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 0; curr->inputs[0] = lstmcell_reshape_output_tensors_fw[i]; curr->inputs[1] = lstmcell_reshape_output_tensors_bw[i]; @@ -512,6 +546,7 @@ static vsi_bool op_setup /* concat lstmcell output, the lstm's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -526,7 +561,6 @@ static vsi_bool op_setup vsi_nn_rnn_transpose_time_major(self, tensor, outputs[BI_LSTM_FW_OUTPUT_OUTPUT], use_virtual_tensor); } - vsi_nn_safe_free( merge_tensors ); } else { @@ -537,12 +571,14 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } /* concat lstmcell output, the lstm's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -565,12 +601,14 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } /* concat lstmcell output, the lstm's 
output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -587,7 +625,10 @@ static vsi_bool op_setup } } + ret = TRUE; + final: + vsi_nn_safe_free( merge_tensors ); vsi_nn_safe_free( split_output_tensors ); vsi_nn_safe_free( aux_split_output_tensors ) vsi_nn_safe_free( reshape_output_tensors ); @@ -595,7 +636,7 @@ static vsi_bool op_setup vsi_nn_safe_free( lstmcell_reshape_output_tensors_fw ); vsi_nn_safe_free( lstmcell_reshape_output_tensors_bw ); - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c index c122de7f5..8b3844de0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c @@ -79,6 +79,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); inputs[BI_RNN_FW_INPUT_H_STATE] = output_tensor->t; } @@ -92,6 +93,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); inputs[BI_RNN_BW_INPUT_H_STATE] = output_tensor->t; } @@ -103,6 +105,7 @@ static vsi_bool setup_op_shapes attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); outputs[BI_RNN_FW_OUTPUT_H_STATE] = output_tensor->t; } @@ -114,6 +117,7 @@ static vsi_bool setup_op_shapes attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); outputs[BI_RNN_BW_OUTPUT_H_STATE] = output_tensor->t; } @@ -162,6 +166,8 @@ static vsi_bool setup_op_shapes } } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -171,6 +177,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -181,6 +189,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -193,6 +204,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -225,6 +238,9 @@ static vsi_bool op_setup vsi_size_t batch_size = 0; vsi_size_t time_step = 0; vsi_size_t i = 0; + vsi_bool ret = FALSE; + vsi_nn_tensor_t** merge_tensors = NULL; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -249,6 +265,7 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[BI_RNN_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); input_tensor = output_tensor->t; } @@ -261,6 +278,7 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[BI_RNN_AUX_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); aux_input_tensor = output_tensor->t; } } @@ -273,10 +291,12 @@ static vsi_bool op_setup CHECK_PTR_FAIL_GOTO( reshape_output_tensors, "Create buffer fail.", final ); memset( reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); /* split aux input tensor */ if(has_aux_input) @@ -288,10 +308,13 @@ static vsi_bool op_setup CHECK_PTR_FAIL_GOTO( aux_reshape_output_tensors, "Create buffer fail.", final ); memset( aux_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, aux_input_tensor, + status = vsi_nn_rnn_split_input_tensor(self, aux_input_tensor, aux_split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, + (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); } /* prepare output tensor */ @@ -309,6 +332,7 @@ static vsi_bool op_setup /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); reshape_output_tensors[i] = output_tensor->t; if (has_aux_input) @@ -316,6 +340,7 @@ static vsi_bool op_setup /* reshape for aux split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, aux_split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); aux_reshape_output_tensors[i] = output_tensor->t; } } @@ -331,12 +356,14 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( 
output_tensor, "Create internal tensor fail.", final ); rnncell_out0 = output_tensor->t; /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_out1 = output_tensor->t; if (reshape_output_tensors[i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && @@ -366,6 +393,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, curr_param->internal_dtype, @@ -399,6 +427,7 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, rnncell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_reshape_output_tensors_fw[i] = output_tensor->t; } @@ -421,12 +450,14 @@ static vsi_bool op_setup &outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); } output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_out0 = output_tensor->t; /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_BW_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_out1 = output_tensor->t; if (reshape_output_tensors[time_step - 1 - i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && @@ -456,6 +487,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, curr_param->internal_dtype, @@ -489,12 +521,12 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, rnncell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_reshape_output_tensors_bw[time_step - 1 - i] = output_tensor->t; } if(curr_param->merge_outputs) { - vsi_nn_tensor_t** merge_tensors = NULL; merge_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); CHECK_PTR_FAIL_GOTO( merge_tensors, "Create buffer fail.", final ); memset( merge_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); @@ -505,6 +537,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } @@ -515,8 +548,10 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 2, 1 ); + 
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 0; curr->inputs[0] = rnncell_reshape_output_tensors_fw[i]; curr->inputs[1] = rnncell_reshape_output_tensors_bw[i]; @@ -528,6 +563,7 @@ static vsi_bool op_setup /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -542,7 +578,6 @@ static vsi_bool op_setup vsi_nn_rnn_transpose_time_major(self, tensor, outputs[BI_RNN_FW_OUTPUT_OUTPUT], use_virtual_tensor); } - vsi_nn_safe_free( merge_tensors ); } else { @@ -553,6 +588,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } @@ -561,6 +597,7 @@ static vsi_bool op_setup if (outputs[BI_RNN_FW_OUTPUT_H_STATE] != NULL) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = last_step_h_state_fw; curr->outputs[0] = outputs[BI_RNN_FW_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); @@ -568,6 +605,7 @@ static vsi_bool op_setup /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -590,6 +628,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } @@ -598,6 +637,7 @@ static vsi_bool op_setup if (outputs[BI_RNN_BW_OUTPUT_H_STATE] != NULL) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = last_step_h_state_bw; curr->outputs[0] = outputs[BI_RNN_BW_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); @@ -605,6 +645,7 @@ static vsi_bool op_setup /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -621,6 +662,7 @@ static vsi_bool op_setup } } + ret = TRUE; final: vsi_nn_safe_free( split_output_tensors ); vsi_nn_safe_free( aux_split_output_tensors ) @@ -628,8 +670,9 @@ static vsi_bool op_setup vsi_nn_safe_free( aux_reshape_output_tensors ); vsi_nn_safe_free( rnncell_reshape_output_tensors_fw ); vsi_nn_safe_free( rnncell_reshape_output_tensors_bw ); + vsi_nn_safe_free( merge_tensors ); - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c index 878c60692..9f7e6ace9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c @@ -81,6 +81,9 @@ 
static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -92,6 +95,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = 1; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c index cac99d089..f53aeb548 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c @@ -149,6 +149,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + out_rank = inputs[0]->attr.dim_num; for (i = 0; i < out_rank; i++) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c index 1eaa7839a..e3de22fff 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c @@ -37,6 +37,7 @@ #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" /* Declare number of input and output. @@ -290,7 +291,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; if ( NULL == self ) { @@ -298,7 +299,7 @@ static vsi_bool op_setup } ret = vsi_nn_op_common_setup(self, inputs, outputs); - if ( _is_dataconvert_op(self, inputs, outputs) ) + if ( _is_dataconvert_op(self, inputs, outputs) ) { vsi_nn_internal_node_t* curr = NULL; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); @@ -309,7 +310,7 @@ static vsi_bool op_setup curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret &= vsi_nn_internal_setup_node(self, curr); } return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index 3e1db0e6d..bade3f959 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -39,6 +39,7 @@ #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" #include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -194,7 +195,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_node_t* curr = NULL; float min = self->nn_param.clip.min; float max = self->nn_param.clip.max; @@ -224,11 +225,12 @@ static vsi_bool op_setup { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); } + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); self->nn_param.clip.local2->is_internal_node = TRUE; } @@ -236,6 +238,8 @@ static vsi_bool op_setup { ret = vsi_nn_op_common_setup(self, inputs, outputs); } + +final: return ret; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_common.c b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c index 354b6ce61..f4e70c55f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_common.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c @@ -38,6 +38,9 @@ vsi_status vsi_nn_op_common_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + 
VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); //TODO: assert_always() return VSI_FAILURE; } /* op_common_init() */ @@ -64,6 +67,7 @@ vsi_bool vsi_nn_op_common_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; @@ -81,5 +85,8 @@ vsi_status vsi_nn_op_common_optimize vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return VSI_SUCCESS; } /* op_common_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c index bb1be6e1a..47b5889df 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c @@ -159,6 +159,8 @@ static vsi_status copy_tensor_to_view vsi_status ret; vsi_nn_concat_lcl_data * data; + VSI_UNREFERENCED(axis); + ret = VSI_SUCCESS; /* Malloc ptr */ data = (vsi_nn_concat_lcl_data *)malloc( sizeof(vsi_nn_concat_lcl_data) ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c index f07a690eb..f802f44e9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c @@ -32,6 +32,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" typedef struct _conv1d_local_data_t { vsi_bool use_ext_pad; @@ -324,12 +325,16 @@ static vsi_bool op_setup memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tensor, "Create tensor fail.", final ); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_PAD, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); front_data = (uint32_t*)\ vsi_nn_internal_new_node_param(curr, sizeof(uint32_t) * inputs[0]->attr.dim_num); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(front_data, curr, "Create internal buffer failed", final); back_data = (uint32_t*)\ vsi_nn_internal_new_node_param(curr, sizeof(uint32_t) * inputs[0]->attr.dim_num); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(back_data, curr, "Create internal buffer failed", final); front_data[0] = p->pad[0]; front_data[1] = 0; @@ -353,6 +358,8 @@ static vsi_bool op_setup } return TRUE; +final: + return FALSE; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c index 03118aaa2..2e1ae75f5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c @@ -38,6 +38,7 @@ #include "vsi_nn_error.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" +#include "vsi_nn_error.h" static vsi_nn_internal_tensor_t * reshape_cell_out ( @@ -54,11 +55,14 @@ static vsi_nn_internal_tensor_t * reshape_cell_out vsi_nn_internal_init_tensor_attr(&attr, &cell_out->attr.dtype, TRUE); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); /* reshape cell_out [w,h,c,n] to [w,h,c,1,n] */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_cell_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + 
CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( reshape_cell_size, curr, "Create internal buffer fail.", final ); reshape_cell_size[0] = cell_out->attr.size[0]; reshape_cell_size[1] = cell_out->attr.size[1]; reshape_cell_size[2] = cell_out->attr.size[2]; @@ -71,6 +75,8 @@ static vsi_nn_internal_tensor_t * reshape_cell_out curr->outputs[0] = output_tensor->t; vsi_nn_internal_setup_node( self, curr ); + +final: return output_tensor; } /* reshape_cell_out() */ @@ -88,11 +94,14 @@ static vsi_nn_internal_tensor_t * reshape_split_out memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &split_out->attr.dtype, TRUE); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); /* reshape [w,h,c,t,n] to [w,h,c,n] */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_split_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( reshape_split_size, curr, "Create internal buffer fail.", final ); reshape_split_size[0] = split_out->attr.size[0]; reshape_split_size[1] = split_out->attr.size[1]; reshape_split_size[2] = split_out->attr.size[2]; @@ -104,10 +113,11 @@ static vsi_nn_internal_tensor_t * reshape_split_out curr->outputs[0] = output_tensor->t; vsi_nn_internal_setup_node( self, curr ); +final: return output_tensor; } /* reshape_split_out() */ -static void split_input_tensor +static vsi_status split_input_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t * input, @@ -115,6 +125,7 @@ static void split_input_tensor uint32_t time_step ) { + vsi_status status = VSI_FAILURE; uint32_t i; vsi_nn_tensor_attr_t attr; vsi_nn_internal_node_t* curr = NULL; @@ -124,7 +135,9 @@ static void split_input_tensor i = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, time_step ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, time_step * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(slices, curr, "Create internal buffer failed", final); curr->node->nn_param.split.axis = 3; /* input_shape [w,h,c,t,n] */ curr->node->nn_param.split.slices_num = time_step; curr->inputs[0] = input; @@ -135,10 +148,15 @@ static void split_input_tensor slices[i] = 1; vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, TRUE); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( output_tensor, curr, "Create internal tensor fail.", final ); curr->outputs[i] = output_tensor->t; output[i] = output_tensor->t; } vsi_nn_internal_setup_node( self, curr ); + + status = VSI_SUCCESS; +final: + return status; } /* split_input_tensor() */ static void trans_output_tensor @@ -182,13 +200,14 @@ static void trans_output_tensor } } /* trans_output_tensor() */ -static void trans_input_tensor +static vsi_status trans_input_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, vsi_nn_tensor_t ** trans_inputs ) { + vsi_status status = VSI_FAILURE; vsi_size_t perm[VSI_NN_MAX_DIM_NUM]; vsi_nn_internal_tensor_t * tmp_tensor = NULL; vsi_nn_conv2d_lstm_param * p = &self->nn_param.conv2d_lstm; @@ -203,6 +222,7 @@ static void trans_input_tensor perm[3] = 3; perm[4] = 4; tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_INPUT], NULL, perm, 5, TRUE); + CHECK_PTR_FAIL_GOTO( 
tmp_tensor, "Create internal tensor fail.", final ); trans_inputs[CONV2D_LSTM_IN_INPUT] = tmp_tensor->t; // [c,w,h,n] --> [w,h,c,n] @@ -211,9 +231,11 @@ static void trans_input_tensor perm[2] = 0; perm[3] = 3; tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_H_STATE], NULL, perm, 4, TRUE); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); trans_inputs[CONV2D_LSTM_IN_H_STATE] = tmp_tensor->t; tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_C_STATE], NULL, perm, 4, TRUE); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); trans_inputs[CONV2D_LSTM_IN_C_STATE] = tmp_tensor->t; } else @@ -222,9 +244,13 @@ static void trans_input_tensor trans_inputs[CONV2D_LSTM_IN_H_STATE] = inputs[CONV2D_LSTM_IN_H_STATE]; trans_inputs[CONV2D_LSTM_IN_C_STATE] = inputs[CONV2D_LSTM_IN_C_STATE]; } + + status = VSI_SUCCESS; +final: + return status; } /* trans_input_tensor() */ -static void create_state_tensor +static vsi_status create_state_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -234,6 +260,7 @@ static void create_state_tensor vsi_size_t out_channel ) { + vsi_status status = VSI_FAILURE; vsi_size_t samples, state_shape[4]; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t * tensor = NULL; @@ -267,6 +294,7 @@ static void create_state_tensor attr.is_const = TRUE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); inputs[CONV2D_LSTM_IN_H_STATE] = tensor->t; } @@ -280,6 +308,7 @@ static void create_state_tensor attr.is_const = TRUE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); inputs[CONV2D_LSTM_IN_C_STATE] = tensor->t; } @@ -291,6 +320,7 @@ static void create_state_tensor attr.vtl = TRUE; attr.is_const = FALSE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); outputs[CONV2D_LSTM_OUT_H_STATE] = tensor->t; } @@ -303,8 +333,12 @@ static void create_state_tensor attr.vtl = TRUE; attr.is_const = FALSE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); outputs[CONV2D_LSTM_OUT_C_STATE] = tensor->t; } + status = VSI_SUCCESS; +final: + return status; } /* create_state_tensor() */ static vsi_bool setup_op_shapes @@ -314,6 +348,7 @@ static vsi_bool setup_op_shapes vsi_nn_tensor_t ** outputs ) { + vsi_status status = VSI_FAILURE; vsi_nn_tensor_attr_t attr; vsi_size_t w_out, h_out, samples, timestep, out_channel; vsi_size_t conv_in_shape[4]; @@ -411,7 +446,8 @@ static vsi_bool setup_op_shapes } /* create hstate and cstate input/output if app doesn't provide them */ - create_state_tensor(self, inputs, outputs, w_out, h_out, out_channel); + status = create_state_tensor(self, inputs, outputs, w_out, h_out, out_channel); + CHECK_STATUS_FAIL_GOTO(status, final); /* hidden state output */ if(VSI_NN_DIM_AUTO == outputs[CONV2D_LSTM_OUT_H_STATE]->attr.dim_num) @@ -452,6 +488,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } /* setup_op_shapes() */ static vsi_status op_compute @@ -461,6 +499,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -471,6 +511,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + 
VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -483,6 +526,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -493,6 +538,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + vsi_status status = VSI_FAILURE; vsi_size_t i, timestep, perm[VSI_NN_MAX_DIM_NUM]; vsi_nn_tensor_t * trans_inputs[3] = { NULL }; vsi_nn_tensor_t * conv2dlstm_outputs[3] = { NULL }; @@ -503,6 +549,7 @@ static vsi_bool op_setup vsi_nn_tensor_t * cell_out0 = NULL, * cell_out1 = NULL, * cell_out2 = NULL; vsi_nn_conv2d_lstm_param * p = &self->nn_param.conv2d_lstm; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(attr)); memset(perm, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); @@ -512,7 +559,8 @@ static vsi_bool op_setup setup_op_shapes(self, inputs, outputs); - trans_input_tensor(self, inputs, trans_inputs); + status = trans_input_tensor(self, inputs, trans_inputs); + CHECK_STATUS_FAIL_GOTO(status, final); split_outputs = (vsi_nn_tensor_t **)malloc(sizeof(vsi_nn_tensor_t *) * timestep); CHECK_PTR_FAIL_GOTO( split_outputs, "Create buffer fail.", final ); @@ -522,7 +570,8 @@ static vsi_bool op_setup memset(conv2dlstm_step_outputs, 0, sizeof(vsi_nn_tensor_t *) * timestep); /* split input tensor by time-step */ - split_input_tensor(self, trans_inputs[CONV2D_LSTM_IN_INPUT], split_outputs, (uint32_t)timestep); + status = split_input_tensor(self, trans_inputs[CONV2D_LSTM_IN_INPUT], split_outputs, (uint32_t)timestep); + CHECK_STATUS_FAIL_GOTO(status, final); cell_out0 = cell_out1 = cell_out2 = NULL; step_h_state = trans_inputs[CONV2D_LSTM_IN_H_STATE]; @@ -533,6 +582,7 @@ static vsi_bool op_setup /* reshape for split output */ tmp_tensor = reshape_split_out(self, split_outputs[i]); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); reshape_output = tmp_tensor->t; if((i == timestep - 1) && p->return_sequences == FALSE && p->data_format == CONV2D_LSTM_CHANNELS_FIRST) @@ -543,6 +593,7 @@ static vsi_bool op_setup { vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dtype, TRUE); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); cell_out0 = tmp_tensor->t; } @@ -556,16 +607,19 @@ static vsi_bool op_setup /* conv2d_lstm hstate output */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_H_STATE]->attr.dtype, TRUE); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); cell_out1 = tmp_tensor->t; /* conv2d_lstm cstate output */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_C_STATE]->attr.dtype, TRUE); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); cell_out2 = tmp_tensor->t; } /* create a conv2d_lstm_cell */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONV2D_LSTM_CELL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.conv2d_lstm_cell.filters = p->filters; curr->node->nn_param.conv2d_lstm_cell.activation = p->activation; curr->node->nn_param.conv2d_lstm_cell.recurrent_activation = p->recurrent_activation; @@ -600,6 +654,7 @@ static vsi_bool op_setup { /* store step's 
outputs */ tmp_tensor = reshape_cell_out(self, cell_out0); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); conv2dlstm_step_outputs[i] = tmp_tensor->t; } } @@ -610,6 +665,7 @@ static vsi_bool op_setup { vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dtype, TRUE); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); conv2dlstm_outputs[CONV2D_LSTM_OUT_OUTPUT] = tmp_tensor->t; } else @@ -618,6 +674,7 @@ static vsi_bool op_setup } /* concat all step's output0 data on dimension t --- cell out0 shape: [w,h,c,t,n] */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)timestep, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 3; for(i = 0; i < timestep; i++) { @@ -638,10 +695,11 @@ static vsi_bool op_setup trans_output_tensor(self, conv2dlstm_outputs, outputs); } + ret = TRUE; final: vsi_nn_safe_free(split_outputs); vsi_nn_safe_free(conv2dlstm_step_outputs) - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit @@ -660,6 +718,7 @@ static vsi_status op_init ) { vsi_status status = VSI_SUCCESS; + VSI_UNREFERENCED(self); return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c index 388de95c3..3a31d44db 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -99,8 +99,10 @@ static vsi_nn_internal_tensor_t * create_input_conv attr.vtl = TRUE; attr.is_const = FALSE; input_conv_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(input_conv_out, "Create internal tensor failed", final); input_conv = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(input_conv, "Create internal node failed", final); input_conv->node->nn_param.conv2d.group = 1; input_conv->node->nn_param.conv2d.ksize[0] = p->conv2d.ksize[0]; input_conv->node->nn_param.conv2d.ksize[1] = p->conv2d.ksize[1]; @@ -129,6 +131,7 @@ static vsi_nn_internal_tensor_t * create_input_conv // reshape whcn --> xn reshape_out = reshape_tensor_to_act(self, input_conv_out->t); +final: return reshape_out; } /* create_input_conv() */ @@ -176,8 +179,10 @@ static vsi_nn_internal_tensor_t * create_recurrent_conv attr.vtl = TRUE; attr.is_const = FALSE; recurrent_conv_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(recurrent_conv_out, "Create internal tensor failed", final); recurrent_conv = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(recurrent_conv, "Create internal node failed", final); recurrent_conv->node->nn_param.conv2d.pad_type = VSI_NN_PAD_SAME; recurrent_conv->node->nn_param.conv2d.group = 1; recurrent_conv->node->nn_param.conv2d.ksize[0] = p->conv2d.ksize[0]; @@ -203,6 +208,8 @@ static vsi_nn_internal_tensor_t * create_recurrent_conv // reshape whcn --> xn reshape_out = reshape_tensor_to_act(self, recurrent_conv_out->t); + +final: return reshape_out; } /* create_recurrent_conv() */ @@ -303,6 +310,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + 
VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -313,6 +322,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -325,6 +337,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -344,6 +358,7 @@ static vsi_bool op_setup vsi_nn_internal_tensor_t * reshape_h_out = NULL; vsi_nn_internal_tensor_t * reshape_c_out = NULL; vsi_nn_conv2d_lstm_cell_param * p = &self->nn_param.conv2d_lstm_cell; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp( self ); @@ -359,6 +374,7 @@ static vsi_bool op_setup inputs[CONV2D_LSTM_CELL_IN_KERNEL_I2I + i], inputs[CONV2D_LSTM_CELL_IN_BIAS_I + i] ); + CHECK_PTR_FAIL_GOTO(input_conv_outputs[i], "Create internal tensor failed", final); } /* create recurrent convolution */ @@ -369,10 +385,12 @@ static vsi_bool op_setup inputs[CONV2D_LSTM_CELL_IN_H_STATE], inputs[CONV2D_LSTM_CELL_IN_KERNEL_R2I + i] ); + CHECK_PTR_FAIL_GOTO(recurrent_conv_outputs[i], "Create internal tensor failed", final); } /* activations */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_ACTIVATION, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.lstmunit_activation.cell_clip = 0; curr->node->nn_param.lstmunit_activation.proj_clip = 0; curr->node->nn_param.lstmunit_activation.forget_bias = 0; @@ -384,6 +402,7 @@ static vsi_bool op_setup curr->node->nn_param.lstmunit_activation.recurrent_activation = p->recurrent_activation; reshape_cell_in = reshape_tensor_to_act(self, inputs[CONV2D_LSTM_CELL_IN_C_STATE]); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_cell_in, curr, "Create internal tensor failed", final); curr->inputs[LSTMUNIT_ACT_CSTATE_IN] = reshape_cell_in->t; for(i = 0; i < CONV2D_LSTM_CELL_GATE_NUM; i++) { @@ -392,15 +411,20 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + i] = recurrent_conv_outputs[i]->t; } reshape_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_out, curr, "Create internal tensor failed", final); reshape_h_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_H_STATE]); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_h_out, curr, "Create internal tensor failed", final); reshape_c_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_C_STATE]); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_c_out, curr, "Create internal tensor failed", final); curr->outputs[LSTMUNIT_ACT_OUTPUT] = reshape_out->t; curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = reshape_c_out->t; curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = reshape_h_out->t; vsi_nn_internal_setup_node(self, curr); - return TRUE; + ret = TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_deinit @@ -419,7 +443,7 @@ static vsi_status op_init ) { vsi_status status = VSI_SUCCESS; - + VSI_UNREFERENCED(self); return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c index 1825e3b98..98217903a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c @@ -353,7 +353,7 @@ static vsi_status op_init //self->nn_param.conv3d.local = \ // 
(conv3d_local_data_t*)malloc(sizeof(conv3d_local_data_t)); */ - + VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c index 6aaa61d5c..ed26a68f0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c @@ -36,6 +36,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -47,6 +48,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -72,6 +75,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } @@ -86,6 +91,7 @@ static vsi_bool op_setup int32_t i = 0; uint32_t j = 0; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp( self ); p = (vsi_nn_crop_param *)&(self->nn_param.crop); @@ -96,46 +102,43 @@ static vsi_bool op_setup return FALSE; } - if ( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num ) - { - goto final; - } - - if (p->dims + p->axis == inputs[0]->attr.dim_num) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - for (i = 0; i < p->axis; i++) - { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; - } - for (i = p->axis; i < (int32_t)inputs[0]->attr.dim_num; i++) + if (p->dims + p->axis == inputs[0]->attr.dim_num) { - outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; - } - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - } - else - { - if (p->dims == 1) - { - for (i = 0; i <= p->axis; i++) + for (i = 0; i < p->axis; i++) { - outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; - p->offset[i] = p->offset[0]; + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } - for (i = p->axis + 1; i < (int32_t)inputs[0]->attr.dim_num; i++) + for (i = p->axis; i < (int32_t)inputs[0]->attr.dim_num; i++) { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; } outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; } else { - VSILOGE("Invalid parameter: offset dims!\n"); - return FALSE; + if (p->dims == 1) + { + for (i = 0; i <= p->axis; i++) + { + outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; + p->offset[i] = p->offset[0]; + } + for (i = p->axis + 1; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + else + { + VSILOGE("Invalid parameter: offset dims!\n"); + return FALSE; + } } } -final: for (j = 0; j < self->nn_param.crop.dims; j++) { p->lcl_data->begin_dims[j] = (int32_t)self->nn_param.crop.offset[j]; @@ -151,6 +154,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims; curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims; @@ -163,9 +167,10 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.new_axis_mask = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( 
self, curr ); - return TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_init diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c index d976b13b8..43f8a8f43 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c @@ -136,6 +136,8 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. */ uint32_t i = 0; + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index d1a778528..6d109f00b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -70,6 +70,8 @@ static vsi_bool _is_same_quant { vsi_nn_dtype_t *dtype,*_dtype; + VSI_UNREFERENCED(self); + dtype = &inputs[0]->attr.dtype; _dtype = &outputs[0]->attr.dtype; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c index 7048f5173..ba3a3c511 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c @@ -37,6 +37,7 @@ #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" typedef struct _deconv3d_local_data_t { int32_t placeholder; @@ -135,7 +136,9 @@ void _rotate_weight_data( int32_t item_size = vsi_nn_TypeGetBytes(weights->attr.dtype.vx_type); weight_data = vsi_nn_ConvertTensorToData(graph, weights); + CHECK_PTR_FAIL_GOTO( weight_data, "Create weight_data fail.", final ); buffer = (uint8_t*)malloc(item_size * depth_size * weight_ic * weight_oc); + CHECK_PTR_FAIL_GOTO( buffer, "Create buffer fail.", final ); memset(buffer, 0x00, item_size * depth_size * weight_ic * weight_oc); //memcpy(buffer, weight_data, item_size * slice_size * weight_ic * weight_oc); for(oc = 0; oc < weight_oc; oc++) @@ -164,6 +167,8 @@ void _rotate_weight_data( } vsi_nn_CopyDataToTensor( graph, weights, buffer ); + +final: vsi_nn_Free( buffer ); vsi_nn_safe_free( weight_data ); } @@ -263,7 +268,7 @@ static vsi_status op_init //self->nn_param.deconv3d.local = \ // (deconv3d_local_data_t*)malloc(sizeof(deconv3d_local_data_t)); */ - + VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index 09c59d81d..be301ea20 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -36,6 +36,183 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" +#define LOCAL() (local) + +typedef struct _vsi_nn_grouped_deconv2d_param_local_data { + vsi_nn_tensor_t ** input_tensor_group; + vsi_nn_tensor_t ** weight_tensor_group; + vsi_nn_tensor_t ** bias_tensor_group; + vsi_nn_tensor_t ** output_tensor_group; +} vsi_nn_grouped_deconv2d_param_local_data; + +static vsi_status op_grouped_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * inputs[3], + vsi_nn_tensor_t ** outputs, + vx_nn_deconvolution_params_ext2_t param + ) +{ + vsi_bool res; + uint32_t i; + vsi_status status = VSI_FAILURE; + vsi_nn_deconv_param *nn_param = &self->nn_param.deconv; + uint32_t group = nn_param->group; + vsi_nn_grouped_deconv2d_param_local_data *local = + 
(vsi_nn_grouped_deconv2d_param_local_data*)malloc(sizeof(vsi_nn_grouped_deconv2d_param_local_data)); + if (NULL == local) + { + VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + memset(local, 0, sizeof(vsi_nn_grouped_deconv2d_param_local_data)); + /* TODO */ + LOCAL()->input_tensor_group = (vsi_nn_tensor_t **)malloc( + group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->input_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + memset(LOCAL()->input_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, inputs[0], 2, + LOCAL()->input_tensor_group, group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + + LOCAL()->weight_tensor_group = (vsi_nn_tensor_t **)malloc( + group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->weight_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->weight_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 2, + LOCAL()->weight_tensor_group, group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + + LOCAL()->bias_tensor_group = (vsi_nn_tensor_t **)malloc( + group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->bias_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + memset(LOCAL()->bias_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *)); + if (inputs[2] != NULL) + { + res = vsi_nn_CreateTensorGroup(self->graph, inputs[2], 0, + LOCAL()->bias_tensor_group, group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + } + + LOCAL()->output_tensor_group = (vsi_nn_tensor_t **)malloc( + group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->output_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + memset(LOCAL()->output_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, outputs[0], 2, + LOCAL()->output_tensor_group, group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + + param.ext.channel_group = 1; + for (i = 0; i < group; i++) + { + vx_tensor bias; + + if ( inputs[2] == NULL ) + { + bias = NULL; + } + else + { + bias = LOCAL()->bias_tensor_group[i]->t; + } + + self->n = vxDeconvolutionLayer( + self->graph->g, + LOCAL()->input_tensor_group[i]->t, + LOCAL()->weight_tensor_group[i]->t, + bias, + (vx_nn_deconvolution_params_t *)¶m, + sizeof( vx_nn_deconvolution_params_ext2_t ), + LOCAL()->output_tensor_group[i]->t + ); + if ( NULL == self->n ) + { + VSILOGE("Add vxConvolutionLayer fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + status = VSI_FAILURE; + goto final; + } + else + { + // no need to maintain self->n + vxReleaseNode( &self->n ); + status = VSI_SUCCESS; + self->n = NULL; + } + } + +final: + if (LOCAL()) + { + if (LOCAL()->input_tensor_group) + { + for (i = 0; i < group; i++) + { + vsi_safe_release_tensor((LOCAL()->input_tensor_group[i])); + } + vsi_nn_safe_free(LOCAL()->input_tensor_group); + } + if 
(LOCAL()->weight_tensor_group) + { + for (i = 0; i < group; i++) + { + vsi_safe_release_tensor((LOCAL()->weight_tensor_group[i])); + } + vsi_nn_safe_free(LOCAL()->weight_tensor_group); + } + if (LOCAL()->bias_tensor_group != NULL) + { + for (i = 0; i < group; i++) + { + vsi_safe_release_tensor((LOCAL()->bias_tensor_group[i])); + } + vsi_nn_safe_free(LOCAL()->bias_tensor_group); + } + if (LOCAL()->output_tensor_group != NULL) + { + for (i = 0; i < group; i++) + { + vsi_safe_release_tensor((LOCAL()->output_tensor_group[i])); + } + vsi_nn_safe_free(LOCAL()->output_tensor_group); + } + + vsi_nn_safe_free(LOCAL()); + } + return status; +} /* op_compute() */ + #define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ (( in - 1 ) * stride + ksize - pad_1 - pad_2 + output_padding) static vsi_status op_compute @@ -161,18 +338,31 @@ static vsi_status op_compute //param.border_mode; //param.border_const; - self->n = vxDeconvolutionLayer( - self->graph->g, - inputs[0]->t, - weight_tensor->t, - (NULL == inputs[2]) ? NULL : inputs[2]->t, - (vx_nn_deconvolution_params_t *)¶m, - sizeof( vx_nn_deconvolution_params_ext2_t ), - outputs[0]->t - ); - if( NULL != self->n ) + if (self->nn_param.deconv.group > 1 && + self->nn_param.deconv.group < inputs[0]->attr.size[2]) { - status = VSI_SUCCESS; + vsi_nn_tensor_t *inputs_tensors[3] = {NULL}; + + inputs_tensors[0] = inputs[0]; + inputs_tensors[1] = weight_tensor; + inputs_tensors[2] = inputs[2]; + status = op_grouped_compute(self, inputs_tensors, outputs, param ); + } + else + { + self->n = vxDeconvolutionLayer( + self->graph->g, + inputs[0]->t, + weight_tensor->t, + (NULL == inputs[2]) ? NULL : inputs[2]->t, + (vx_nn_deconvolution_params_t *)¶m, + sizeof( vx_nn_deconvolution_params_ext2_t ), + outputs[0]->t + ); + if ( NULL != self->n ) + { + status = VSI_SUCCESS; + } } final: diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c index 4128480bf..1180dbee9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c @@ -64,7 +64,9 @@ static vsi_status op_compute weight_attr.size[2] = weight_attr.size[1]; weight_attr.size[1] = 1; weight_attr.dim_num = 4; - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { weight_tensor = vsi_nn_reshape_tensor( self->graph, inputs[1], weight_attr.size, 4 ); CHECK_PTR_FAIL_GOTO( weight_tensor, "create tensor fail.", final ); @@ -118,6 +120,7 @@ static vsi_status op_compute attr.size[2] = weight_tensor->attr.size[3]; attr.size[3] = weight_tensor->attr.size[2]; permute_tensor = vsi_nn_CreateTensor(self->graph, &attr); + CHECK_PTR_FAIL_GOTO( permute_tensor, "Create tensor fail.", final ); self->n = vxTensorPermuteNode( self->graph->g, weight_tensor->t, permute_tensor->t, perm_array, 4); if ( NULL == self->n ) @@ -135,6 +138,7 @@ static vsi_status op_compute memset(&attr_reverse, 0, sizeof(vsi_nn_tensor_attr_t)); memcpy(&attr_reverse, &tmp_in_tensor->attr, sizeof(vsi_nn_tensor_attr_t) ); reverse_tensor = vsi_nn_CreateTensor(self->graph, &attr_reverse); + CHECK_PTR_FAIL_GOTO( reverse_tensor, "Create tensor fail.", final ); para.axis = axis_reverse; para.numberOfAxis = 2; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c index 551aa59ea..cee8b8c7c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c @@ -36,6 +36,7 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_math.h" +#include "vsi_nn_error.h" static vsi_status vsi_nn_depth2space_compute ( @@ -46,29 +47,38 @@ static vsi_status vsi_nn_depth2space_compute { vsi_status status; vsi_nn_tensor_t *block_size_tensor = NULL; - vx_nn_reorg_params_t param; +#if (VX_DEPTH2SPACE_CRD_MODE_SUPPORT) + vx_nn_reorg_params_ext3_t paramExt; + vx_nn_reorg_params_t *param = (vx_nn_reorg_params_t *)¶mExt.base.base; + size_t size = sizeof(vx_nn_reorg_params_ext3_t); + paramExt.mode = self->nn_param.depth2space.mode; +#else + vx_nn_reorg_params_t base; + vx_nn_reorg_params_t *param = &base; + size_t size = sizeof(vx_nn_reorg_params_t); + memset(param, 0, sizeof(vx_nn_reorg_params_t)); +#endif status = VSI_FAILURE; - memset(¶m, 0, sizeof(vx_nn_reorg_params_t)); block_size_tensor = vsi_nn_VariableToTensor(self, (uint8_t *)&self->nn_param.depth2space.block_size, VSI_NN_TYPE_INT32); - if( NULL == block_size_tensor ) + if ( NULL == block_size_tensor ) { VSILOGE("Create block_size_tensor fail.(depth2space)"); return VSI_FAILURE; } self->nn_param.depth2space.local.block_size_tensor = block_size_tensor; - param.block_size = REQUIRED_IO(block_size_tensor); - param.type = VX_REORG_DEPTH_TO_SPACE; + param->block_size = REQUIRED_IO(block_size_tensor); + param->type = VX_REORG_DEPTH_TO_SPACE; self->n = vxReorgLayer2( self->graph->g, inputs[0]->t, - ¶m, - sizeof(vx_nn_reorg_params_t), + param, + size, outputs[0]->t); - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } @@ -84,6 +94,13 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; +#if (VX_DEPTH2SPACE_CRD_MODE_SUPPORT) + if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_DCR || + self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) + { + status = vsi_nn_depth2space_compute(self, inputs, outputs); + } +#else if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_DCR) { status = vsi_nn_depth2space_compute(self, inputs, outputs); @@ -92,6 +109,7 @@ static vsi_status op_compute { status = vsi_nn_internal_compute_node( self ); } +#endif else { VSILOGE("Unknown depth2space mode.(depth2space)"); @@ -101,24 +119,6 @@ static vsi_status op_compute return status; } /* op_compute() */ -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) - { - return vsi_nn_internal_optimize_node(self, direction ); - } - else - { - return VSI_SUCCESS; - } -} /* op_optimize() */ - static vsi_bool op_check ( vsi_nn_node_t * self, @@ -139,6 +139,7 @@ static vsi_bool op_check return ret; } /* op_check() */ +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) static void op_set_depth2space_param_value(vsi_nn_nn_param_t *nn_param, vsi_nn_op_t type_name, vsi_nn_depth2space_mode_e mode, @@ -160,20 +161,23 @@ static vsi_bool op_set_depth2space_internal vsi_nn_op_t type_name ) { - vsi_bool retn = TRUE; + vsi_bool retn = FALSE; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_depth2space_param_value(&(curr->node->nn_param), type_name, 
self->nn_param.depth2space.mode, self->nn_param.depth2space.block_size); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; retn = vsi_nn_internal_setup_node(self, curr); +final: return retn; } +#endif static vsi_status op_init ( @@ -199,7 +203,7 @@ static vsi_bool op_setup { vsi_bool ret = TRUE; uint32_t size = node->nn_param.depth2space.block_size; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[0]->attr.size[0] = inputs[0]->attr.size[0] * size; @@ -208,10 +212,12 @@ static vsi_bool op_setup outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) if (node->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) { ret = op_set_depth2space_internal(node, inputs, outputs, VSI_NN_OP_DEPTH2SPACE_INTERNAL); } +#endif return ret; } /* op_setup() */ @@ -225,11 +231,13 @@ static vsi_status op_deinit vsi_nn_ReleaseTensor(&(self->nn_param.depth2space.local.block_size_tensor)); } +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) { vsi_nn_internal_deinit_node_wksp(self); } else +#endif { vsi_nn_op_common_deinit(self); } @@ -249,7 +257,7 @@ DEF_OP_REG /* deinit */ op_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ op_optimize, + /* optimize */ NULL, /* input_num */ 1, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c index fa5336755..1b417b168 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" @@ -48,6 +48,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -82,19 +84,21 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_init_node_wksp(self); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LINEAR, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.linear.a = self->nn_param.dropout.ratio; curr->node->nn_param.linear.b = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); +final: return ret; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index 68c6993a0..280e5eee2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -120,6 +120,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + out_rank = inputs[0]->attr.dim_num; for (i = 0; i < out_rank; i++) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c index bcdf270f5..c1f2fc56e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c @@ -122,6 +122,8 @@ static vsi_bool 
op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[1]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c index 68c9fc257..d586d3141 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_dtype_util_prv.h" #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -43,6 +44,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } @@ -85,6 +88,7 @@ static vsi_bool op_check IO_TYPE(D_BF16, D_F32) IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) + IO_TYPE(D_BOOL8, D_BOOL8) END_IO_TYPE_DECL(EXPAND_BROADCAST) if (!VALIDATE_OP_IO_TYPES(EXPAND_BROADCAST, self, inputs, self->input.num, outputs, self->output.num)) { @@ -109,9 +113,11 @@ static vsi_bool op_setup vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* input_0 = NULL; vsi_nn_internal_tensor_t *input_1 = NULL; + vsi_nn_internal_tensor_t* input_2 = NULL; vsi_nn_internal_node_t* mul_node = NULL; vsi_nn_tensor_t* mul_input = NULL; int32_t use_virtual_tensor = 1; + vsi_bool is_same_shape = TRUE; vsi_nn_expand_broadcast_param *p = &self->nn_param.expand_broadcast; vsi_nn_internal_init_node_wksp(self); @@ -120,33 +126,55 @@ static vsi_bool op_setup attr.dim_num = p->dim_num; if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16)) { + inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16)) + { attr.dtype.vx_type = VSI_NN_TYPE_INT32; } - else { + else if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BOOL8) + { + attr.dtype.vx_type = VSI_NN_TYPE_BOOL8; + } + else + { attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; } attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; attr.is_const = TRUE; - for(i = 0; i < p->dim_num; i++) + for (i = 0; i < p->dim_num; i++) { + vsi_size_t sz = i < inputs[0]->attr.dim_num ? 
+ inputs[0]->attr.size[i] : 1; + attr.size[i] = p->shape[i]; + if (( p->shape[i] != sz && p->shape[i] != 1) + && is_same_shape) + { + is_same_shape = FALSE; + } } input_1 = vsi_nn_internal_new_tensor( self, &attr, 1.0f ); + CHECK_PTR_FAIL_GOTO(input_1, "Create tensor failed", final); - if (p->dimensions_num > 0) { + if (p->dimensions_num > 0) + { vsi_nn_internal_node_t* reshape_node = NULL; vsi_size_t* reshape_input_size = NULL; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); input_0 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(input_0, "Create internal tensor failed", final); reshape_node = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(reshape_node, "Create internal node failed", final); reshape_input_size = (vsi_size_t*)vsi_nn_internal_new_node_param(reshape_node, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - for(i = 0; i < p->dim_num; i++) { + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_input_size, reshape_node, + "Create internal buffer failed", final); + for (i = 0; i < p->dim_num; i++) + { reshape_input_size[i] = 1; } - for (i = 0; i < p->dimensions_num; i++) { + for (i = 0; i < p->dimensions_num; i++) + { reshape_input_size[p->dimensions[i]] = p->shape[p->dimensions[i]]; } @@ -156,20 +184,74 @@ static vsi_bool op_setup reshape_node->outputs[0] = input_0->t; vsi_nn_internal_setup_node( self, reshape_node ); mul_input = input_0->t; - } else { + } + else + { mul_input = inputs[0]; } - mul_node = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0 ); - mul_node->inputs[0] = mul_input; - mul_node->inputs[1] = input_1->t; - mul_node->node->nn_param.multiply.scale = 1.0f; - mul_node->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; - mul_node->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; - mul_node->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, mul_node); + if (is_same_shape) + { + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_t* temp_tensor = NULL; + + if (input_1->t->attr.dim_num != mul_input->attr.dim_num) + { + vsi_size_t* shape_sizes = NULL; + uint32_t rank0 = input_1->t->attr.dim_num; + uint32_t rank1 = mul_input->attr.dim_num; + uint32_t rank = vsi_nn_max( rank0, rank1 ); + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &mul_input->attr.dtype, use_virtual_tensor); + input_2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(input_2, "Create internal tensor failed", final); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + shape_sizes = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(shape_sizes, curr, + "Create internal buffer failed", final); + for (i = 0; i < rank; i++) + { + shape_sizes[i] = i < rank1 ? 
mul_input->attr.size[i] : 1; + } + curr->node->nn_param.reshape2.size = shape_sizes; + curr->node->nn_param.reshape2.dim_num = rank; + curr->inputs[0] = mul_input; + curr->outputs[0] = input_2->t; + vsi_nn_internal_setup_node( self, curr ); + + temp_tensor = input_2->t; + } + else + { + temp_tensor = mul_input; + } + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = temp_tensor; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + else + { + mul_node = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0 ); + CHECK_PTR_FAIL_GOTO(mul_node, "Create internal node failed", final); + mul_node->inputs[0] = mul_input; + mul_node->inputs[1] = input_1->t; + mul_node->node->nn_param.multiply.scale = 1.0f; + mul_node->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + mul_node->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; + mul_node->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, mul_node); + } return TRUE; +final: + return FALSE; } static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c index 23be09a06..958b06b10 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c @@ -123,6 +123,9 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /* TODO: Add code to comput outputs' shape. */ if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c index 92b13378c..4a803ad6e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c @@ -171,6 +171,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + in1_rank = inputs[0]->attr.dim_num; in2_rank = inputs[1]->attr.dim_num; out_rank = vsi_nn_max( in1_rank, in2_rank ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index 1f3f281c2..489d3cb96 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -34,6 +34,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util_prv.h" #define _ARG_NUM (1) #define _INPUT_NUM (2) @@ -80,7 +81,31 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "axis", (int32_t)axis ); vsi_nn_kernel_param_add_int32( param, "indices_num", (int32_t)indices_num ); vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims ); - n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param ); + + if (vsi_nn_is_same_data_type(inputs[0], outputs[0]) == FALSE || + vsi_nn_is_same_quant_type(inputs[0], outputs[0])) + { + n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param ); + } + else + { + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* temp_tensors = NULL; + + VSILOGW("gather is no_range_change operation! 
\ + Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); + + memcpy( &attr, &outputs[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + + vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, &temp_tensors, 1, param ); + n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t); + + vsi_safe_release_tensor(temp_tensors); + } if ( n != NULL ) { self->n = (vx_node)n; @@ -187,7 +212,7 @@ static vsi_bool op_setup outputs[0]->attr.size[j] = inputs[0]->attr.size[i]; j++; } - for (i = 0; i < inputs[1]->attr.dim_num; i++) + for (i = 0; i < q_rank; i++) { outputs[0]->attr.size[j] = inputs[1]->attr.size[i]; j++; @@ -198,8 +223,8 @@ static vsi_bool op_setup j++; } } - } + return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c index baf55b1dc..b77a39db3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c @@ -58,6 +58,7 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_nn_tensor_t* temp_tensors = NULL; vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; uint32_t rank_in = 0; int32_t axis = 0; @@ -66,6 +67,8 @@ static vsi_status op_compute vsi_bool ret = FALSE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_gather_elements_param * p = NULL; + vsi_size_t depth0 = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + vsi_size_t depth1 = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; if ( NULL == self ) { @@ -86,7 +89,31 @@ static vsi_status op_compute // Add params param = vsi_nn_kernel_param_create(); - if ( ret && new_axis0 == new_axis1 ) + if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + vsi_nn_tensor_attr_t attr; + + VSILOGW("gather_element is no_range_change operation! 
\ + Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); + + memcpy( &attr, &outputs[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + } + else + { + temp_tensors = outputs[0]; + } + + if ( ret && new_axis0 == new_axis1 && + inputs[0]->attr.size[0] < GPU_TENSOR_MAX_WIDTH && + inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH && + inputs[1]->attr.size[0] < GPU_TENSOR_MAX_WIDTH && + inputs[1]->attr.size[1] < GPU_TENSOR_MAX_WIDTH && + depth0 < GPU_TENSOR_MAX_WIDTH && + depth1 < GPU_TENSOR_MAX_WIDTH) { vsi_nn_kernel_param_add_int32( param, "axis", new_axis0 ); @@ -95,7 +122,7 @@ static vsi_status op_compute reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, inputs[1], shapes[1], rank_in ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[0], shapes[1], rank_in ); + temp_tensors, shapes[1], rank_in ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "gather_elements", @@ -112,7 +139,13 @@ static vsi_status op_compute self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "gather_elements", inputs, 2, - outputs, 1, param ); + &temp_tensors, 1, param ); + } + + if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + self->n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t); + vsi_safe_release_tensor(temp_tensors); } vsi_nn_kernel_param_release( ¶m ); @@ -164,6 +197,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { uint32_t i = 0; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c index 4246ee6aa..26d47dd7e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c @@ -30,10 +30,11 @@ #include "vsi_nn_prv.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" -#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util_prv.h" #define _ARG_NUM (2) #define _INPUT_NUM (2) @@ -50,19 +51,20 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; vsi_size_t i = 0; - int32_t batch_dims = self->nn_param.gather_nd.batch_dims == 0 ? 0 : 1; + int32_t batch_dims = self->nn_param.gather_nd.batch_dims; vsi_size_t block_size = 1, coord_dim = 1; vsi_size_t *input_size = inputs[0]->attr.size; vsi_size_t dims_num = inputs[0]->attr.dim_num; + batch_dims = batch_dims < 0 ? 
0 : batch_dims; + if (inputs[1]->attr.dim_num > 1) { coord_dim = inputs[1]->attr.size[0]; } if (coord_dim > 4 || (coord_dim > 3 && input_size[dims_num - 1] != 1) - || (batch_dims && coord_dim >= 3)) + || (batch_dims && coord_dim >= 3) || (batch_dims >= (int32_t)vsi_nn_min(dims_num, inputs[1]->attr.dim_num))) { - CHECK_STATUS(status); return status; } @@ -76,7 +78,32 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "block_size", (int32_t)block_size ); vsi_nn_kernel_param_add_int32( param, "coord_dim", (int32_t)coord_dim ); vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims ); - n = vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, outputs, 1, param ); + + if (vsi_nn_is_same_data_type(inputs[0], outputs[0]) == FALSE || + vsi_nn_is_same_quant_type(inputs[0], outputs[0])) + { + n = vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, outputs, 1, param ); + } + else + { + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* temp_tensors = NULL; + + VSILOGW("gather_nd is no_range_change operation! \ + Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); + + memcpy( &attr, &outputs[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + + vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, &temp_tensors, 1, param ); + n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t); + + vsi_safe_release_tensor(temp_tensors); + } + if ( n != NULL ) { self->n = (vx_node)n; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c index 77feaafe3..09e96a1f0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c @@ -78,6 +78,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c index de9059ecf..cc6463f63 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c @@ -155,6 +155,8 @@ static vsi_bool op_setup { vsi_size_t i = 0; + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c index d8c99aa89..86f15f81d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c @@ -51,7 +51,38 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; - status = vsi_nn_internal_compute_node(self); + vsi_nn_kernel_param_t* param = NULL; + int32_t align_corners = self->nn_param.gridsample.align_corners; + vsi_nn_kernel_node_t n; + char kernel_name[128]; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners); + + switch (self->nn_param.gridsample.mode) { + case VSI_NN_INTERPOLATION_BILINEAR: + snprintf(kernel_name, sizeof(kernel_name), "bilinear_grid_sample"); + break; + case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR: + snprintf(kernel_name, sizeof(kernel_name), "nearest_grid_sample"); + 
break; + default: + break; + } + + n = (vx_node)vsi_nn_kernel_selector( + self->graph, kernel_name, inputs, 2, outputs, 1, param); + + if (n == NULL) { + vsi_nn_kernel_param_release(¶m); + status = VSI_FAILURE; + return status; + } + self->n = (vx_node)n; + vsi_nn_kernel_param_release(¶m); + if (self->n) { + status = VSI_SUCCESS; + } return status; } /* op_compute() */ @@ -63,8 +94,12 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - if (VSI_NN_INTERPOLATION_BILINEAR != self->nn_param.gridsample.mode) { - VSILOGE("Only support bilinear_grid_sample now!"); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + if ((VSI_NN_INTERPOLATION_BILINEAR != self->nn_param.gridsample.mode) && + (VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR != + self->nn_param.gridsample.mode)) { + VSILOGE("Only support bilinear or nearest grid sample mode now!"); return FALSE; } @@ -85,8 +120,6 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_nn_internal_node_t* curr = NULL; - if (NULL == self) { return FALSE; } @@ -101,22 +134,6 @@ static vsi_bool op_setup } } - if (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.gridsample.mode) { - vsi_nn_internal_init_node_wksp(self); - curr = vsi_nn_internal_new_node( - self, VSI_NN_OP_BILINEAR_GRID_SAMPLE, 2, 1); - curr->node->nn_param.bilinear_grid_sample.align_corners = - self->nn_param.gridsample.align_corners; - curr->node->nn_param.bilinear_grid_sample.padding_mode = - self->nn_param.gridsample.padding_mode; - curr->node->nn_param.bilinear_grid_sample.const_val = - self->nn_param.gridsample.const_val; - curr->inputs[0] = inputs[0]; - curr->inputs[1] = inputs[1]; - curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); - } - return TRUE; } /* op_setup() */ @@ -129,7 +146,7 @@ static vsi_status op_init //self->nn_param.grid_sample.local = \ // (grid_sample_local_data_t*)malloc(sizeof(grid_sample_local_data_t)); */ - + VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ @@ -140,7 +157,7 @@ static vsi_status op_deinit { vsi_status status = VSI_SUCCESS; - status = vsi_nn_internal_deinit_node_wksp(self); + status = vsi_nn_op_common_deinit(self); return status; } /* op_deinit() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c index 5cfeddf58..a40497949 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c @@ -77,6 +77,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -103,6 +105,7 @@ static vsi_bool op_setup { vsi_nn_internal_node_t* curr = NULL; vsi_nn_grouped_conv1d_param* p = &self->nn_param.grouped_conv1d; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp(self); @@ -125,7 +128,9 @@ static vsi_bool op_setup p->local->input = _expand_tensor_dim( self->graph, inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { p->local->weight = _expand_tensor_dim( self->graph, inputs[1], inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); @@ -159,6 +164,7 @@ static vsi_bool op_setup curr = vsi_nn_internal_new_node(self, VSI_NN_OP_GROUPED_CONV2D, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create 
internal node failed", final); curr->inputs[0] = p->local->input; curr->inputs[1] = p->local->weight; curr->inputs[2] = inputs[2]; @@ -179,10 +185,10 @@ static vsi_bool op_setup curr->node->nn_param.grouped_conv2d.pad_type = p->pad_type; curr->node->nn_param.grouped_conv2d.pad_mode = p->pad_mode; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); final: - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_init diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c index 00545d3c9..629486c69 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -77,6 +77,7 @@ static vsi_bool _is_3d_group_norm vsi_nn_tensor_t ** inputs ) { + VSI_UNREFERENCED(self); if ( 3 == inputs[0]->attr.dim_num ) { return TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c index ad4c2a741..24acf6f94 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c @@ -39,13 +39,14 @@ #include "utils/vsi_nn_tensor_op.h" #include "utils/vsi_nn_util.h" #include "ops/vsi_nn_op_gru.h" +#include "vsi_nn_error.h" typedef struct _vsi_nn_gru_local { void * placeholder; } vsi_nn_gru_local; -static void create_state_tensor +static vsi_status create_state_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -54,6 +55,7 @@ static void create_state_tensor vsi_size_t hidden_size ) { + vsi_status status = VSI_FAILURE; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t * tensor = NULL; @@ -67,6 +69,7 @@ static void create_state_tensor attr.vtl = TRUE; attr.is_const = FALSE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor, "Create internal tensor failed", final); outputs[GRU_OUT_H_STATE] = tensor->t; } @@ -80,9 +83,13 @@ static void create_state_tensor attr.is_const = TRUE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor, "Create internal tensor failed", final); inputs[GRU_IN_H_STATE] = tensor->t; } + status = VSI_SUCCESS; +final: + return status; } /* create_state_tensor() */ static vsi_bool setup_op_shapes @@ -92,8 +99,10 @@ static vsi_bool setup_op_shapes vsi_nn_tensor_t ** outputs ) { + vsi_status status = VSI_FAILURE; vsi_nn_gru_param * p = &self->nn_param.gru; vsi_size_t batch_size = 0, hidden_size = 0, timesetp = 0; + vsi_bool ret = FALSE; hidden_size = p->num_units; if(p->time_major) @@ -137,7 +146,8 @@ static vsi_bool setup_op_shapes } /* create hstate input/output if app doesn't provide them */ - create_state_tensor(self, inputs, outputs, batch_size, hidden_size); + status = create_state_tensor(self, inputs, outputs, batch_size, hidden_size); + CHECK_STATUS_FAIL_GOTO(status, final); /* hstate output */ if(VSI_NN_DIM_AUTO == outputs[GRU_OUT_H_STATE]->attr.dim_num) @@ -147,7 +157,9 @@ static vsi_bool setup_op_shapes outputs[GRU_OUT_H_STATE]->attr.size[1] = batch_size; } - return TRUE; + ret = TRUE; +final: + return ret; } /* setup_op_shapes() */ static vsi_status op_compute @@ -157,6 +169,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } @@ -167,6 +181,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } @@ -187,6 +204,8 @@ 
static vsi_bool op_setup vsi_nn_tensor_t ** gru_step_outputs = NULL; vsi_nn_internal_tensor_t * tmp_tensor = NULL; vsi_nn_tensor_attr_t attr; + vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(attr)); vsi_nn_internal_init_node_wksp( self ); @@ -211,15 +230,19 @@ static vsi_bool op_setup /* transpose to time_major */ tmp_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); input_tensor = tmp_tensor->t; } split_outputs = (vsi_nn_tensor_t **)malloc(timestep * sizeof(vsi_nn_tensor_t *)); + CHECK_PTR_FAIL_GOTO( split_outputs, "Create buffer fail.", final ); memset(split_outputs, 0, timestep * sizeof(vsi_nn_tensor_t *)); gru_step_outputs = (vsi_nn_tensor_t **)malloc(timestep * sizeof(vsi_nn_tensor_t *)); + CHECK_PTR_FAIL_GOTO( gru_step_outputs, "Create buffer fail.", final ); memset(gru_step_outputs, 0, timestep * sizeof(vsi_nn_tensor_t *)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, split_outputs, (uint32_t)timestep, use_virtual_tensor); + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_outputs, (uint32_t)timestep, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); //vsi_nn_rnn_data_check_aligned(self, split_outputs, timestep, use_virtual_tensor); ?? @@ -233,6 +256,7 @@ static vsi_bool op_setup /* reshape split_outputs to cell_input */ tmp_tensor = vsi_nn_rnn_reshape_split_output( self, split_outputs[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); reshape_output = tmp_tensor->t; /* grucell output */ @@ -245,6 +269,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUT_OUTPUT]->attr.dtype, use_virtual_tensor); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); cell_out0 = tmp_tensor->t; } @@ -254,6 +279,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); cell_out1 = tmp_tensor->t; } else @@ -263,6 +289,7 @@ static vsi_bool op_setup /* create a grucell */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.grucell.num_units = p->num_units; curr->node->nn_param.grucell.activation = p->activation; curr->node->nn_param.grucell.recurrent_activation = p->recurrent_activation; @@ -292,6 +319,7 @@ static vsi_bool op_setup /* reshape every step output to 3-dims for GRU_OUTPUT */ tmp_tensor = vsi_nn_rnn_reshape_cell_output(self, cell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); gru_step_outputs[i] = tmp_tensor->t; } } /* for(i = 0; i < timestep; i++) end */ @@ -305,11 +333,13 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); output_tensor = tmp_tensor->t; } /* concat all grucell output0, the reshaped grucell output shape: [hidden_size, batch, 1] */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, timestep, 1 ); + 
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; /* concat the cell_outs in timestep */ for( i = 0; i < timestep; i++ ) { @@ -326,10 +356,12 @@ static vsi_bool op_setup } } + ret = TRUE; +final: vsi_nn_safe_free( split_outputs ); vsi_nn_safe_free( gru_step_outputs ); - return TRUE; + return ret; } static vsi_status op_deinit @@ -350,6 +382,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c index 5ac947b9f..9d7e34897 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c @@ -93,6 +93,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[GRU_INPUT_H_STATE] = output_tensor->t; } @@ -103,6 +104,7 @@ static vsi_bool setup_op_shapes memcpy( &attr.dtype, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); attr.vtl = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[GRU_OUTPUT_H_STATE] = output_tensor->t; } @@ -132,6 +134,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -141,6 +145,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -151,6 +157,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -163,6 +172,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -187,6 +198,7 @@ static vsi_bool op_setup_default vsi_size_t time_step = 0; vsi_size_t i = 0; vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -211,6 +223,7 @@ static vsi_bool op_setup_default /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); input_tensor = output_tensor->t; } @@ -222,9 +235,12 @@ static vsi_bool op_setup_default CHECK_PTR_FAIL_GOTO( grucell_reshape_output_tensors, "Create buffer fail.", final ); memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, + (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); last_step_h_state = inputs[GRU_INPUT_H_STATE]; for( i = 0; i < time_step; i++ ) @@ -236,6 +252,7 @@ static vsi_bool op_setup_default /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); reshape_output = output_tensor->t; /* grucell output */ @@ -248,6 +265,7 @@ static vsi_bool op_setup_default vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); grucell_out0 = output_tensor->t; } @@ -257,6 +275,7 @@ static vsi_bool op_setup_default vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); grucell_out1 = output_tensor->t; } else @@ -265,13 +284,14 @@ static vsi_bool op_setup_default } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.grucell_ovxlib.num_units = curr_param->num_units; curr->node->nn_param.grucell_ovxlib.activation = curr_param->activation; curr->node->nn_param.grucell_ovxlib.recurrent_activation = curr_param->recurrent_activation; curr->node->nn_param.grucell_ovxlib.linear_before_reset = curr_param->linear_before_reset; if ( reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) { - int32_t k = 0; + size_t k = 0; for (k = 0; k < _cnt_of_array( curr_param->internal_dtype ); k++) { if (curr_param->internal_dtype[k].vx_type == VSI_NN_TYPE_NONE) @@ -316,6 +336,7 @@ static vsi_bool op_setup_default /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, 
grucell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); grucell_reshape_output_tensors[i] = output_tensor->t; } } @@ -328,12 +349,14 @@ static vsi_bool op_setup_default vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); tensor = output_tensor->t; } /* concat grucell output, the gru's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -383,6 +406,8 @@ static vsi_bool op_setup_optimized vsi_nn_internal_tensor_t* input_weight_for_nn = NULL; vsi_size_t permute_in_perm[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t reshape_size[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -409,57 +434,69 @@ static vsi_bool op_setup_optimized /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); input_tensor = output_tensor->t; } /* input FC */ p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRU_INPUT_WEIGHT_I2R], inputs[GRU_INPUT_WEIGHT_I2Z], inputs[GRU_INPUT_WEIGHT_I2C]); + CHECK_PTR_FAIL_GOTO(p->local->weights_input, "Create tensor failed", final); p->local->weights_input->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST); p->local->weights_recurrent = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRU_INPUT_WEIGHT_H2R], inputs[GRU_INPUT_WEIGHT_H2Z], inputs[GRU_INPUT_WEIGHT_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->weights_recurrent, "Create tensor failed", final); p->local->weights_recurrent->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->weights_recurrent, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr, inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final); p->local->bias_r->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_r, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final); p->local->bias_z->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_z, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr, inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->bias_c, "Create tensor failed", final); p->local->bias_c->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_c, VSI_NN_TENSOR_ATTR_CONST); /* prepare weight and bias for recurrent fc */ recurrent_weight_for_nn = vsi_nn_rnn_prepare_weight_for_nn_fc(self, p->local->weights_recurrent, 1, 1); + CHECK_PTR_FAIL_GOTO(recurrent_weight_for_nn, "Create internal tensor failed", final); /* transpose input from [T,B,D] to [D,T,B] */ permute_in_perm[0] = 1; permute_in_perm[1] = 2; 
permute_in_perm[2] = 0; tmp_tensor = vsi_nn_rnn_create_permute(self, input_tensor, NULL, permute_in_perm, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); reshape_size[0] = tmp_tensor->t->attr.size[0]; reshape_size[1] = tmp_tensor->t->attr.size[1]; reshape_size[2] = tmp_tensor->t->attr.size[2]; reshape_size[3] = 1; /* new batch dim */ tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 4, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); input_weight_for_nn = vsi_nn_rnn_prepare_weight_for_nn_fc(self, p->local->weights_input, 1, 1); + CHECK_PTR_FAIL_GOTO(input_weight_for_nn, "Create internal tensor failed", final); vsi_nn_internal_init_tensor_attr(&attr, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT], use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.conv2d.ksize[0] = 1; curr->node->nn_param.conv2d.ksize[1] = 1; curr->node->nn_param.conv2d.stride[0] = 1; @@ -483,11 +520,13 @@ static vsi_bool op_setup_optimized reshape_size[1] = output_tensor->t->attr.size[1]; reshape_size[2] = output_tensor->t->attr.size[2]; output_tensor = vsi_nn_rnn_create_reshape(self, output_tensor->t, NULL, reshape_size, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); permute_in_perm[0] = 0; permute_in_perm[1] = 2; permute_in_perm[2] = 1; tmp_tensor = vsi_nn_rnn_create_permute(self, output_tensor->t, NULL, permute_in_perm, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); /* split input tensor */ split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); @@ -497,19 +536,24 @@ static vsi_bool op_setup_optimized CHECK_PTR_FAIL_GOTO( grucell_reshape_output_tensors, "Create buffer fail.", final ); memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, tmp_tensor->t, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_split_input_tensor(self, tmp_tensor->t, split_output_tensors, + (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); memcpy(&attr, &p->local->bias_r->attr, sizeof(vsi_nn_tensor_attr_t)); attr.size[1] = 1; attr.dim_num = 2; p->local->cond_zeros = vsi_nn_CreateTensorWithDefault(self->graph, &attr, 0.0); + CHECK_PTR_FAIL_GOTO(p->local->cond_zeros, "Create tensor failed", final); last_step_h_state = inputs[GRU_INPUT_H_STATE]; permute_in_perm[0] = 1; permute_in_perm[1] = 0; tmp_tensor = vsi_nn_rnn_create_permute(self, last_step_h_state, NULL, permute_in_perm, 2, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); last_step_h_state = tmp_tensor->t; for( i = 0; i < time_step; i++ ) @@ -525,6 +569,7 @@ static vsi_bool op_setup_optimized /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], (uint32_t)(unit_nums * 3), 
use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); input_fc_output = output_tensor->t; /* last_step_h_state is not batch first, no need to permute */ @@ -533,13 +578,16 @@ static vsi_bool op_setup_optimized reshape_size[1] = 1/*kernel_h*/; reshape_size[0] = last_step_h_state->attr.size[0]; tmp = vsi_nn_rnn_create_reshape(self, last_step_h_state, NULL, reshape_size, 4, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); vsi_nn_internal_init_tensor_attr(&attr, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN], use_virtual_tensor); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.conv2d.ksize[0] = 1; curr->node->nn_param.conv2d.ksize[1] = 1; curr->node->nn_param.conv2d.stride[0] = 1; @@ -562,37 +610,35 @@ static vsi_bool op_setup_optimized reshape_size[1] = recurrent_weight_for_nn->t->attr.size[3]; reshape_size[0] = batch_size; tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 2, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); recurrent_fc_output = tmp_tensor->t; /* grucell output */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); grucell_out0 = output_tensor->t; /* grucell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); grucell_out1 = output_tensor->t; curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[GRUCELL_ACTIVATION_INPUT_H_STATE] = last_step_h_state; - if(0) - { - curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = input_fc_output; - curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = NULL; - curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = NULL; - curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R] = recurrent_fc_output; - curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_Z] = NULL; - curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_C] = NULL; - } - else { splited_input_fc_output_tensors = vsi_nn_create_split(self, input_fc_output, 1, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_input_fc_output_tensors, curr, + "Create internal tensor failed", final); splited_recurrent_fc_output_tensors = vsi_nn_create_split(self, recurrent_fc_output, 1, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_recurrent_fc_output_tensors, curr, + "Create internal tensor failed", final); curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t; @@ -623,8 +669,10 @@ static vsi_bool op_setup_optimized vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); tmp_tensor = 
vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 1; for( i = 0; i < time_step; i++ ) { @@ -634,9 +682,10 @@ static vsi_bool op_setup_optimized vsi_nn_internal_setup_node(self, curr); reshape_size[0] = batch_size; - reshape_size[1] = -1; + reshape_size[1] = (vsi_size_t)-1; reshape_size[2] = time_step; tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); if(p->time_major) { @@ -657,11 +706,12 @@ static vsi_bool op_setup_optimized vsi_nn_rnn_create_permute(self, last_step_h_state, outputs[GRU_OUTPUT_H_STATE], permute_in_perm, 2, use_virtual_tensor); + ret = TRUE; final: vsi_nn_safe_free( split_output_tensors ); vsi_nn_safe_free( grucell_reshape_output_tensors ); - return TRUE; + return ret; } /* op_setup_optimized() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c index 18ae5545a..2fc49d033 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c @@ -39,6 +39,7 @@ #include "utils/vsi_nn_tensor_op.h" #include "utils/vsi_nn_util.h" #include "ops/vsi_nn_op_grucell.h" +#include "vsi_nn_error.h" typedef struct _vsi_nn_grucell_local { @@ -64,6 +65,7 @@ static vsi_nn_internal_tensor_t * _create_fc { /* create zero bias for NN/TP */ tmp_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create tensor fail.", final ); bias_tensor = tmp_tensor->t; } else @@ -85,8 +87,10 @@ static vsi_nn_internal_tensor_t * _create_fc attr.vtl = TRUE; attr.is_const = FALSE; fc_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(fc_out, "Create internal tensor failed", final); fc_node = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(fc_node, "Create internal node failed", final); fc_node->node->nn_param.fcl.axis = 0; fc_node->node->nn_param.fcl.weights = (uint32_t)weight->attr.size[1]; fc_node->inputs[0] = input; @@ -95,6 +99,7 @@ static vsi_nn_internal_tensor_t * _create_fc fc_node->outputs[0] = fc_out->t; vsi_nn_internal_setup_node(self, fc_node); +final: return fc_out; } /* () */ @@ -136,6 +141,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -146,6 +153,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } @@ -167,6 +177,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } @@ -200,6 +212,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_IN_KERNEL_I2Z + i], inputs[GRUCELL_IN_BIAS_I2Z + i] ); + CHECK_PTR_FAIL_GOTO(input_fc_outputs[i], "Create internal tensor failed", final); } /* create hstate fc */ @@ -211,6 +224,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_IN_KERNEL_R2Z + i], inputs[GRUCELL_IN_BIAS_R2Z + i] ); + CHECK_PTR_FAIL_GOTO(hstate_fc_outputs[i], "Create internal 
tensor failed", final); } memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -228,8 +242,10 @@ static vsi_bool op_setup_default attr.vtl = TRUE; attr.is_const = FALSE; h_times_r = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(h_times_r, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_H_TIMES_ACTIVATION_R, 3, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.grucell_h_times_activation_r.recurrent_activation = p->recurrent_activation; curr->inputs[0] = inputs[GRUCELL_IN_H_STATE]; curr->inputs[1] = input_fc_outputs[GRUCELL_GATES_R]->t; @@ -243,8 +259,10 @@ static vsi_bool op_setup_default inputs[GRUCELL_IN_KERNEL_R2H], inputs[GRUCELL_IN_BIAS_R2H] ); + CHECK_PTR_FAIL_GOTO(hstate_fc_outputs[GRUCELL_GATES_H], "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_Z_H, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.grucell_activation_z_h.activation = p->activation; curr->node->nn_param.grucell_activation_z_h.recurrent_activation = p->recurrent_activation; curr->inputs[GRUCELL_ACT_Z_H_HSTATE] = inputs[GRUCELL_IN_H_STATE]; @@ -257,6 +275,8 @@ static vsi_bool op_setup_default vsi_nn_internal_setup_node(self, curr); return TRUE; +final: + return FALSE; } #endif @@ -287,6 +307,7 @@ static vsi_bool op_setup_reset_after inputs[GRUCELL_IN_KERNEL_I2Z + i], inputs[GRUCELL_IN_BIAS_I2Z + i] ); + CHECK_PTR_FAIL_GOTO(input_fc_outputs[i], "Create internal tensor failed", final); } /* create hstate fc */ @@ -298,9 +319,11 @@ static vsi_bool op_setup_reset_after inputs[GRUCELL_IN_KERNEL_R2Z + i], inputs[GRUCELL_IN_BIAS_R2Z + i] ); + CHECK_PTR_FAIL_GOTO(hstate_fc_outputs[i], "Create internal tensor failed", final); } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.grucell_activation.activation = p->activation; curr->node->nn_param.grucell_activation.recurrent_activation = p->recurrent_activation; curr->inputs[GRUCELL_ACT_H_STATE] = inputs[GRUCELL_IN_H_STATE]; @@ -315,6 +338,8 @@ static vsi_bool op_setup_reset_after vsi_nn_internal_setup_node(self, curr); return TRUE; +final: + return FALSE; } static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c index 4fcd61200..1478eac41 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c @@ -75,6 +75,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -86,6 +89,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num) { outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num = \ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c index 42fc9fbc3..a77d05dd6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c @@ -73,6 +73,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c index ba9b540cf..cf35692d0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c @@ -70,6 +70,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -81,6 +84,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if(VSI_NN_DIM_AUTO == outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num) { outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num = \ @@ -108,6 +113,8 @@ static vsi_status op_init { vsi_status status = VSI_SUCCESS; + VSI_UNREFERENCED(self); + return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c index 46eff0d9d..7980d4281 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c @@ -76,6 +76,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -87,6 +90,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dim_num) { outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dim_num = \ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c index e1e448077..58dc548e6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c @@ -81,6 +81,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -92,6 +95,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { outputs[0]->attr.dim_num = \ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c index 020ab32e6..432ce2032 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c @@ -35,12 +35,12 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "ops/vsi_nn_op_grucell_ovxlib.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" #include "utils/vsi_nn_tensor_op.h" #include "utils/vsi_nn_util.h" +#include "vsi_nn_error.h" #define USE_GRUCELL_ACTIVATION @@ -78,8 +78,10 @@ static vsi_nn_internal_tensor_t* create_multiply memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->inputs[0] = input1; tmp_inode->inputs[1] = input2; @@ -89,6 +91,7 @@ static vsi_nn_internal_tensor_t* create_multiply tmp_inode->outputs[0] = tensor1->t; vsi_nn_internal_setup_node(self, tmp_inode); +final: return tensor1; } @@ -125,6 +128,7 @@ static vsi_bool setup_op_shapes attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[GRUCELL_INPUT_H_STATE] = output_tensor->t; } @@ -133,6 +137,7 @@ static vsi_bool setup_op_shapes vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRUCELL_OUTPUT_OUTPUT]->attr.dtype, TRUE); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[GRUCELL_OUTPUT_H_STATE] = output_tensor->t; } @@ -156,6 +161,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -165,6 +172,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -175,6 +184,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -187,6 +199,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -205,24 +219,31 @@ static vsi_bool op_setup_float vsi_nn_internal_tensor_t* tensor_rt = NULL; vsi_nn_internal_tensor_t* input_hstate = NULL; vsi_nn_internal_tensor_t** splited_tensors = NULL; + vsi_bool ret = FALSE; p->local->weights_update = vsi_nn_ConcatTensor(self->graph, 0, inputs[GRUCELL_INPUT_WEIGHT_I2Z], inputs[GRUCELL_INPUT_WEIGHT_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->weights_update, "Create tensor failed", final); p->local->weights_reset = vsi_nn_ConcatTensor(self->graph, 0, inputs[GRUCELL_INPUT_WEIGHT_I2R], inputs[GRUCELL_INPUT_WEIGHT_H2R]); + CHECK_PTR_FAIL_GOTO(p->local->weights_reset, "Create tensor failed", final); p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final); p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr, inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final); p->local->bias_z_r = vsi_nn_ConcatTensor(self->graph, 0, p->local->bias_z, p->local->bias_r); + CHECK_PTR_FAIL_GOTO(p->local->bias_z_r, "Create tensor failed", final); p->local->weights_z_r = vsi_nn_ConcatTensor(self->graph, 1, p->local->weights_update, p->local->weights_reset); + CHECK_PTR_FAIL_GOTO(p->local->weights_z_r, "Create tensor failed", final); p->local->weights_c = vsi_nn_ConcatTensor(self->graph, 0, inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_WEIGHT_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->weights_c, "Create tensor failed", final); p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr, inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->bias_c, "Create tensor failed", final); - vsi_safe_release_tensor(p->local->bias_z); - vsi_safe_release_tensor(p->local->bias_r); p->local->bias_z_r->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_z_r, VSI_NN_TENSOR_ATTR_CONST); p->local->weights_z_r->attr.is_const = TRUE; @@ -234,6 +255,7 @@ static vsi_bool op_setup_float input_hstate = vsi_nn_rnn_create_concat(self, 0, use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]); + CHECK_PTR_FAIL_GOTO(input_hstate, "Create internal tensor failed", final); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( input_hstate->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -247,8 +269,10 @@ static vsi_bool op_setup_float } tmp_tensor = vsi_nn_rnn_create_tp_fc(self, input_hstate->t, p->local->weights_z_r, p->local->bias_z_r, &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); splited_tensors = vsi_nn_create_split(self, tmp_tensor->t, 0, 2, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(splited_tensors, "Create internal tensor failed", final); /* reset Gate activations */ tensor_rt = vsi_nn_rnn_create_activation(self, @@ -256,6 +280,7 @@ static vsi_bool op_setup_float p->local->gate_activation, &splited_tensors[1]->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor_rt, "Create internal tensor failed", final); /* if linear_before_reset=0: ht=g(input*w_ic + (r.hstate)*w_hc + b_ic + b_hc)*/ 
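/* The branch below follows the two ONNX-style GRU candidate formulas already
 * named in the surrounding comments (sketch, using this function's locals;
 * (*) is element-wise multiply, g() the candidate activation, typically tanh):
 *
 *   linear_before_reset == 0:
 *     h~t = g( [x_t, r_t (*) h_{t-1}] x W_c + (b_ic + b_hc) )
 *     The reset gate scales h_{t-1} BEFORE the recurrent matmul, so a single
 *     fused FC over concat(input, r*h_state) with weights_c/bias_c suffices.
 *
 *   linear_before_reset != 0:
 *     h~t = g( x_t x W_ic + b_ic + r_t (*) (h_{t-1} x W_hc + b_hc) )
 *     The recurrent FC runs first and the reset gate scales its result, so the
 *     input-side and recurrent-side FCs stay separate.
 */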
if ( p->linear_before_reset == 0 ) @@ -263,10 +288,12 @@ static vsi_bool op_setup_float /* r{t} * h{t-1}*/ tensor_rt = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, tensor_rt->t, inputs[GRUCELL_INPUT_H_STATE], &tensor_rt->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor_rt, "Create internal tensor failed", final); /* [x{t}, r{t}] */ tmp_tensor = vsi_nn_rnn_create_concat(self, 0, use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], tensor_rt->t); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( tmp_tensor->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -281,6 +308,7 @@ static vsi_bool op_setup_float /* W{c} x [x{t}, r{t}] */ tmp_tensor = vsi_nn_rnn_create_tp_fc(self, tmp_tensor->t, p->local->weights_c, p->local->bias_c, &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); } /* if linear_before_reset!=0: ht=g(input*w_ic + (r.(hstate*w_hc + b_hc)) + b_ic)*/ else @@ -298,19 +326,24 @@ static vsi_bool op_setup_float /* r.(hstate*w_hc + b_hc) */ tmp_tensor = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], inputs[GRUCELL_INPUT_WEIGHT_H2C], inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); tensor_rt = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, tensor_rt->t, tmp_tensor->t, &tensor_rt->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor_rt, "Create internal tensor failed", final); /* input*w_ic + b_ic */ tmp_tensor = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_BIAS_I2C], &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_ADD, tensor_rt->t, tmp_tensor->t, &tensor_rt->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); } #define USE_GRUCELL_ACTIVATION #ifdef USE_GRUCELL_ACTIVATION curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = splited_tensors[0]->t; curr->inputs[1] = tmp_tensor->t; curr->inputs[2] = inputs[GRUCELL_INPUT_H_STATE]; @@ -342,6 +375,7 @@ static vsi_bool op_setup_float tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, tensor_zt->t, tmp_tensor->t, &tensor_ht_->t->attr.dtype, use_virtual_tensor); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = tmp_tensor->t; curr->inputs[1] = tensor_ht_->t; curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; @@ -349,12 +383,18 @@ static vsi_bool op_setup_float } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); #endif - return TRUE; + + ret = TRUE; +final: + vsi_safe_release_tensor(p->local->bias_z); + vsi_safe_release_tensor(p->local->bias_r); + return ret; } static vsi_bool op_setup_float_cudnn @@ -379,24 +419,29 @@ static vsi_bool op_setup_float_cudnn p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRUCELL_INPUT_WEIGHT_I2R], 
inputs[GRUCELL_INPUT_WEIGHT_I2Z], inputs[GRUCELL_INPUT_WEIGHT_I2C]); + CHECK_PTR_FAIL_GOTO(p->local->weights_input, "Create tensor failed", final); p->local->weights_input->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST); p->local->weights_recurrent = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRUCELL_INPUT_WEIGHT_H2R], inputs[GRUCELL_INPUT_WEIGHT_H2Z], inputs[GRUCELL_INPUT_WEIGHT_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->weights_recurrent, "Create tensor failed", final); p->local->weights_recurrent->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->weights_recurrent, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr, inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final); p->local->bias_r->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_r, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final); p->local->bias_z->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_z, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr, inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->bias_c, "Create tensor failed", final); p->local->bias_c->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_c, VSI_NN_TENSOR_ATTR_CONST); @@ -412,16 +457,19 @@ static vsi_bool op_setup_float_cudnn /* reshape and transpose input */ input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); - + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, p->local->weights_input, NULL, kernel_h, kernel_w, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); + /* transpose and reshape output */ reshaped_size[0] = inputs[GRUCELL_INPUT_INPUT]->attr.size[1]; reshaped_size[1] = p->local->weights_input->attr.size[1]; input_fc_output = vsi_nn_rnn_create_reshape(self, tmp->t, NULL, reshaped_size, 2, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_fc_output, "Create internal tensor failed", final); grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_INPUT_NC_FC_CN; } @@ -430,6 +478,7 @@ static vsi_bool op_setup_float_cudnn input_fc_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_INPUT], p->local->weights_input, NULL, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_fc_output, "Create internal tensor failed", final); grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC; } @@ -444,25 +493,31 @@ static vsi_bool op_setup_float_cudnn /* reshape and transpose input */ input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, p->local->weights_recurrent, NULL, kernel_h, kernel_w, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN], use_virtual_tensor); + 
CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); + /* transpose and reshape output */ reshaped_size[0] = inputs[GRUCELL_INPUT_H_STATE]->attr.size[1]; reshaped_size[1] = p->local->weights_recurrent->attr.size[1]; recurrent_fc_output = vsi_nn_rnn_create_reshape(self, tmp->t, NULL, reshaped_size, 2, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(recurrent_fc_output, "Create internal tensor failed", final); } else { recurrent_fc_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], p->local->weights_recurrent, NULL, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(recurrent_fc_output, "Create internal tensor failed", final); } #ifdef USE_GRUCELL_ACTIVATION curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[GRUCELL_ACTIVATION_INPUT_H_STATE] = inputs[GRUCELL_INPUT_H_STATE]; if(p->local->multi_batch) @@ -480,8 +535,12 @@ static vsi_bool op_setup_float_cudnn { splited_input_fc_output_tensors = vsi_nn_create_split(self, input_fc_output->t, 1, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_input_fc_output_tensors, curr, + "Create internal tensor failed", final); splited_recurrent_fc_output_tensors = vsi_nn_create_split(self, recurrent_fc_output->t, 1, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_recurrent_fc_output_tensors, curr, + "Create internal tensor failed", final); curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t; @@ -494,8 +553,12 @@ static vsi_bool op_setup_float_cudnn { splited_input_fc_output_tensors = vsi_nn_create_split(self, input_fc_output->t, 0, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_input_fc_output_tensors, curr, + "Create internal tensor failed", final); splited_recurrent_fc_output_tensors = vsi_nn_create_split(self, recurrent_fc_output->t, 0, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_recurrent_fc_output_tensors, curr, + "Create internal tensor failed", final); curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t; @@ -593,12 +656,14 @@ static vsi_bool op_setup_float_cudnn tensor_u->t, tmp_tensor->t, &tmp_tensor->t->attr.dtype, use_virtual_tensor); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = tmp_tensor->t; curr->inputs[1] = tensor_c->t; curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; vsi_nn_internal_setup_node(self, curr); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); @@ -606,6 +671,8 @@ static vsi_bool op_setup_float_cudnn #endif return TRUE; +final: + return FALSE; } /* @@ -629,34 +696,38 @@ static vsi_bool op_setup_float_cudnn_v2 vsi_nn_internal_tensor_t* tensor_r = NULL; 
vsi_nn_internal_tensor_t* concated_input = NULL; vsi_nn_tensor_attr_t attr; + vsi_bool ret = FALSE; /* input to r,z */ p->local->weights_update = vsi_nn_ConcatTensor(self->graph, 1/* axis */, inputs[GRUCELL_INPUT_WEIGHT_I2R], inputs[GRUCELL_INPUT_WEIGHT_I2Z]); + CHECK_PTR_FAIL_GOTO(p->local->weights_update, "Create tensor failed", final); /* recurrent to r,z */ p->local->weights_reset = vsi_nn_ConcatTensor(self->graph, 1/* axis */, inputs[GRUCELL_INPUT_WEIGHT_H2R], inputs[GRUCELL_INPUT_WEIGHT_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->weights_reset, "Create tensor failed", final); /* [input, recurrent] to r,z */ p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 0/* axis */, p->local->weights_update, p->local->weights_reset); + CHECK_PTR_FAIL_GOTO(p->local->weights_input, "Create tensor failed", final); p->local->weights_input->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST); - vsi_safe_release_tensor(p->local->weights_update); - vsi_safe_release_tensor(p->local->weights_reset); p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final); p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr, inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final); p->local->bias_z_r = vsi_nn_ConcatTensor(self->graph, 0/* axis */, p->local->bias_r, p->local->bias_z); + CHECK_PTR_FAIL_GOTO(p->local->bias_z_r, "Create tensor failed", final); p->local->bias_z_r->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_z_r, VSI_NN_TENSOR_ATTR_CONST); - vsi_safe_release_tensor(p->local->bias_z); - vsi_safe_release_tensor(p->local->bias_r); concated_input = vsi_nn_rnn_create_concat(self, 0/* axis */, use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]); + CHECK_PTR_FAIL_GOTO(concated_input, "Create internal tensor failed", final); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( concated_input->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -670,6 +741,16 @@ static vsi_bool op_setup_float_cudnn_v2 } tmp_tensor = vsi_nn_rnn_create_tp_fc(self, concated_input->t, p->local->weights_input, p->local->bias_z_r, &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); + + { + uint32_t _slices[] = { 0, 0 }; + _slices[0] = (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0]; + _slices[1] = (uint32_t)inputs[GRUCELL_INPUT_H_STATE]->attr.size[0]; + splited_input_fc_output_tensors = vsi_nn_create_split(self, concated_input->t, + 0, 2, _slices, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( splited_input_fc_output_tensors, "Create internal tensor fail.", final ); + } dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -681,14 +762,10 @@ static vsi_bool op_setup_float_cudnn_v2 { dtype.vx_type = VSI_NN_TYPE_FLOAT16; } - { - uint32_t _slices[] = { (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0], - (uint32_t)inputs[GRUCELL_INPUT_H_STATE]->attr.size[0] }; - splited_input_fc_output_tensors = vsi_nn_create_split(self, concated_input->t, - 0, 2, _slices, use_virtual_tensor); - } + input2cand_output = vsi_nn_rnn_create_tp_fc(self, splited_input_fc_output_tensors[0]->t, inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_BIAS_I2C], &dtype, 
use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input2cand_output, "Create internal tensor failed", final); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -702,14 +779,17 @@ static vsi_bool op_setup_float_cudnn_v2 } recurrent2cand_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], inputs[GRUCELL_INPUT_WEIGHT_H2C], inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(recurrent2cand_output, "Create internal tensor failed", final); tmp_tensor = vsi_nn_rnn_create_activation(self, tmp_tensor->t, p->local->gate_activation, &tmp_tensor->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); /* split for combined FC outputs, r_t, z_t */ splited_input_fc_output_tensors = vsi_nn_create_split(self, tmp_tensor->t, 0/* axis */, 2/* dim num */, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(splited_input_fc_output_tensors, "Create internal tensor failed", final); memset( &attr, 0x00, sizeof(attr) ); attr.dim_num = VSI_NN_DIM_AUTO; @@ -726,8 +806,10 @@ static vsi_bool op_setup_float_cudnn_v2 dtype.vx_type = VSI_NN_TYPE_FLOAT16; } tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_A_TIMES_B_PLUS_C, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = splited_input_fc_output_tensors[0]->t; curr->inputs[1] = recurrent2cand_output->t; curr->inputs[2] = input2cand_output->t; @@ -736,10 +818,12 @@ static vsi_bool op_setup_float_cudnn_v2 tensor_r = vsi_nn_rnn_create_activation(self, tmp_tensor->t, p->local->candidate_activation, &tmp_tensor->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(tensor_r, curr, "Create internal tensor failed", final); #define USE_GRUCELL_ACTIVATION_SMA #ifdef USE_GRUCELL_ACTIVATION_SMA curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL_SMA, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_STATE] = inputs[GRUCELL_INPUT_H_STATE]; curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_T_] = tensor_r->t; curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_Z_T] = splited_input_fc_output_tensors[1]->t; @@ -758,18 +842,25 @@ static vsi_bool op_setup_float_cudnn_v2 tmp_tensor->t, &tmp_tensor->t->attr.dtype, use_virtual_tensor); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = tmp_tensor->t; curr->inputs[1] = tensor_r->t; curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; vsi_nn_internal_setup_node(self, curr); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); #endif - - return TRUE; + ret = TRUE; +final: + vsi_safe_release_tensor(p->local->bias_z); + vsi_safe_release_tensor(p->local->bias_r); + vsi_safe_release_tensor(p->local->weights_update); + vsi_safe_release_tensor(p->local->weights_reset); + return ret; } static vsi_bool op_setup_default @@ -804,6 +895,8 @@ static vsi_bool op_setup_default uint32_t kernel_h = 1; uint32_t kernel_w = 1; int32_t i = 0; + vsi_nn_tensor_t* wei_r2c_tensor = NULL; + vsi_nn_tensor_t* 
bias_r2c_tensor = NULL; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); memset( &attr, 0x00, sizeof( attr ) ); @@ -853,6 +946,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_BIAS_I2R + i], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs[i], "Create internal tensor failed", final); } } else @@ -862,6 +956,7 @@ static vsi_bool op_setup_default (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); for( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++) { @@ -872,9 +967,11 @@ static vsi_bool op_setup_default kernel_h, kernel_w, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); /* transpose and reshape output */ input_gate_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs[i], "Create internal tensor failed", final); } } @@ -889,10 +986,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_BIAS_H2R + i], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R + i], use_virtual_tensor); - if (hstate_gate_fc_outputs[i] == NULL) - { - goto error; - } + CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs[i], "Create internal tensor failed", final); } } else @@ -902,6 +996,7 @@ static vsi_bool op_setup_default (uint32_t)inputs[GRUCELL_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_input_tensor, "Create internal tensor failed", final); for( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++) { @@ -912,9 +1007,11 @@ static vsi_bool op_setup_default kernel_h, kernel_w, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); /* transpose and reshape output */ hstate_gate_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs[i], "Create internal tensor failed", final); } } @@ -926,6 +1023,7 @@ static vsi_bool op_setup_default hstate_gate_fc_outputs[i]->t, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(gate_fc_outputs[i], "Create internal tensor failed", final); } /* Gate activations */ @@ -936,6 +1034,7 @@ static vsi_bool op_setup_default p->local->gate_activation, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(gate_act_outputs[i], "Create internal tensor failed", final); } /* Candidate FC */ @@ -948,6 +1047,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_H_STATE], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(rh_mul_outputs, "Create internal tensor failed", final); } else { @@ -957,6 +1057,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_H_STATE]->attr.size, inputs[GRUCELL_INPUT_H_STATE]->attr.dim_num, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(rh_mul_outputs, "Create internal tensor failed", final); } if( inputs[GRUCELL_INPUT_INPUT]->attr.dtype.qnt_type @@ 
-999,6 +1100,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_BIAS_I2C], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_cand_fc_output, "Create internal tensor failed", final); } else { @@ -1008,6 +1110,8 @@ static vsi_bool op_setup_default (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); + tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, inputs[GRUCELL_INPUT_WEIGHT_I2C], @@ -1015,9 +1119,11 @@ static vsi_bool op_setup_default kernel_h, kernel_w, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); /* transpose and reshape output */ input_cand_fc_output = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_cand_fc_output, "Create internal tensor failed", final); } if ( is_hstate_cand_fc_op_tp ) { @@ -1025,9 +1131,6 @@ static vsi_bool op_setup_default if ((rh_mul_outputs->t->attr.dtype.vx_type) != (inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr.dtype.vx_type) && (p->local->multi_batch)) { - vsi_nn_tensor_t* wei_r2c_tensor = NULL; - vsi_nn_tensor_t* bias_r2c_tensor = NULL; - memcpy(&attr, &(inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr), sizeof(attr)); attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( rh_mul_outputs->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -1041,14 +1144,18 @@ static vsi_bool op_setup_default } wei_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_WEIGHT_H2C], &(attr.dtype)); + CHECK_PTR_FAIL_GOTO(wei_r2c_tensor, "Create tensor failed", final); attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; bias_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_BIAS_H2C], &(attr.dtype)); + CHECK_PTR_FAIL_GOTO(bias_r2c_tensor, "Create tensor failed", final); + rh_cand_fc_output = vsi_nn_rnn_create_tp_fc(self, rh_mul_outputs->t, wei_r2c_tensor, bias_r2c_tensor, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(rh_cand_fc_output, "Create internal tensor failed", final); } else { @@ -1058,6 +1165,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_BIAS_H2C], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(rh_cand_fc_output, "Create internal tensor failed", final); } } else @@ -1068,6 +1176,8 @@ static vsi_bool op_setup_default (uint32_t)rh_mul_outputs->t->attr.size[0], &kernel_h, &kernel_w); hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, rh_mul_outputs->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_input_tensor, "Create internal tensor failed", final); + tmp = vsi_nn_rnn_create_nn_fc(self, hstate_input_tensor->t, inputs[GRUCELL_INPUT_WEIGHT_H2C], @@ -1075,9 +1185,11 @@ static vsi_bool op_setup_default kernel_h, kernel_w, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); /* transpose and reshape output */ rh_cand_fc_output = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(rh_cand_fc_output, "Create internal tensor failed", final); } if ( 
p->linear_before_reset == 0 ) @@ -1091,6 +1203,7 @@ static vsi_bool op_setup_default rh_cand_fc_output->t, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(r_mul_hcand_fc_output, "Create internal tensor failed", final); } /* Candidate input FC add r*h FC */ cand_fc_output = vsi_nn_rnn_create_tensor_add(self, @@ -1098,6 +1211,7 @@ static vsi_bool op_setup_default r_mul_hcand_fc_output->t, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(cand_fc_output, "Create internal tensor failed", final); /* Candidate activation */ cand_act_output = vsi_nn_rnn_create_activation(self, @@ -1105,6 +1219,7 @@ static vsi_bool op_setup_default p->local->candidate_activation, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(cand_act_output, "Create internal tensor failed", final); /* GRU cell output */ memcpy( &attr.dtype, &gate_act_outputs[GRUCELL_GATE_Z]->t->attr.dtype, sizeof( attr.dtype ) ); @@ -1113,6 +1228,7 @@ static vsi_bool op_setup_default attr.vtl = use_virtual_tensor; attr.is_const = TRUE; input_tensor = vsi_nn_internal_new_tensor(self, &attr, 1.0f); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); memset( &attr, 0x00, sizeof(attr) ); //memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); @@ -1131,9 +1247,11 @@ static vsi_bool op_setup_default } tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); /* create internal tensor sub node (1-zt)*c */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SUBTRACT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = input_tensor->t; curr->inputs[1] = gate_act_outputs[GRUCELL_GATE_Z]->t; curr->outputs[0] = tmp_tensor->t; @@ -1146,6 +1264,7 @@ static vsi_bool op_setup_default cand_act_output->t, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); /* create internal multiply node zt*hstate */ tmp_tensor = create_multiply(self, @@ -1153,9 +1272,11 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_H_STATE], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2Z], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); /* create internal tensor add node (1-zt)*c + zt*hstate */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = output_tensor->t; curr->inputs[1] = tmp_tensor->t; curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; @@ -1164,13 +1285,16 @@ static vsi_bool op_setup_default /* copy output to h_state */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); return TRUE; -error: +final: + vsi_safe_release_tensor(wei_r2c_tensor); + vsi_safe_release_tensor(bias_r2c_tensor); return FALSE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c index dbec83887..4a07faab6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c @@ -94,6 +94,8 @@ static vsi_bool op_setup 
vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); + if( outputs[0]->attr.dim_num == VSI_NN_DIM_AUTO ) { outputs[0]->attr.dim_num = inputs[2]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c index 46ee1d284..cc4b44362 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c @@ -68,6 +68,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -79,6 +82,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = 2; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c index d9b3b320f..5386af725 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c @@ -60,6 +60,10 @@ static vsi_status op_compute { vsi_status status = VSI_SUCCESS; + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return status; } /* op_compute() */ @@ -124,6 +128,12 @@ vsi_status vsi_nn_op_imageprocess_single_node vsi_nn_tensor_t *tensor_out ) { + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(attr); + VSI_UNREFERENCED(p); + VSI_UNREFERENCED(data); + VSI_UNREFERENCED(tensor_out); + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c index 9a2043e9e..2066865a5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c @@ -34,6 +34,7 @@ #include "vsi_nn_tensor.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" /* Declare number of input and output. 
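Many of the smaller hunks in this patch do nothing but silence unused-parameter warnings. The VSI_UNREFERENCED macro itself is not part of this diff; a minimal sketch of the usual shape (an assumption, not the project's actual definition) and of how the touched op_check/op_setup stubs use it:

/* Hypothetical stand-in for the project's VSI_UNREFERENCED: evaluate the
   argument as a void expression so unused-parameter warnings go away without
   changing behaviour. */
#ifndef VSI_UNREFERENCED
#define VSI_UNREFERENCED( param ) ( (void)(param) )
#endif

/* Illustrative stub mirroring the op_check functions touched above. */
static vsi_bool op_check_example
    (
    vsi_nn_node_t   * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    VSI_UNREFERENCED(self);
    VSI_UNREFERENCED(inputs);
    VSI_UNREFERENCED(outputs);
    return TRUE;
} /* op_check_example() */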
@@ -50,6 +51,9 @@ static vsi_status op_compute { vsi_status status = VSI_SUCCESS; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + status = vsi_nn_internal_compute_node( self ); return status; @@ -64,6 +68,9 @@ static vsi_bool op_check { vsi_nn_interp_param *p = NULL; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + p = &self->nn_param.interp; if ((p->pad_beg > 0) || (p->pad_end > 0)) @@ -166,8 +173,10 @@ static vsi_bool op_setup memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); crop_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(crop_tensor, "Create internal tensor failed", final); crop_in_tensor = crop_tensor->t; curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 1, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; curr->node->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num; curr->node->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num; @@ -177,10 +186,13 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.new_axis_mask = 0; begin_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(begin_dims, curr, "Create internal buffer failed", final); end_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(end_dims, curr, "Create internal buffer failed", final); stride_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(stride_dims, curr, "Create internal buffer failed", final); for (i = 0; i < inputs[0]->attr.dim_num; i++) { stride_dims[i] = 1; @@ -215,6 +227,7 @@ static vsi_bool op_setup && (height_in_eff_ == (vsi_ssize_t)outputs[0]->attr.size[1])) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 1, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = crop_in_tensor; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node(self, curr); @@ -222,6 +235,7 @@ static vsi_bool op_setup else { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_INTERNAL, 1, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.resize_internal.align_corners = vx_true_e; curr->node->nn_param.resize_internal.factor = factor; curr->node->nn_param.resize_internal.half_pixel_centers = vx_false_e; @@ -231,6 +245,8 @@ static vsi_bool op_setup } return TRUE; +final: + return FALSE; } /* op_setup() */ static vsi_status op_optimize @@ -243,6 +259,9 @@ static vsi_status op_optimize { vsi_status status; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + status = VSI_SUCCESS; vsi_nn_internal_optimize_node( self, direction ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c index cff15071e..242099b11 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c @@ -42,7 +42,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; int32_t axis = self->nn_param.l2_normalize.axis; vsi_nn_kernel_param_t * param = NULL; diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index e872a3dc5..d52eb7d19 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -54,7 +54,7 @@ static vsi_nn_tensor_t* _expand_scale_tensor vsi_size_t scale_size_out ) { - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; float* f32_in_buffer = NULL; float* f32_out_buffer = NULL; vsi_size_t i = 0; @@ -144,13 +144,7 @@ static vsi_bool _check_value_is_equal_to_one } } - if ( !tensor->attr.is_created_from_handle ) - { - if ( tensor_data ) - { - free(tensor_data); - } - } + vsi_nn_safe_free(tensor_data); return ret; } @@ -324,7 +318,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_node_t* curr = NULL; if( NULL == self ) @@ -349,10 +343,11 @@ static vsi_bool op_setup { self->nn_param.l2normalizescale.local.use_internal_node = TRUE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) || @@ -370,8 +365,10 @@ static vsi_bool op_setup attr.vtl = TRUE; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis; curr->inputs[0] = inputs[0]; curr->outputs[0] = output_tensor->t; @@ -389,22 +386,26 @@ static vsi_bool op_setup attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; } reshape_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create internal tensor failed", final); + vsi_nn_ConvertTensor(self->graph, inputs[1], reshape_tensor->t); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = output_tensor->t; curr->inputs[1] = reshape_tensor->t; curr->node->nn_param.multiply.scale = 1.0f; curr->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; curr->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } else { ret = vsi_nn_op_common_setup(self, inputs, outputs); } +final: return ret; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index 75354a7c5..a90ae594b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -37,6 +37,7 @@ #include "vsi_nn_tensor_util_prv.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) @@ -116,11 +117,15 @@ static vsi_bool op_setup attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; mean_tensor = 
vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(mean_tensor, "Create internal tensor failed", final); vari_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(vari_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MOMENTS, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); axis_array = (int32_t*)\ vsi_nn_internal_new_node_param(curr, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(axis_array, curr, "Create internal buffer failed", final); axis_array[0] = axis; curr->node->nn_param.moments.axis = axis_array; @@ -131,6 +136,7 @@ static vsi_bool op_setup vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_BATCHNORM_SINGLE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->inputs[1] = mean_tensor->t; curr->inputs[2] = vari_tensor->t; @@ -138,13 +144,14 @@ static vsi_bool op_setup curr->inputs[4] = inputs[1]; curr->node->nn_param.batchnorm_single.eps = self->nn_param.layernorm.eps; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } else { ret = vsi_nn_op_common_setup(self, inputs, outputs); } +final: return ret; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c index fd12173cf..34c329c4c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c @@ -112,6 +112,8 @@ static vsi_bool _log_softmax_op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(kernel_name); + /* TODO: Add code to comput outputs' shape. 
*/ if( NULL == self ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c index e44440ead..6bddcff6e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c @@ -100,6 +100,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + out_rank = inputs[0]->attr.dim_num; for(i = 0; i < out_rank; i++) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c index 01695c42b..7cb068ed0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c @@ -106,6 +106,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + in1_rank = inputs[0]->attr.dim_num; in2_rank = inputs[1]->attr.dim_num; out_rank = vsi_nn_max( in1_rank, in2_rank ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c index 7a3eb91c0..9547d8be8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c @@ -164,6 +164,8 @@ static vsi_bool op_setup { vsi_size_t i = 0; + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c index 63a85f7ab..8d55f065d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c @@ -34,6 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -42,15 +43,17 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_SUCCESS; + vsi_status status = VSI_FAILURE; vsi_nn_tensor_t * type_tensor = NULL; vx_nn_lshproj_params_t p; vx_bool valued = TRUE; vsi_nn_tensor_t * weight_tensor = NULL; + float* const_data = NULL; type_tensor = vsi_nn_VariableToTensor(self, (uint8_t *)&self->nn_param.lsh_projection.type, VSI_NN_TYPE_INT32); + CHECK_PTR_FAIL_GOTO( type_tensor, "Create tensor fail.", final ); memset(&p, 0, sizeof(p)); p.hash_func = REQUIRED_IO(inputs[0]); @@ -65,7 +68,9 @@ static vsi_status op_compute float const_one = 1.0; vsi_size_t i; vsi_size_t count = inputs[1]->attr.size[1]; - float* const_data = (float*)malloc(count * sizeof(float)); + + const_data = (float*)malloc(count * sizeof(float)); + CHECK_PTR_FAIL_GOTO( const_data, "Create buffer fail.", final ); for (i = 0; i < count; i++) { @@ -78,9 +83,8 @@ static vsi_status op_compute attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; weight_tensor = vsi_nn_CreateTensorFromData(self->graph, (uint8_t *)const_data, &attr); + CHECK_PTR_FAIL_GOTO( weight_tensor, "Create tensor fail.", final ); p.weights = weight_tensor->t; - free(const_data); - //valued = FALSE; } vxSetTensorAttribute(p.weights, VX_TENSOR_VALUE, &valued, sizeof(vx_bool)); @@ -90,8 +94,12 @@ static vsi_status op_compute { status = VSI_FAILURE; } - vsi_nn_ReleaseTensor( &type_tensor ); - if (weight_tensor != NULL) vsi_nn_ReleaseTensor(&weight_tensor); + +final: + vsi_nn_safe_free(const_data); + vsi_safe_release_tensor( type_tensor ); + vsi_safe_release_tensor( weight_tensor ); + return status; } /* op_compute() */ @@ -102,6 
+110,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c index 900e50b7d..d3cc0c824 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c @@ -202,6 +202,8 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(outputs); //TODO: Check tensor shapes. if( inputs[0]->attr.dim_num < 3) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c index 283f930b5..ebd17a3f2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c @@ -35,9 +35,9 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" +#include "vsi_nn_error.h" static vsi_bool setup_op_shapes ( @@ -82,6 +82,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[LSTM_INPUT_H_STATE] = output_tensor->t; } @@ -96,6 +97,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[LSTM_INPUT_C_STATE] = output_tensor->t; } @@ -107,6 +109,7 @@ static vsi_bool setup_op_shapes attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[LSTM_OUTPUT_H_STATE] = output_tensor->t; } @@ -119,6 +122,7 @@ static vsi_bool setup_op_shapes attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[LSTM_OUTPUT_C_STATE] = output_tensor->t; } @@ -156,6 +160,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -165,6 +171,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -175,6 +183,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -187,6 +198,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -211,6 +224,8 @@ static vsi_bool op_setup uint32_t batch_size = 0; uint32_t time_step = 0; uint32_t i = 0; + vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -235,21 +250,26 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[LSTM_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); input_tensor = output_tensor->t; } /* split input tensor */ split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * \ sizeof(vsi_nn_tensor_t *)); + CHECK_PTR_FAIL_GOTO( split_output_tensors, "Create buffer fail.", final ); memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t *)); lstmunit_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * \ sizeof(vsi_nn_tensor_t *)); + CHECK_PTR_FAIL_GOTO( lstmunit_reshape_output_tensors, "Create buffer fail.", final ); memset( lstmunit_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t *)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); last_step_h_state = inputs[LSTM_INPUT_H_STATE]; last_step_c_state = inputs[LSTM_INPUT_C_STATE]; @@ -263,6 +283,7 @@ static vsi_bool op_setup /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); reshape_output = output_tensor->t; /* lstmunit output */ @@ -275,6 +296,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[LSTM_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); lstmunit_out0 = output_tensor->t; } @@ -284,12 +306,14 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[LSTM_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); lstmunit_out1 = output_tensor->t; /* lstmunit output c_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[LSTM_OUTPUT_C_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); lstmunit_out2 = output_tensor->t; } else @@ -299,6 +323,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation; curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip; 
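/*
 * A minimal, self-contained sketch of the goto-style cleanup pattern that the
 * CHECK_PTR_FAIL_GOTO / CHECK_STATUS_FAIL_GOTO additions in this op_setup hunk
 * rely on: allocate, validate, and release every buffer on a single `final:`
 * exit path. The macro body and names below are illustrative assumptions; the
 * real definitions live in vsi_nn_error.h.
 */
#include <stdio.h>
#include <stdlib.h>

#define DEMO_CHECK_PTR_FAIL_GOTO(ptr, msg, label) \
    do { if ((ptr) == NULL) { fprintf(stderr, "%s\n", (msg)); goto label; } } while (0)

static int demo_setup(size_t time_step)
{
    int ok = 0;                         /* mirrors `vsi_bool ret = FALSE;` */
    void **split_outputs = NULL;
    void **reshape_outputs = NULL;

    split_outputs = calloc(time_step, sizeof(*split_outputs));
    DEMO_CHECK_PTR_FAIL_GOTO(split_outputs, "Create buffer fail.", final);

    reshape_outputs = calloc(time_step, sizeof(*reshape_outputs));
    DEMO_CHECK_PTR_FAIL_GOTO(reshape_outputs, "Create buffer fail.", final);

    /* ... per-timestep node construction would go here ... */
    ok = 1;                             /* only reached on the success path */

final:
    /* both buffers are released on every exit path, as vsi_nn_safe_free does */
    free(split_outputs);
    free(reshape_outputs);
    return ok;
}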
curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias; @@ -350,6 +375,7 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, lstmunit_out0, batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); lstmunit_reshape_output_tensors[i] = output_tensor->t; } } @@ -362,19 +388,21 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[LSTM_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); tensor = output_tensor->t; } /* concat */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { curr->inputs[i] = lstmunit_reshape_output_tensors[i]; } curr->outputs[0] = tensor; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); if( !curr_param->time_major ) { @@ -383,11 +411,17 @@ static vsi_bool op_setup tensor, outputs[LSTM_OUTPUT_OUTPUT], use_virtual_tensor); } } + else + { + /* return_sequences = False, return true to setup lstm node. */ + ret = TRUE; + } +final: vsi_nn_safe_free( split_output_tensors ); vsi_nn_safe_free( lstmunit_reshape_output_tensors ); - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c index 7730fee89..13fe0fed8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c @@ -222,6 +222,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); //TODO: Check tensor shapes. return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c index 27b545719..22dfd664d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -49,7 +49,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; int32_t _is_ln= 0; int32_t _is_cifg= 0; @@ -107,6 +107,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index d792d34b2..f715c99ad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "ops/vsi_nn_op_lstmunit_ovxlib.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -64,8 +64,10 @@ static vsi_nn_internal_tensor_t* create_tp_fc vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.fcl.axis = 0; tmp_inode->node->nn_param.fcl.weights = (uint32_t)weight->attr.size[1]; @@ -75,6 +77,7 @@ static vsi_nn_internal_tensor_t* create_tp_fc tmp_inode->outputs[0] = tensor2->t; vsi_nn_internal_setup_node(self, tmp_inode); +final: return tensor2; } @@ -105,6 +108,7 @@ static vsi_nn_internal_tensor_t* create_nn_fc vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); reshaped_weight_shape[3] = weight->attr.size[1]; reshaped_weight_shape[2] = weight->attr.size[0] / ( kernel_h * kernel_w ); @@ -118,10 +122,12 @@ static vsi_nn_internal_tensor_t* create_nn_fc memcpy( &attr.dtype, &weight->attr.dtype, sizeof(attr.dtype) ); memcpy( &attr.size, &reshaped_weight_shape, sizeof(attr.size)); reshaped_weight_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(reshaped_weight_tensor, "Create internal tensor failed", final); vsi_nn_ReshapeTensor( self->graph, weight, reshaped_weight_tensor->t, reshaped_weight_shape, 4 ); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.conv2d.ksize[0] = kernel_w; tmp_inode->node->nn_param.conv2d.ksize[1] = kernel_h; tmp_inode->node->nn_param.conv2d.stride[0] = 1; @@ -141,10 +147,11 @@ static vsi_nn_internal_tensor_t* create_nn_fc tmp_inode->outputs[0] = tensor2->t; vsi_nn_internal_setup_node(self, tmp_inode); +final: return tensor2; } -static void create_peephole +static vsi_status create_peephole ( vsi_nn_node_t * self, vsi_nn_tensor_t * input, @@ -153,6 +160,7 @@ static void create_peephole vsi_bool use_virtual_tensor ) { + vsi_status status = VSI_FAILURE; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* input_tensor0 = NULL; vsi_nn_internal_tensor_t* input_tensor1 = NULL; @@ -164,8 +172,10 @@ static void create_peephole attr.is_const = FALSE; memcpy(&(attr.dtype), &((*input_fc)->t->attr.dtype), sizeof(vsi_nn_dtype_t)); input_tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(input_tensor0, "Create internal tensor failed", final); /* create internal nodes */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_MULTIPLY, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.multiply.scale = 1.0f; curr->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; 
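/*
 * The MULTIPLY + ADD node pair that create_peephole builds implements the
 * usual LSTM peephole term: each gate's pre-activation gets the cell state,
 * scaled elementwise by a per-unit peephole weight (WEIGHT_C2I/C2F/C2O),
 * added to it. A scalar reference version, written only to make that dataflow
 * explicit; the function name and float types are illustrative assumptions,
 * not driver API.
 */
#include <stddef.h>

static void demo_apply_peephole(const float *c_state,
                                const float *peephole_weight,
                                float *gate_preactivation,  /* updated in place */
                                size_t num_units)
{
    size_t i;
    for (i = 0; i < num_units; i++)
    {
        /* MULTIPLY node: tmp     = c_state * weight_c2x  (scale = 1.0)  */
        /* ADD node:      gate_fc = gate_fc + tmp                        */
        gate_preactivation[i] += c_state[i] * peephole_weight[i];
    }
}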
curr->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; @@ -174,13 +184,19 @@ static void create_peephole curr->outputs[0] = input_tensor0->t; vsi_nn_internal_setup_node(self, curr); input_tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO( input_tensor1, "Create internal tensor fail.", final ); /* create internal nodes */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = (*input_fc)->t; curr->inputs[1] = input_tensor0->t; curr->outputs[0] = input_tensor1->t; vsi_nn_internal_setup_node(self, curr); *input_fc = input_tensor1; + + status = VSI_SUCCESS; +final: + return status; } static vsi_bool setup_op_shapes @@ -236,6 +252,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -246,6 +264,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -258,6 +279,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -272,7 +295,6 @@ static vsi_bool op_setup vsi_nn_tensor_attr_t attr; vsi_bool is_input_fc_on_tp = FALSE; vsi_bool is_recurrent_fc_on_tp = FALSE; - vsi_nn_internal_tensor_t* add_tensor = NULL; vsi_nn_internal_tensor_t* input_tensor = NULL; vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_nn_internal_tensor_t* recurrent_input_tensor = NULL; @@ -364,6 +386,7 @@ static vsi_bool op_setup bias_tensors[i], &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( input_fc_outputs[i], "Create tensor fail.", final ); } if (inputs[LSTMUNIT_INPUT_AUX_INPUT] != NULL) { @@ -375,6 +398,7 @@ static vsi_bool op_setup NULL, &p->internal_dtype_aux[LSTMUNIT_QUANTIZE_PARAM_AUX_I2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( aux_input_fc_outputs[i], "Create tensor fail.", final ); } } } @@ -385,6 +409,7 @@ static vsi_bool op_setup (uint32_t)inputs[LSTMUNIT_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final ); for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) { @@ -395,9 +420,11 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( tmp, "Create tensor fail.", final ); /* transpose and reshape output */ input_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( input_fc_outputs[i], "Create tensor fail.", final ); } if (inputs[LSTMUNIT_INPUT_AUX_INPUT] != NULL) { @@ -406,6 +433,7 @@ static vsi_bool op_setup (uint32_t)inputs[LSTMUNIT_INPUT_AUX_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_AUX_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final ); for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) { @@ 
-416,9 +444,11 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype_aux[LSTMUNIT_QUANTIZE_PARAM_AUX_I2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( tmp, "Create tensor fail.", final ); /* transpose and reshape output */ aux_input_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( aux_input_fc_outputs[i], "Create tensor fail.", final ); } } } @@ -432,6 +462,7 @@ static vsi_bool op_setup aux_input_fc_outputs[i]->t, &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( input_add_aux_input_fc_outputs[i], "Create tensor fail.", final ); input_fc_outputs[i] = input_add_aux_input_fc_outputs[i]; } } @@ -447,6 +478,7 @@ static vsi_bool op_setup NULL, &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( recurrent_fc_outputs[i], "Create tensor fail.", final ); } } else @@ -456,6 +488,7 @@ static vsi_bool op_setup (uint32_t)inputs[LSTMUNIT_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); recurrent_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( recurrent_input_tensor, "Create tensor fail.", final ); for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) { @@ -466,31 +499,37 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( tmp, "Create tensor fail.", final ); /* transpose and reshape output */ recurrent_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( recurrent_fc_outputs[i], "Create tensor fail.", final ); } } if (p->local->use_peephole) { + vsi_status status = VSI_FAILURE; /* update input gate */ if (!p->local->use_cifg) { - create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], + status = create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], inputs[LSTMUNIT_INPUT_WEIGHT_C2I], &(input_fc_outputs[0]), use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO( status, final ); } /* update forget gate */ - create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], + status = create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], inputs[LSTMUNIT_INPUT_WEIGHT_C2F], &(input_fc_outputs[1]), use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO( status, final ); /* update output gate */ - create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], + status = create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], inputs[LSTMUNIT_INPUT_WEIGHT_C2O], &(input_fc_outputs[3]), use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO( status, final ); } /* layernorm */ @@ -498,59 +537,31 @@ static vsi_bool op_setup { for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) { - if (self->graph->ctx->config.support_stream_processor) - { - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - attr.dim_num = VSI_NN_DIM_AUTO; - attr.vtl = use_virtual_tensor; - attr.is_const = FALSE; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - add_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - /* create internal nodes */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); - curr->inputs[0] = input_fc_outputs[i]->t; - curr->inputs[1] = recurrent_fc_outputs[i]->t; - curr->outputs[0] = add_tensor->t; - vsi_nn_internal_setup_node(self, curr); - - /* create internal 
nodes */ - input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LAYER_NORM, 0, 0 ); - curr->node->nn_param.layernorm.eps = (float)1e-8; - curr->inputs[0] = add_tensor->t; - curr->inputs[1] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; - curr->inputs[2] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; - curr->outputs[0] = input_tensor->t; - vsi_nn_internal_setup_node(self, curr); - - layernorm_outputs[i] = input_tensor; - } - else - { - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - attr.dim_num = VSI_NN_DIM_AUTO; - attr.vtl = use_virtual_tensor; - attr.is_const = FALSE; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - - /* create internal nodes */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM, 0, 0 ); - curr->node->nn_param.tensor_add_mean_stddev_norm.eps = (float)1e-8; - curr->inputs[0] = input_fc_outputs[i]->t; - curr->inputs[1] = recurrent_fc_outputs[i]->t; - curr->outputs[0] = input_tensor->t; - vsi_nn_internal_setup_node(self, curr); - - layernorm_outputs[i] = input_tensor; - } + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final ); + + /* create internal nodes */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->node->nn_param.tensor_add_mean_stddev_norm.eps = (float)1e-8; + curr->inputs[0] = input_fc_outputs[i]->t; + curr->inputs[1] = recurrent_fc_outputs[i]->t; + curr->outputs[0] = input_tensor->t; + vsi_nn_internal_setup_node(self, curr); + + layernorm_outputs[i] = input_tensor; } } /* activations */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_ACTIVATION, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.lstmunit_activation.cell_clip = p->cell_clip; curr->node->nn_param.lstmunit_activation.proj_clip = p->proj_clip; curr->node->nn_param.lstmunit_activation.forget_bias = p->forget_bias; @@ -562,10 +573,9 @@ static vsi_bool op_setup curr->node->nn_param.lstmunit_activation.recurrent_activation = p->recurrent_activation; curr->inputs[LSTMUNIT_ACT_CSTATE_IN] = inputs[LSTMUNIT_INPUT_C_STATE]; - for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) + for ( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) { - if( (p->local->use_layer_norm && !self->graph->ctx->config.support_stream_processor) || - p->local->use_hybrid ) + if( p->local->use_layer_norm || p->local->use_hybrid ) { curr->inputs[LSTMUNIT_ACT_DATA_BI + i] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; } @@ -573,14 +583,7 @@ static vsi_bool op_setup if( p->local->use_layer_norm ) { /* Pass layernorm weights to VSI_NN_OP_LSTMUNIT_ACTIVATION */ - if (self->graph->ctx->config.support_stream_processor) - { - curr->inputs[LSTMUNIT_ACT_LN_WI + i] = NULL; - } - else - { - curr->inputs[LSTMUNIT_ACT_LN_WI + i] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; - } + curr->inputs[LSTMUNIT_ACT_LN_WI + i] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; curr->inputs[LSTMUNIT_ACT_INPUT_FC_I + i] = layernorm_outputs[i]->t; curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + 
i] = NULL; } @@ -616,6 +619,7 @@ static vsi_bool op_setup attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; } output_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( output_tensor, curr, "Create tensor fail.", final ); curr->outputs[LSTMUNIT_ACT_OUTPUT] = output_tensor->t; curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = outputs[LSTMUNIT_OUTPUT_C_STATE]; @@ -637,11 +641,14 @@ static vsi_bool op_setup use_virtual_tensor = inputs[LSTMUNIT_INPUT_BIAS_PROJ]->attr.vtl; input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr, &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, VSI_NN_OP_FCL, FALSE); + CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final ); + zero_bias_tensor = input_tensor->t; if (use_virtual_tensor) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[LSTMUNIT_INPUT_BIAS_PROJ]; curr->outputs[0] = zero_bias_tensor; @@ -656,6 +663,8 @@ static vsi_bool op_setup { input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr, &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, VSI_NN_OP_FCL, FALSE); + CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final ); + zero_bias_tensor = input_tensor->t; } else @@ -664,6 +673,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_FCL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.fcl.axis = 0; curr->node->nn_param.fcl.weights = (uint32_t)inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr.size[1]; @@ -678,12 +688,15 @@ static vsi_bool op_setup /* copy h_state to output */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_OUTPUT]; vsi_nn_internal_setup_node(self, curr); } return TRUE; +final: + return FALSE; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index 846339029..f4005a841 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -35,6 +35,8 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + #define _ARG_NUM (7) #define _INPUT_NUM (2) @@ -49,22 +51,24 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_param_t * param = NULL; - vsi_nn_kernel_node_t n = NULL; - vsi_nn_tensor_t * tmp_inputs[2] = {NULL}; - vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; - vsi_nn_tensor_t * rs_input = NULL; - vsi_nn_tensor_t * rs_output = NULL; - vsi_size_t shape_in[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; - vsi_size_t shape_out[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; - uint32_t i = 0; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t *param = NULL; + vsi_nn_kernel_node_t n = NULL; + vsi_nn_tensor_t * tmp_inputs[2] = {NULL}; + vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; + uint32_t new_rank[3] = {0}; + vsi_bool ret = FALSE; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t transposeA = self->nn_param.matrixmul.transpose[0]; int32_t transposeB = self->nn_param.matrixmul.transpose[1]; int32_t adjointA = self->nn_param.matrixmul.adjoint[0]; int32_t adjointB = 
self->nn_param.matrixmul.adjoint[1]; + uint32_t cross_flg = 0; + uint32_t size_axis_inner_outer[3] = {0}; + uint32_t stride_axis_inner_outer[9] = {0}; + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32( param, "transposeA", transposeA ); @@ -72,46 +76,35 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "adjointA", adjointA ); vsi_nn_kernel_param_add_int32( param, "adjointB", adjointB ); - if (inputs[0]->attr.dim_num == 1 && inputs[1]->attr.dim_num > 1) - { - shape_in[0] = inputs[0]->attr.size[0]; - shape_in[1] = 1; - shape_out[0] = outputs[0]->attr.size[0]; - shape_out[1] = 1; - for(i = 2; i <= outputs[0]->attr.dim_num; i++) - { - shape_out[i] = outputs[0]->attr.size[i - 1]; - } - rs_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape_in, 2); - rs_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape_out, outputs[0]->attr.dim_num + 1); - tmp_inputs[0] = rs_input; - tmp_inputs[1] = inputs[1]; - tmp_outputs[0] = rs_output; - } - else if (inputs[1]->attr.dim_num == 1 && inputs[0]->attr.dim_num > 1) - { - shape_in[0] = 1; - shape_in[1] = inputs[1]->attr.size[0]; - shape_out[0] = 1; - for(i = 1; i <= outputs[0]->attr.dim_num; i++) - { - shape_out[i] = outputs[0]->attr.size[i - 1]; - } - rs_input = vsi_nn_reshape_tensor(self->graph, inputs[1], shape_in, 2); - rs_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape_out, outputs[0]->attr.dim_num + 1); + ret = vsi_nn_kernel_optimize_matrixmul_broadcast_shape( + inputs[0]->attr.size, + inputs[1]->attr.size, + outputs[0]->attr.size, + inputs[0]->attr.dim_num, + inputs[1]->attr.dim_num, + outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], new_rank, + &cross_flg, size_axis_inner_outer, stride_axis_inner_outer); + + if (ret) + { + vsi_nn_kernel_param_add_int32( param, "cross_flg", cross_flg ); + vsi_nn_kernel_param_add_buffer( param, "size_axis_inner_outer", size_axis_inner_outer, 3); + vsi_nn_kernel_param_add_buffer( param, "stride_axis_inner_outer", stride_axis_inner_outer, 9); - tmp_inputs[0] = inputs[0]; - tmp_inputs[1] = rs_input; - tmp_outputs[0] = rs_output; + tmp_inputs[0] = vsi_nn_reshape_tensor(self->graph, inputs[0], shapes[0], new_rank[0]); + tmp_inputs[1] = vsi_nn_reshape_tensor(self->graph, inputs[1], shapes[1], new_rank[1]); + tmp_outputs[0] = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes[2], new_rank[2]); } else { - tmp_inputs[0] = inputs[0]; - tmp_inputs[1] = inputs[1]; - tmp_outputs[0] = outputs[0]; + VSILOGE("illegal inputs shape"); + status = VSI_FAILURE; + goto final; } + n = vsi_nn_kernel_selector( self->graph, "matrixmul", tmp_inputs, 2, tmp_outputs, 1, param ); if ( n != NULL ) { @@ -119,19 +112,15 @@ static vsi_status op_compute status = VSI_SUCCESS; } +final: if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } - if (rs_input != NULL) - { - vsi_nn_ReleaseTensor( &rs_input ); - } - if (rs_output != NULL) - { - vsi_nn_ReleaseTensor( &rs_output ); - } + vsi_safe_release_tensor( tmp_inputs[0] ); + vsi_safe_release_tensor( tmp_inputs[1] ); + vsi_safe_release_tensor( tmp_outputs[0] ); return status; } /* op_compute() */ @@ -282,32 +271,17 @@ static vsi_bool op_setup outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1]; } } - else if (inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) - { - for (i = 2; i < inputs[0]->attr.dim_num; i++) - { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; - } - } - else if (inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) - { - for (i = 2; i < inputs[1]->attr.dim_num; i++) - { - outputs[0]->attr.size[i] = 
inputs[1]->attr.size[i]; - } - } - else if (inputs[0]->attr.size[2] >= inputs[1]->attr.size[2]) - { - for (i = 2; i < inputs[0]->attr.dim_num; i++) - { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; - } - } else { - for (i = 2; i < inputs[1]->attr.dim_num; i++) + uint32_t rank0 = inputs[0]->attr.dim_num; + uint32_t rank1 = inputs[1]->attr.dim_num; + for (i = 2; i < outputs[0]->attr.dim_num; i++) { - outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; + vsi_size_t sz0 = i < rank0 ? inputs[0]->attr.size[i] : 1; + vsi_size_t sz1 = i < rank1 ? inputs[1]->attr.size[i] : 1; + vsi_size_t sz2 = vsi_nn_max(sz0, sz1); + + outputs[0]->attr.size[i] = sz2; } } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c index 57f8cad39..a94df5511 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c @@ -36,6 +36,7 @@ #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_link_list.h" #include "vsi_nn_internal_node.h" +#include "vsi_nn_error.h" typedef struct _max_pool3d_local_data_t { int32_t placeholder; @@ -54,6 +55,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -79,6 +82,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } @@ -89,7 +94,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_max_pool3d_param *p = &(self->nn_param.max_pool3d); vsi_size_t ksize[_cnt_of_array(p->ksize)] = {0}, i = 0; vsi_size_t pad[_cnt_of_array(p->pad)] = {0}; @@ -173,10 +178,14 @@ static vsi_bool op_setup memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); input_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); pool2d_0_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(pool2d_0_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_input_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_input_size, curr, "Create internal buffer failed", final); reshape_input_size[0] = inputs[0]->attr.size[0]; reshape_input_size[1] = inputs[0]->attr.size[1]; reshape_input_size[2] = 1; @@ -189,9 +198,10 @@ static vsi_bool op_setup curr->node->nn_param.reshape2.dim_num = 4; curr->inputs[0] = inputs[0]; curr->outputs[0] = input_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_POOL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.pool.ksize[0] = p->ksize[0]; curr->node->nn_param.pool.ksize[1] = p->ksize[1]; curr->node->nn_param.pool.stride[0] = p->stride[0]; @@ -205,28 +215,33 @@ static vsi_bool op_setup curr->node->nn_param.pool.pad_type = p->pad_type; curr->inputs[0] = input_tensor->t; curr->outputs[0] = pool2d_0_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret &= vsi_nn_internal_setup_node( self, curr ); if 
(p->ksize[2] == 1 && p->stride[2] == 1 && p->pad[4] == 0 && p->pad[5] == 0) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = outputs[0]->attr.size; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = pool2d_0_tensor->t; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret &= vsi_nn_internal_setup_node( self, curr ); } else { memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); reshape_0_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(reshape_0_tensor, "Create internal tensor failed", final); pool2d_1_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(pool2d_1_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_pool_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - reshape_pool_size[0] = -1; + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_pool_size, curr, "Create internal buffer failed", final); + reshape_pool_size[0] = (vsi_size_t)-1; reshape_pool_size[1] = inputs[0]->attr.size[2]; reshape_pool_size[2] = 1; for (i = 3; i < inputs[0]->attr.dim_num; i++) @@ -238,9 +253,10 @@ static vsi_bool op_setup curr->node->nn_param.reshape2.dim_num = 4; curr->inputs[0] = pool2d_0_tensor->t; curr->outputs[0] = reshape_0_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret &= vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_POOL, 1, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.pool.ksize[0] = 1; curr->node->nn_param.pool.ksize[1] = p->ksize[2]; curr->node->nn_param.pool.stride[0] = 1; @@ -254,16 +270,18 @@ static vsi_bool op_setup curr->node->nn_param.pool.pad_type = p->pad_type; curr->inputs[0] = reshape_0_tensor->t; curr->outputs[0] = pool2d_1_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret &= vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = outputs[0]->attr.size; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = pool2d_1_tensor->t; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret &= vsi_nn_internal_setup_node( self, curr ); } +final: return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c index 9df9c1b27..2deed48b7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c @@ -56,20 +56,29 @@ static vsi_status op_compute vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; uint32_t new_rank = 0; vsi_nn_kernel_param_t * param = NULL; - int32_t ksize_x = (int32_t)self->nn_param.pool.ksize[0]; - int32_t ksize_y = (int32_t)self->nn_param.pool.ksize[1]; - int32_t stride_x = (int32_t)self->nn_param.pool.stride[0]; - int32_t stride_y = (int32_t)self->nn_param.pool.stride[1]; - int32_t pad_left = (int32_t)self->nn_param.pool.pad[0]; - int32_t pad_right = (int32_t)self->nn_param.pool.pad[1]; - int32_t pad_top = 
(int32_t)self->nn_param.pool.pad[2]; - int32_t pad_bottom = (int32_t)self->nn_param.pool.pad[3]; + int32_t ksize_x = 0; + int32_t ksize_y = 0; + int32_t stride_x = 0; + int32_t stride_y = 0; + int32_t pad_left = 0; + int32_t pad_right = 0; + int32_t pad_top = 0; + int32_t pad_bottom = 0; if ( NULL == self ) { return VSI_FAILURE; } + ksize_x = (int32_t)self->nn_param.pool.ksize[0]; + ksize_y = (int32_t)self->nn_param.pool.ksize[1]; + stride_x = (int32_t)self->nn_param.pool.stride[0]; + stride_y = (int32_t)self->nn_param.pool.stride[1]; + pad_left = (int32_t)self->nn_param.pool.pad[0]; + pad_right = (int32_t)self->nn_param.pool.pad[1]; + pad_top = (int32_t)self->nn_param.pool.pad[2]; + pad_bottom = (int32_t)self->nn_param.pool.pad[3]; + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_optimize_nchw2xhw_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c b/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c index 29310ad96..7be779db1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c @@ -59,13 +59,15 @@ static vsi_status op_compute vsi_size_t new_rank = 0; vsi_bool ret; vsi_nn_kernel_param_t * param = NULL; - int32_t isfmod = (int32_t)self->nn_param.mod.fmod; + int32_t isfmod = 0; if (NULL == self) { return VSI_FAILURE; } + isfmod = (int32_t)self->nn_param.mod.fmod; + param = vsi_nn_kernel_param_create(); ret = vsi_nn_kernel_optimize_eltwise_shape( @@ -183,6 +185,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + in1_rank = inputs[0]->attr.dim_num; in2_rank = inputs[1]->attr.dim_num; out_rank = vsi_nn_max( in1_rank, in2_rank ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c index 8276c0f7c..39dda244d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c @@ -71,13 +71,14 @@ static void _set_io_index vxSetParameterByIndex(self->n, idx++, (vx_reference)inputs[i]->t); scalar_index = idx; param = vxGetParameterByIndex(self->n, scalar_index); - vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (param != NULL) + + if (param) { + vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); vxReleaseParameter(¶m); param = NULL; - } + if (type != VX_TYPE_SCALAR) { continue; @@ -92,17 +93,18 @@ static void _set_io_index vx_reference ref = 0; vsi_status status; param = vxGetParameterByIndex(self->n, j); - vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); - status = vxQueryScalar((vx_scalar)ref, VX_SCALAR_TYPE, &data_type, sizeof(vx_enum)); - if (status == VX_ERROR_INVALID_REFERENCE) - { - vx_scalar scalar = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, 0); - ref = (vx_reference)scalar; - vxSetParameterByIndex(self->n, idx++, ref); - vxReleaseReference(&ref); - } - if (param != NULL) + + if (param) { + vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); + status = vxQueryScalar((vx_scalar)ref, VX_SCALAR_TYPE, &data_type, sizeof(vx_enum)); + if (status == VX_ERROR_INVALID_REFERENCE) + { + vx_scalar scalar = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, 0); + ref = (vx_reference)scalar; + vxSetParameterByIndex(self->n, idx++, ref); + vxReleaseReference(&ref); + } vxReleaseParameter(¶m); param = NULL; } @@ -165,6 +167,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + 
VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -178,6 +183,9 @@ static vsi_bool op_setup /* * Network Binary Graph node do not need to calculate output shape */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c index 3c8a57d0a..acd1c9eae 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c @@ -85,6 +85,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -96,6 +99,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = 1; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c index 71a5e0786..766392ac4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c @@ -43,6 +43,9 @@ static vsi_status op_compute ) { int i; + + VSI_UNREFERENCED(self); + for( i = 0; i < 10; i ++ ) { if( NULL == outputs[i] ) @@ -65,6 +68,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c index 2c7dba946..111fc3d3c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c @@ -156,6 +156,7 @@ static vsi_status op_compute attr.is_const = FALSE; convert_tensor = vsi_nn_CreateTensor(self->graph, &attr); + CHECK_PTR_FAIL_GOTO( convert_tensor, "Create tensor fail.", final ); self->n = vxTensorCopyNode( self->graph->g, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c index 399d0c6be..146ee332f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c @@ -51,6 +51,8 @@ static vsi_bool _is_same_memory_shape uint32_t dim_num0 = inputs[0]->attr.dim_num; uint32_t dim_num1 = self->nn_param.permute.dim_num; + VSI_UNREFERENCED(outputs); + if (dim_num0 != dim_num1) return FALSE; @@ -102,6 +104,8 @@ static vsi_bool _is_same_quant { vsi_nn_dtype_t *dtype,*_dtype; + VSI_UNREFERENCED(self); + dtype = &inputs[0]->attr.dtype; _dtype = &outputs[0]->attr.dtype; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c index cfdf7c2f1..24b0d6260 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c @@ -34,7 +34,6 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" @@ -136,21 +135,28 @@ static vsi_status op_compute vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; uint32_t new_rank = 0; - vsi_bool ret; + vsi_bool ret = FALSE; vsi_nn_kernel_param_t * param = NULL; - int32_t ksize_x = (int32_t)self->nn_param.pool.ksize[0]; - int32_t ksize_y = (int32_t)self->nn_param.pool.ksize[1]; - int32_t stride_x = 
(int32_t)self->nn_param.pool.stride[0]; - int32_t stride_y = (int32_t)self->nn_param.pool.stride[1]; - int32_t pad_x = (int32_t)self->nn_param.pool.pad[0]; - int32_t pad_y = (int32_t)self->nn_param.pool.pad[2]; + int32_t ksize_x = 0; + int32_t ksize_y = 0; + int32_t stride_x = 0; + int32_t stride_y = 0; + int32_t pad_x = 0; + int32_t pad_y = 0; - if( NULL == self ) + if ( NULL == self ) { return VSI_FAILURE; } - param =vsi_nn_kernel_param_create(); + ksize_x = (int32_t)self->nn_param.pool.ksize[0]; + ksize_y = (int32_t)self->nn_param.pool.ksize[1]; + stride_x = (int32_t)self->nn_param.pool.stride[0]; + stride_y = (int32_t)self->nn_param.pool.stride[1]; + pad_x = (int32_t)self->nn_param.pool.pad[0]; + pad_y = (int32_t)self->nn_param.pool.pad[2]; + + param = vsi_nn_kernel_param_create(); ret = vsi_nn_poolwithargmax_optimize_shape(self, (vsi_ssize_t*)inputs[0]->attr.size, (vsi_ssize_t*)outputs[0]->attr.size, @@ -164,7 +170,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "pad_x", pad_x ); vsi_nn_kernel_param_add_int32( param, "pad_y", pad_y ); - if( ret ) + if ( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], shapes[0], new_rank ); @@ -180,7 +186,7 @@ static vsi_status op_compute vsi_nn_ReleaseTensor( &reshape_tensors[2] ); } - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } @@ -270,10 +276,12 @@ static vsi_bool op_setup self->nn_param.pool.pad[i] = (uint32_t)pad[i]; } - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, inputs, outputs ); - + } + if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + { outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; memcpy( outputs[1]->attr.size, outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c index 18942faf4..9b060f141 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_internal_node.h" @@ -48,6 +48,8 @@ static vsi_bool _is_same_type vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if(vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { return FALSE; @@ -63,6 +65,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -73,6 +77,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -85,6 +92,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -95,7 +104,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret; + vsi_bool ret = FALSE; uint32_t i; uint32_t axis; vsi_nn_tensor_attr_t attr; @@ -112,7 +121,6 @@ static vsi_bool op_setup return FALSE; } - ret = TRUE; /* output */ if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -152,6 +160,7 @@ static vsi_bool op_setup self->nn_param.post_process.local.enable_perm == FALSE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = outputs[0]->attr.size; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[POST_PROCESS_INPUT]; @@ -163,6 +172,7 @@ static vsi_bool op_setup self->nn_param.post_process.local.enable_perm == FALSE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[POST_PROCESS_INPUT]; curr->outputs[0] = outputs[POST_PROCESS_OUTPUT]; @@ -172,6 +182,7 @@ static vsi_bool op_setup self->nn_param.post_process.local.enable_perm == TRUE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.permute.perm = self->nn_param.post_process.perm; curr->node->nn_param.permute.dim_num = self->nn_param.post_process.dim_num; curr->inputs[0] = inputs[POST_PROCESS_INPUT]; @@ -187,8 +198,10 @@ static vsi_bool op_setup attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.permute.perm = self->nn_param.post_process.perm; curr->node->nn_param.permute.dim_num = self->nn_param.post_process.dim_num; curr->inputs[0] = inputs[POST_PROCESS_INPUT]; @@ -197,12 +210,15 @@ static vsi_bool op_setup vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = output_tensor->t; curr->outputs[0] = outputs[POST_PROCESS_OUTPUT]; vsi_nn_internal_setup_node(self, curr); } + ret = TRUE; +final: return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index aa5b46c1b..f977e32d0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -36,6 +36,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_util.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -44,8 +45,27 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = vsi_nn_internal_compute_node( self ); - self->n = vsi_nn_internal_get_node_by_uid(self, 1)->node->n; + vsi_status status = VSI_SUCCESS; + vsi_nn_internal_node_t* interal_node = NULL; + + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + + status = vsi_nn_internal_compute_node( self ); + 
CHECK_STATUS_FAIL_GOTO(status, final ); + + interal_node = vsi_nn_internal_get_node_by_uid(self, 1); + + if (interal_node) + { + self->n = interal_node->node->n; + } + else + { + status = VSI_FAILURE; + } + +final: return status; } /* op_compute() */ @@ -56,6 +76,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -68,6 +91,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -81,9 +106,10 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. */ vsi_nn_internal_node_t* curr = NULL; vsi_nn_pre_process_param * p = NULL; - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_tensor_t* preprocess_tensor = NULL; vsi_nn_preprocess_dest_layout_e layout = VSI_NN_DEST_LAYOUT_NCHW; + vsi_bool enable_rgb88_planar_nhwc = FALSE; p = (vsi_nn_pre_process_param *)&(self->nn_param.pre_process); @@ -122,11 +148,18 @@ static vsi_bool op_setup if (i != self->nn_param.pre_process_rgb.dim_num) { layout = VSI_NN_DEST_LAYOUT_NHWC; + + if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP) + { + enable_rgb88_planar_nhwc = self->graph->ctx->options.enable_rgb88_planar_nhwc; + } } - if (layout == VSI_NN_DEST_LAYOUT_NHWC) + if (layout == VSI_NN_DEST_LAYOUT_NHWC && !enable_rgb88_planar_nhwc) { memcpy( &attr, &outputs[PRE_PROCESS_OUTPUT]->attr, sizeof( attr ) ); + attr.size[0] = p->output_attr.size[1]; attr.size[1] = p->output_attr.size[2]; attr.size[2] = p->output_attr.size[0]; @@ -136,7 +169,8 @@ static vsi_bool op_setup attr.vtl = use_virtual_tensor; attr.is_const = FALSE; - preprocess_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + preprocess_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(preprocess_tensor, "Create internal tensor failed", final); } } @@ -145,6 +179,7 @@ static vsi_bool op_setup case VSI_NN_SOURCE_FORMAT_TENSOR: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_TENSOR, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.pre_process_tensor.perm = p->perm; curr->node->nn_param.pre_process_tensor.dim_num = p->dim_num; @@ -152,12 +187,13 @@ static vsi_bool op_setup curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_GRAY: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.pre_process_gray.mean = p->norm.mean[0]; curr->node->nn_param.pre_process_gray.scale = p->norm.scale; @@ -178,27 +214,33 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_RGB: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; 
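/*
 * The per-channel r/g/b_scale fields introduced in this pre-process hunk
 * replace the single rgb_scale. A scalar sketch of the normalization they
 * parameterize, including the reverse_channel swap performed by the
 * mean[2]/mean[0] and norm2.scale[2]/scale[0] assignments around this point.
 * The (value - mean) * scale formula and all names here are illustrative
 * assumptions, not the kernel's exact arithmetic.
 */
static float demo_normalize_channel_value(float value, float mean, float scale)
{
    return (value - mean) * scale;      /* applied independently per channel */
}

static void demo_resolve_channel_order(int reverse_channel,
                                       const float mean[3], const float scale[3],
                                       float r_out[2], float g_out[2], float b_out[2])
{
    /* reverse_channel makes the R parameters read from index 2 and the B
     * parameters from index 0, mirroring the branch in the diff. */
    const int r_idx = reverse_channel ? 2 : 0;
    const int b_idx = reverse_channel ? 0 : 2;

    r_out[0] = mean[r_idx]; r_out[1] = scale[r_idx];
    g_out[0] = mean[1];     g_out[1] = scale[1];
    b_out[0] = mean[b_idx]; b_out[1] = scale[b_idx];
}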
curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_rgb.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_rgb.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_rgb.b_scale = p->norm2.scale[0]; } else { curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_rgb.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_rgb.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_rgb.b_scale = p->norm2.scale[2]; } - curr->node->nn_param.pre_process_rgb.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_rgb.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_rgb.rect.left = p->rect.left; curr->node->nn_param.pre_process_rgb.rect.top = p->rect.top; @@ -219,27 +261,51 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_YUV420: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV420, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[0]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv420.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv420.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv420.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv420.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_yuv420.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv420.b_scale = p->norm2.scale[0]; + } } else { curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[2]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv420.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv420.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv420.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv420.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_yuv420.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv420.b_scale = p->norm2.scale[2]; + } } - curr->node->nn_param.pre_process_yuv420.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_yuv420.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_yuv420.rect.left = p->rect.left; curr->node->nn_param.pre_process_yuv420.rect.top = p->rect.top; @@ -262,27 +328,51 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_BGRA: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_BGRA, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; 
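/*
 * The repeated vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1 checks in
 * the YUV420 / BGRA / RGB888_PLANAR hunks keep graphs whose version compares
 * below 1.1.83 on the legacy single norm.scale, while newer graphs receive
 * independent per-channel norm2.scale values. A condensed sketch of that
 * selection; the helper name and flattened arguments are hypothetical and
 * only restate the branch structure of the surrounding diff.
 */
static void demo_select_channel_scales(int graph_older_than_1_1_83,
                                       int reverse_channel,
                                       float legacy_scale,
                                       const float scale2[3],
                                       float out_scale[3])      /* r, g, b */
{
    if (graph_older_than_1_1_83)
    {
        /* legacy behaviour: one scale shared by all three channels */
        out_scale[0] = legacy_scale;
        out_scale[1] = legacy_scale;
        out_scale[2] = legacy_scale;
    }
    else
    {
        /* new behaviour: per-channel scales, honouring reverse_channel */
        out_scale[0] = reverse_channel ? scale2[2] : scale2[0];
        out_scale[1] = scale2[1];
        out_scale[2] = reverse_channel ? scale2[0] : scale2[2];
    }
}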
curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[0]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_bgra.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_bgra.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_bgra.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_bgra.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_bgra.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_bgra.b_scale = p->norm2.scale[0]; + } } else { curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[2]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_bgra.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_bgra.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_bgra.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_bgra.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_bgra.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_bgra.b_scale = p->norm2.scale[2]; + } } - curr->node->nn_param.pre_process_bgra.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_bgra.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_bgra.rect.left = p->rect.left; curr->node->nn_param.pre_process_bgra.rect.top = p->rect.top; @@ -303,59 +393,30 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR: case VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP: { - uint32_t i = 0; - uint32_t axis = 2; vsi_bool is_input_sep = p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ? FALSE : TRUE; - vsi_nn_internal_tensor_t * output_tensor_group[3] = {NULL}; - vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL }; - vsi_nn_tensor_attr_t attr; float mean[3] = {0}; - vsi_size_t size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); - for(i = 0; i < p->output_attr.dim_num; i++) - { - attr.size[i] = -1 == p->output_attr.size[i] ? 
-1 : (vsi_size_t)p->output_attr.size[i]; - } - attr.size[axis] = 1; - attr.vtl = TRUE; - attr.is_const = FALSE; - output_tensor_group[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - output_tensor_group[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - output_tensor_group[2] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_32bit[i] = attr.size[i]; - } if (p->reverse_channel) { - int32_t order[3] = {2, 1, 0}; - mean[0] = p->norm.mean[2]; mean[1] = p->norm.mean[1]; mean[2] = p->norm.mean[0]; - - vsi_nn_reorder_tensor( (vsi_nn_tensor_t **)output_tensor_group, order, - 3, (vsi_nn_tensor_t **)tmp_outputs ); } else { mean[0] = p->norm.mean[0]; mean[1] = p->norm.mean[1]; mean[2] = p->norm.mean[2]; - - memmove( tmp_outputs, output_tensor_group, sizeof(vsi_nn_tensor_t*) * 3 ); } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB888_PLANAR, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (is_input_sep) { curr->inputs[0] = inputs[0]; @@ -368,28 +429,7 @@ static vsi_bool op_setup curr->inputs[1] = NULL; curr->inputs[2] = NULL; } - curr->outputs[0] = output_tensor_group[0]->t; - curr->outputs[1] = output_tensor_group[1]->t; - curr->outputs[2] = output_tensor_group[2]->t; - curr->node->nn_param.pre_process_rgb888_planar.r_mean = mean[0]; - curr->node->nn_param.pre_process_rgb888_planar.g_mean = mean[1]; - curr->node->nn_param.pre_process_rgb888_planar.b_mean = mean[2]; - curr->node->nn_param.pre_process_rgb888_planar.scale = p->norm.scale; - curr->node->nn_param.pre_process_rgb888_planar.rect.left = p->rect.left; - curr->node->nn_param.pre_process_rgb888_planar.rect.top = p->rect.top; - curr->node->nn_param.pre_process_rgb888_planar.rect.width = p->rect.width; - curr->node->nn_param.pre_process_rgb888_planar.rect.height = p->rect.height; - curr->node->nn_param.pre_process_rgb888_planar.output_attr.size = size_32bit; - curr->node->nn_param.pre_process_rgb888_planar.output_attr.dim_num = p->output_attr.dim_num; - vsi_nn_internal_setup_node(self, curr); - - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 3, 1 ); - - curr->node->nn_param.concat.axis = axis; - curr->inputs[0] = tmp_outputs[0]->t; - curr->inputs[1] = tmp_outputs[1]->t; - curr->inputs[2] = tmp_outputs[2]->t; - if (layout == VSI_NN_DEST_LAYOUT_NHWC) + if (layout == VSI_NN_DEST_LAYOUT_NHWC && !enable_rgb88_planar_nhwc) { curr->outputs[0] = preprocess_tensor->t; } @@ -398,27 +438,93 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + if (p->reverse_channel) + { + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm2.scale[0]; + } + } + else + { + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_rgb888_planar.r_scale = 
p->norm2.scale[0]; + curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm2.scale[2]; + } + } + + curr->node->nn_param.pre_process_rgb888_planar.r_mean = mean[0]; + curr->node->nn_param.pre_process_rgb888_planar.g_mean = mean[1]; + curr->node->nn_param.pre_process_rgb888_planar.b_mean = mean[2]; + curr->node->nn_param.pre_process_rgb888_planar.rect.left = p->rect.left; + curr->node->nn_param.pre_process_rgb888_planar.rect.top = p->rect.top; + curr->node->nn_param.pre_process_rgb888_planar.rect.width = p->rect.width; + curr->node->nn_param.pre_process_rgb888_planar.rect.height = p->rect.height; + curr->node->nn_param.pre_process_rgb888_planar.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_rgb888_planar.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_rgb888_planar.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_rgb888_planar.enable_rgb88_planar_nhwc = enable_rgb88_planar_nhwc; + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_YUV444: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV444, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[0]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv444.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv444.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv444.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv444.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_yuv444.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv444.b_scale = p->norm2.scale[0]; + } } else { curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[2]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv444.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv444.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv444.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv444.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_yuv444.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv444.b_scale = p->norm2.scale[2]; + } } - curr->node->nn_param.pre_process_yuv444.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_yuv444.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_yuv444.rect.left = p->rect.left; curr->node->nn_param.pre_process_yuv444.rect.top = p->rect.top; @@ -441,25 +547,50 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_NV21: case VSI_NN_SOURCE_FORMAT_IMAGE_NV12: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_NV12, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[2]; 
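/* A minimal sketch (assumption, not code from this patch) of the backward-compatibility
 * rule each image-format branch below applies before choosing its scales: graphs built
 * against ovxlib versions older than 1.1.83 only carry the legacy single norm.scale, so
 * it is broadcast to all three channels; newer graphs provide per-channel norm2.scale[]
 * values (mirrored when reverse_channel is set, as above).
 *
 *   float r_scale, g_scale, b_scale;
 *   if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
 *   {
 *       r_scale = g_scale = b_scale = p->norm.scale;   // legacy: one scale for all channels
 *   }
 *   else
 *   {
 *       r_scale = p->norm2.scale[0];
 *       g_scale = p->norm2.scale[1];
 *       b_scale = p->norm2.scale[2];
 *   }
 */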
curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[0]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_nv12.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_nv12.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_nv12.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_nv12.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_nv12.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_nv12.b_scale = p->norm2.scale[0]; + } } else { curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[2]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_nv12.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_nv12.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_nv12.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_nv12.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_nv12.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_nv12.b_scale = p->norm2.scale[2]; + } } if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) @@ -471,7 +602,6 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV21; } - curr->node->nn_param.pre_process_nv12.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_nv12.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_nv12.rect.left = p->rect.left; curr->node->nn_param.pre_process_nv12.rect.top = p->rect.top; @@ -493,25 +623,50 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422: case VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV422, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[0]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv422.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv422.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv422.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv422.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_yuv422.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv422.b_scale = p->norm2.scale[0]; + } } else { curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[2]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv422.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv422.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv422.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv422.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_yuv422.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv422.b_scale = 
p->norm2.scale[2]; + } } if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422) @@ -523,7 +678,6 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_yuv422.yuv422_type = 1; } - curr->node->nn_param.pre_process_yuv422.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_yuv422.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_yuv422.rect.left = p->rect.left; curr->node->nn_param.pre_process_yuv422.rect.top = p->rect.top; @@ -544,13 +698,13 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; default: { VSILOGE( "Not support this type!(PRE_PROCESS)\n"); - ret = FALSE; + goto final; } break; } @@ -564,22 +718,24 @@ static vsi_bool op_setup p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY || - p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || - p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP + (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR && !enable_rgb88_planar_nhwc) || + (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP && !enable_rgb88_planar_nhwc) ) { if (layout == VSI_NN_DEST_LAYOUT_NHWC) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.permute.perm = p->perm; curr->node->nn_param.permute.dim_num = p->dim_num; curr->inputs[0] = preprocess_tensor->t; curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } } +final: return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c index c1be23962..2c5e5b77d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c @@ -60,7 +60,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_bgra.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_bgra.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_bgra.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_bgra.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_bgra.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_bgra.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_bgra.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_bgra.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_bgra.local.enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_bgra.local.enable_copy ); @@ -111,6 +113,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_pre_process_bgra_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_bgra_param *)&(self->nn_param.pre_process_bgra); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c index d264ee7fa..6bc1f796b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c @@ -115,6 +115,9 @@ static vsi_bool op_setup { vsi_nn_pre_process_gray_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_gray_param *)&(self->nn_param.pre_process_gray); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index 09eb682ff..7fa635a5b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -56,7 +56,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_nv12.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_nv12.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_nv12.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_nv12.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_nv12.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_nv12.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_nv12.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_nv12.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_nv12.local->enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_nv12.local->enable_copy ); @@ -113,6 +115,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_pre_process_nv12_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_nv12_param *)&(self->nn_param.pre_process_nv12); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c index 6d19e4a47..80acd7974 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -59,7 +59,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_rgb.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_rgb.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_rgb.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_rgb.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy ); @@ -116,6 +118,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_rgb_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_rgb_param *)&(self->nn_param.pre_process_rgb); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c index 13a636d78..3c27ecc19 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c @@ -47,7 +47,7 @@ typedef struct _pre_process_rgb888_planar_local_data_t { Declare number of input and output. 
*/ #define _INPUT_NUM (3) -#define _OUTPUT_NUM (3) +#define _OUTPUT_NUM (1) static vsi_status op_compute ( @@ -59,21 +59,35 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; + vsi_nn_pre_process_rgb888_planar_param * p = NULL; + + p = (vsi_nn_pre_process_rgb888_planar_param *)&(self->nn_param.pre_process_rgb888_planar); param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb888_planar.local->scale_x ); - vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb888_planar.local->scale_y ); - vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb888_planar.rect.left ); - vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb888_planar.rect.top ); - vsi_nn_kernel_param_add_int32( param, "width", self->nn_param.pre_process_rgb888_planar.rect.width ); - vsi_nn_kernel_param_add_int32( param, "height", self->nn_param.pre_process_rgb888_planar.rect.height ); - vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb888_planar.r_mean ); - vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb888_planar.g_mean ); - vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb888_planar.b_mean ); - vsi_nn_kernel_param_add_float32( param, "scale", self->nn_param.pre_process_rgb888_planar.scale ); - vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb888_planar.local->enable_copy ); - - n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar", inputs, 3, outputs, 3, param ); + vsi_nn_kernel_param_add_int32( param, "scale_x", p->local->scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", p->local->scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", p->rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", p->rect.top ); + vsi_nn_kernel_param_add_int32( param, "width", p->rect.width ); + vsi_nn_kernel_param_add_int32( param, "height", p->rect.height ); + vsi_nn_kernel_param_add_float32( param, "r_mean", p->r_mean ); + vsi_nn_kernel_param_add_float32( param, "g_mean", p->g_mean ); + vsi_nn_kernel_param_add_float32( param, "b_mean", p->b_mean ); + vsi_nn_kernel_param_add_float32( param, "r_scale", p->r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", p->g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", p->b_scale ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", p->local->enable_copy ); + vsi_nn_kernel_param_add_int32( param, "reverse", p->reverse_channel ); + + if (p->enable_rgb88_planar_nhwc) + { + n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar_nhwc", inputs, 3, outputs, 1, param ); + } + else + { + n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar", inputs, 3, outputs, 1, param ); + } + if ( n != NULL ) { self->n = (vx_node)n; @@ -97,11 +111,11 @@ static vsi_bool op_check { if (inputs[1] == NULL) { - BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 1, 3) - IO_TYPE(D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8, D_F16, D_F16, D_F16) + BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 1, 1) + IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR) if 
(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, 1, @@ -115,11 +129,11 @@ static vsi_bool op_check } else { - BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 3, 3) - IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8, D_U8, D_U8, D_F16, D_F16, D_F16) + BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 3, 1) + IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR) if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, self->input.num, @@ -144,6 +158,9 @@ static vsi_bool op_setup { vsi_nn_pre_process_rgb888_planar_param * p = NULL; uint32_t i = 0, j = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_rgb888_planar_param *)&(self->nn_param.pre_process_rgb888_planar); if (p->rect.width == 0 || p->rect.height == 0) @@ -163,29 +180,34 @@ static vsi_bool op_setup } } - for (j = 0; j < 3; j++) + + if ( VSI_NN_DIM_AUTO == outputs[j]->attr.dim_num ) { - if ( VSI_NN_DIM_AUTO == outputs[j]->attr.dim_num ) + if (p->output_attr.dim_num > 0) { - if (p->output_attr.dim_num > 0) - { - outputs[j]->attr.dim_num = p->output_attr.dim_num; - for (i = 0; i < p->output_attr.dim_num; i++) - { - outputs[j]->attr.dim_num = p->output_attr.dim_num; - outputs[j]->attr.size[i] = p->output_attr.size[i]; - } - } - else + outputs[j]->attr.dim_num = p->output_attr.dim_num; + for (i = 0; i < p->output_attr.dim_num; i++) { - VSILOGE("output dim num cannot be zero!(PRE_PROCESS_RGB888_PLANAR)\n"); - return FALSE; + outputs[j]->attr.size[i] = p->output_attr.size[i]; } } + else + { + VSILOGE("output dim num cannot be zero!(PRE_PROCESS_RGB888_PLANAR)\n"); + return FALSE; + } } - p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]); - p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]); + if (p->enable_rgb88_planar_nhwc) + { + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[1]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[2]); + } + else + { + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]); + } p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c index b4220a716..9886be018 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_internal_node.h" @@ -48,6 +48,8 @@ static vsi_bool _is_same_type vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if(vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { return FALSE; @@ -63,6 +65,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -73,6 +77,9 @@ static vsi_bool op_check 
vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -85,6 +92,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -95,7 +104,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret; + vsi_bool ret = FALSE; uint32_t i; uint32_t axis; vsi_nn_tensor_attr_t attr; @@ -112,7 +121,6 @@ static vsi_bool op_setup return FALSE; } - ret = TRUE; /* output */ if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -125,7 +133,7 @@ static vsi_bool op_setup VSILOGE( "Error permute axis '%u', the dim is '%u' ", axis, inputs[0]->attr.dim_num ); ret = FALSE; - break; + goto final; } outputs[0]->attr.size[i] = inputs[0]->attr.size[axis]; } @@ -152,32 +160,35 @@ static vsi_bool op_setup self->nn_param.pre_process_tensor.local.enable_perm == FALSE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = outputs[0]->attr.size; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (self->nn_param.pre_process_tensor.local.enable_data_conv == TRUE && self->nn_param.pre_process_tensor.local.enable_perm == FALSE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (self->nn_param.pre_process_tensor.local.enable_data_conv == FALSE && self->nn_param.pre_process_tensor.local.enable_perm == TRUE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.permute.perm = self->nn_param.pre_process_tensor.perm; curr->node->nn_param.permute.dim_num = self->nn_param.pre_process_tensor.dim_num; curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else { @@ -187,22 +198,26 @@ static vsi_bool op_setup attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; curr->outputs[0] = output_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.permute.perm = self->nn_param.pre_process_tensor.perm; curr->node->nn_param.permute.dim_num = self->nn_param.pre_process_tensor.dim_num; curr->inputs[0] = output_tensor->t; curr->outputs[0] = 
outputs[PRE_PROCESS_TENSOR_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); + ret &= vsi_nn_internal_setup_node(self, curr); } +final: return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c index bcac93c3c..37696ff6c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c @@ -56,7 +56,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv420.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv420.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv420.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv420.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_yuv420.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_yuv420.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_yuv420.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv420.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv420.local.enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv420.local.enable_copy ); @@ -113,6 +115,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_yuv420_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_yuv420_param *)&(self->nn_param.pre_process_yuv420); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c index b9c4daf33..3922de4c2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c @@ -65,7 +65,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv422.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv422.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv422.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv422.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_yuv422.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_yuv422.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_yuv422.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv422.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv422.local->enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv422.local->enable_copy ); @@ -123,6 +125,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_pre_process_yuv422_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_yuv422_param *)&(self->nn_param.pre_process_yuv422); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c index 6a350d16e..baa5cc440 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c @@ -56,7 +56,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv444.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv444.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv444.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv444.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_yuv444.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_yuv444.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_yuv444.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv444.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv444.local->enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv444.local->enable_copy ); @@ -113,6 +115,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_yuv444_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_yuv444_param *)&(self->nn_param.pre_process_yuv444); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c index b66a5cf01..2bdc1362f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c @@ -213,6 +213,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -277,6 +280,8 @@ static vsi_status op_optimize uint32_t dim; vx_tensor rois_tmp, score_tmp; + VSI_UNREFERENCED(inputs); + rois_tmp = NULL, score_tmp = NULL; if( direction == VSI_NN_OPTIMIZE_BACKWARD ) { @@ -326,16 +331,20 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - vx_tensor rois = self->nn_param.proposal.local.rois; - vx_tensor score = self->nn_param.proposal.local.score; - if( NULL != self && NULL != self->n ) + vx_tensor rois = NULL; + vx_tensor score = NULL; + + if ( NULL != self && NULL != self->n ) { - if(rois) + rois = self->nn_param.proposal.local.rois; + score = self->nn_param.proposal.local.score; + + if (rois) { vxReleaseTensor(&rois); rois = NULL; } - if(score) + if (score) { vxReleaseTensor(&score); score = NULL; @@ -343,6 +352,11 @@ static vsi_status op_deinit vxReleaseNode( &self->n ); self->n = NULL; } + else + { + return VSI_FAILURE; + } + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c index 4ea879fbf..c203fdd6a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c @@ -55,6 +55,9 @@ static 
vsi_status op_compute VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); */ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + if( NULL != self->n ) { status = VSI_SUCCESS; @@ -69,6 +72,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -80,6 +86,10 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + /* TODO: Add code to comput outputs' shape. */ return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index a7a549448..dcbb75b04 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -36,6 +36,7 @@ #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "vsi_nn_error.h" #define _ARG_NUM (6) #define _INPUT_NUM (1) @@ -209,6 +210,8 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(outputs); + if ( self->nn_param.reduce.local2->use_internal_node ) { status = vsi_nn_internal_compute_node( self ); @@ -219,7 +222,7 @@ static vsi_status op_compute vsi_nn_tensor_t *axis_tensor = NULL; vsi_nn_tensor_t *axis_tensor2 = NULL; vsi_nn_tensor_attr_t attr, attr2; - vx_int32 resolved_dim[4] = {-1, -1, -1, -1}; + vx_int32 resolved_dim[VSI_NN_MAX_DIM_NUM] = {-1}; vx_int32 resolved_dim_count = 0; uint32_t i = 0; vsi_size_t re_sizes[VSI_NN_MAX_DIM_NUM] = {1}; @@ -230,6 +233,9 @@ static vsi_status op_compute vsi_nn_tensor_t *reshaped_output1 = self->nn_param.reduce.local2->reshaped_output1; char tensor_name[128]; + CHECK_PTR_FAIL_GOTO( reshaped_input1, "check tensor pointer.", final ); + CHECK_PTR_FAIL_GOTO( reshaped_output1, "check tensor pointer.", final ); + memset(tensor_name, 0, sizeof(tensor_name)); snprintf(tensor_name, sizeof(tensor_name), @@ -240,11 +246,20 @@ static vsi_status op_compute { VSILOGW("Set uid %u reduce reshaped output name fail", self->uid); - return VSI_FAILURE; + + status = VSI_FAILURE; + goto final; } resolved_dim_count = self->nn_param.reduce.local2->axes_num; + if (resolved_dim_count > VSI_NN_MAX_DIM_NUM) + { + VSILOGE("resolved_dim_count greater than VSI_NN_MAX_DIM_NUM"); + + status = VSI_FAILURE; + goto final; + } for (i = 0; i < (uint32_t)resolved_dim_count; i++) { @@ -313,7 +328,7 @@ static vsi_status op_compute input_t, output_t); } - else if (3 == resolved_dim[resolved_dim_count - 1] && resolved_dim_count < 3) + else if (resolved_dim_count > 0 && 3 == resolved_dim[resolved_dim_count - 1] && resolved_dim_count < 3) { if (1 == resolved_dim_count) { @@ -349,6 +364,7 @@ static vsi_status op_compute attr2.size[resolved_dim[0]] = 1; attr2.vtl = FALSE; mean_tmp_tensor = vsi_nn_CreateTensor(self->graph, &attr2); + CHECK_PTR_FAIL_GOTO( mean_tmp_tensor, "Create tensor fail.", final ); self->nn_param.reduce.local2->reshaped_tmp = mean_tmp_tensor; re_sizes[resolved_dim[0]] = 1; memset(&attr, 0, sizeof(attr)); @@ -433,6 +449,8 @@ static vsi_status op_compute attr2.size[resolved_dim[1]] = 1; attr2.vtl = FALSE; mean_tmp_tensor = vsi_nn_CreateTensor(self->graph, &attr2); + CHECK_PTR_FAIL_GOTO( mean_tmp_tensor, "Create tensor fail.", final ); + self->nn_param.reduce.local2->reshaped_tmp = mean_tmp_tensor; re_sizes[resolved_dim[0]] = 1; re_sizes[resolved_dim[1]] = 1; @@ -446,11 +464,8 @@ 
static vsi_status op_compute self->graph, (uint8_t *)&resolved_dim[0], &attr); - if( NULL == axis_tensor ) - { - VSILOGE("Create axis_tensor fail.(reduce)"); - return VSI_FAILURE; - } + CHECK_PTR_FAIL_GOTO( axis_tensor, "Create tensor fail.", final ); + self->nn_param.reduce.local.axis_tensor = axis_tensor; status = op_comput_reduce_mean(self, axis_tensor, @@ -512,6 +527,7 @@ static vsi_status op_compute } } +final: return status; } /* op_compute() */ @@ -523,6 +539,9 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + if ( self->nn_param.reduce.local2->use_internal_node ) { return vsi_nn_internal_optimize_node(self, direction ); @@ -540,6 +559,10 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -711,7 +734,7 @@ static vsi_bool op_set_reduce_axis( for (i = 0; i < self->nn_param.reduce.axis_num; i++) { vx_int32 current_axis = self->nn_param.reduce.axis[i] < 0 ? \ - inputs[0]->attr.dim_num + self->nn_param.reduce.axis[i] : self->nn_param.reduce.axis[i]; + (int32_t)inputs[0]->attr.dim_num + self->nn_param.reduce.axis[i] : self->nn_param.reduce.axis[i]; if (current_axis < 0 || current_axis >= (vx_int32)inputs[0]->attr.dim_num) { @@ -822,16 +845,20 @@ static vsi_bool op_set_sp_reduce_internal int32_t axes_num = self->nn_param.reduce.local2->axes_num; int32_t i = 0, j = 0, index = 0; vsi_size_t reduce_size = 1; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp( self ); memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, inputs[0]->attr.dim_num * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create buffer failed", final); for ( i = 0; i < axes_num; i++) { @@ -862,11 +889,14 @@ static vsi_bool op_set_sp_reduce_internal vsi_nn_internal_setup_node(self, tmp_inode); new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes, outputs[0]->attr.dim_num); + CHECK_PTR_FAIL_GOTO(new_output, "Create tensor failed", final); + self->nn_param.reduce.local2->reshaped_output = new_output; tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 ); - + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); new_axis = (int32_t *)vsi_nn_internal_new_node_param(tmp_inode, axes_num * sizeof(int32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(new_axis, tmp_inode, "Create buffer failed", final); for (i = 0; i < axes_num; i++) { new_axis[i] = i; @@ -885,11 +915,10 @@ static vsi_bool op_set_sp_reduce_internal tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f / (float)reduce_size; } - vsi_nn_internal_setup_node(self, tmp_inode); + ret = vsi_nn_internal_setup_node(self, tmp_inode); - self->nn_param.reduce.local2->reshaped_output = new_output; - - return TRUE; +final: + return ret; } static vsi_bool op_set_reduce_internal @@ -912,6 +941,8 @@ static vsi_bool op_set_reduce_internal vx_int32 resolved_dim_count = 0; int32_t * axes = self->nn_param.reduce.local2->axes; vx_bool 
is_use_float = vx_false_e; + vsi_bool ret = FALSE; + resolved_dim_count = self->nn_param.reduce.local2->axes_num; if ((VSI_NN_OP_REDUCESUM_INTERNAL == type_name) || (VSI_NN_OP_REDUCEPROD_INTERNAL == type_name)) @@ -975,6 +1006,7 @@ static vsi_bool op_set_reduce_internal } curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, axes, 1, self->nn_param.reduce.keep_dim); if (self->nn_param.reduce.local2->reshaped_input) @@ -1001,9 +1033,11 @@ static vsi_bool op_set_reduce_internal attr.vtl = use_virtual_tensor; attr.is_const = FALSE; tmp_output_tensor[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_output_tensor[0], "Create internal tensor failed", final); re_sizes[axes[0]] = 1; curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, &(axes[0]), 1, vx_true_e); curr->inputs[0] = inputs[POST_PROCESS_INPUT]; @@ -1034,8 +1068,11 @@ static vsi_bool op_set_reduce_internal re_sizes[axes[1]] = 1; new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num); } + CHECK_PTR_FAIL_GOTO(new_output, "Reshape tensor failed", final); + self->nn_param.reduce.local2->reshaped_output = new_output; curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, &(axes[1]), 1, vx_true_e); if (self->nn_param.reduce.local2->reshaped_input) @@ -1047,7 +1084,6 @@ static vsi_bool op_set_reduce_internal curr->inputs[0] = tmp_output_tensor[0]->t; } curr->outputs[0] = new_output; - self->nn_param.reduce.local2->reshaped_output = new_output; vsi_nn_internal_setup_node(self, curr); } else if (3 == resolved_dim_count) @@ -1056,12 +1092,15 @@ static vsi_bool op_set_reduce_internal attr.vtl = use_virtual_tensor; attr.is_const = FALSE; tmp_output_tensor[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_output_tensor[0], "Create internal tensor failed", final); attr.size[axes[1]] = 1; tmp_output_tensor[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_output_tensor[1], "Create internal tensor failed", final); re_sizes[axes[0]] = 1; re_sizes[axes[1]] = 1; curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, &(axes[0]), 1, vx_true_e); curr->inputs[0] = inputs[POST_PROCESS_INPUT]; @@ -1069,6 +1108,7 @@ static vsi_bool op_set_reduce_internal vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, &(axes[1]), 1, vx_true_e); curr->inputs[0] = tmp_output_tensor[0]->t; @@ -1100,6 +1140,7 @@ static vsi_bool op_set_reduce_internal } curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, &(axes[2]), 1, vx_true_e); if (self->nn_param.reduce.local2->reshaped_input) @@ -1119,7 +1160,10 @@ static vsi_bool op_set_reduce_internal VSILOGE("error: resolved_dim_count is %d\n", resolved_dim_count); return FALSE; } - return TRUE; + + 
ret = TRUE; +final: + return ret; } static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c index 4f5022836..74132f149 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c @@ -57,11 +57,13 @@ static vsi_status op_compute int32_t axis_num = self->nn_param.reduce_mean_internal.axis_num; float scale = self->nn_param.reduce_mean_internal.scale; vsi_enum type = self->nn_param.reduce_mean_internal.type; + int32_t *axis = self->nn_param.reduce_mean_internal.axis; vsi_nn_kernel_param_t * param = NULL; param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32( param, "axis_num", axis_num ); vsi_nn_kernel_param_add_float32( param, "scale", scale ); + vsi_nn_kernel_param_add_str( param, "axis", (const char*)axis ); if (type == VSI_NN_REDUCE_MAX) { @@ -95,6 +97,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c index dd41b6a0e..08e5b9401 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c @@ -91,6 +91,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -102,6 +105,9 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /* TODO: Add code to comput outputs' shape. */ return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c index 062922637..9efd8fca5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c @@ -159,6 +159,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + in1_rank = inputs[0]->attr.dim_num; in2_rank = inputs[1]->attr.dim_num; out_rank = vsi_nn_max( in1_rank, in2_rank ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c index 8c40d429a..6ec9d19af 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_util.h" @@ -46,6 +46,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -56,6 +58,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -84,7 +89,9 @@ static vsi_bool op_setup float max_value = 0; float threshold = 0; uint32_t max_raw = 0; - if( NULL == self ) + vsi_bool ret = FALSE; + + if ( NULL == self ) { return FALSE; } @@ -101,30 +108,35 @@ static vsi_bool op_setup if (alpha == 0 && max_raw == VSI_NN_FLOAT32_INF && threshold == 0) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } else if (alpha == 1.0f && max_value == 1.0f && threshold == -1.0f) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU1, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } else if (alpha == 0 && max_value == 6.0f && threshold == 0) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } else if (alpha == 0.1 && max_value == VSI_NN_FLOAT32_INF && threshold == 0) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LEAKY_RELU, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } else { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU_KERAS_INTERNAL, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; curr->node->nn_param.relu_keras_internal.max_value = max_value; @@ -132,9 +144,10 @@ static vsi_bool op_setup curr->node->nn_param.relu_keras_internal.threshold = threshold; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); - return TRUE; +final: + return ret; } #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c index 2a77c5c99..96d760e39 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c @@ -158,8 +158,32 @@ static vsi_status op_compute param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32( param, "axis", axis ); - n = vsi_nn_kernel_selector( self->graph, "repeat", - tmp_inputs, _INPUT_NUM, tmp_output, _OUTPUT_NUM, param ); + + if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + vsi_nn_tensor_t* temp_tensors = NULL; + vsi_nn_tensor_attr_t attr; + VSILOGW("repeat is no_range_change operation! 
\ + Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); + + memcpy( &attr, &tmp_output[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &tmp_inputs[0]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + + vsi_nn_kernel_selector( self->graph, "repeat", + tmp_inputs, _INPUT_NUM, &temp_tensors, _OUTPUT_NUM, param ); + + n = vxTensorCopyNode( self->graph->g, temp_tensors->t, tmp_output[0]->t); + vsi_safe_release_tensor(temp_tensors); + } + else + { + n = vsi_nn_kernel_selector( self->graph, "repeat", + tmp_inputs, _INPUT_NUM, tmp_output, _OUTPUT_NUM, param ); + } + if ( n != NULL ) { self->n = (vx_node)n; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index e1cfdaa69..523eeb46a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -124,7 +124,8 @@ static vsi_bool op_setup uint32_t i = 0; for (i = 0; i < self->nn_param.reshape.dim_num; i++) { - shape[i] = -1 == self->nn_param.reshape.size[i] ? -1 : (vsi_size_t)self->nn_param.reshape.size[i]; + shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \ + (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i]; } ret = vsi_nn_CalcReshapeTensor(inputs[0], outputs[0], diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index 002b39be5..1a719af73 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -43,6 +43,7 @@ #include "vsi_nn_log.h" #include "vsi_nn_internal_node.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -83,7 +84,7 @@ static vsi_status op_compute } else { - char kernel_name[128]; + char kernel_name[128] = {0}; vsi_nn_kernel_param_t * param = NULL; int32_t align_corners = self->nn_param.resize.align_corners; int32_t half_pixel_centers = self->nn_param.resize.half_pixel_centers; @@ -156,6 +157,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -171,6 +175,7 @@ static vsi_bool op_setup float factor = self->nn_param.resize.factor; vsi_enum layout = self->nn_param.resize.layout; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -220,13 +225,14 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.resize_internal.align_corners = self->nn_param.resize.align_corners; curr->node->nn_param.resize_internal.factor = self->nn_param.resize.factor; curr->node->nn_param.resize_internal.half_pixel_centers = self->nn_param.resize.half_pixel_centers; curr->node->nn_param.resize_internal.layout = self->nn_param.resize.layout; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num)) { @@ -234,12 +240,18 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); + } + else + { + ret = TRUE; } - return TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c index c05ec675a..d1b499ec7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c @@ -34,6 +34,7 @@ #include "vsi_nn_tensor.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" /* Declare number of input and output. 
@@ -71,6 +72,9 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + status = vsi_nn_internal_compute_node( self ); return status; @@ -102,6 +106,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -114,6 +121,7 @@ static vsi_bool op_setup { float factor = self->nn_param.resize_1d.factor; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -135,36 +143,40 @@ static vsi_bool op_setup { vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize_1d.type) { vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_1D_BILINEAR_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.resize_1d_bilinear_internal.align_corners = self->nn_param.resize_1d.align_corners; curr->node->nn_param.resize_1d_bilinear_internal.factor = self->nn_param.resize_1d.factor; curr->node->nn_param.resize_1d_bilinear_internal.half_pixel_centers = \ self->nn_param.resize_1d.half_pixel_centers; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize_1d.type) { vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_1D_NEAREST_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.resize_1d_nearest_internal.align_corners = self->nn_param.resize_1d.align_corners; curr->node->nn_param.resize_1d_nearest_internal.factor = self->nn_param.resize_1d.factor; curr->node->nn_param.resize_1d_nearest_internal.half_pixel_centers = \ self->nn_param.resize_1d.half_pixel_centers; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } - return TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_init @@ -172,6 +184,8 @@ static vsi_status op_init vsi_nn_node_t* self ) { + VSI_UNREFERENCED(self); + return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c index 66ea066ed..5b37e89a8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c @@ -135,7 +135,7 @@ static vsi_status op_init vsi_nn_node_t* self ) { - + VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c index edddc1a27..b202f8ca3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c @@ -134,6 +134,7 @@ static vsi_status op_init vsi_nn_node_t* self ) { + 
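/* Review note (annotation, not part of the patch): VSI_UNREFERENCED marks a
 * deliberately unused parameter so builds that warn on unused parameters stay
 * clean; op_init/op_check callbacks keep the common signature even when they
 * do nothing with their arguments. The macro is defined in the project
 * headers; a typical definition would be something like
 *
 *     #define VSI_UNREFERENCED(param)  ((void)(param))
 *
 * which is only an assumption about its form, not a quote of the header. */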
VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_3d.c new file mode 100644 index 000000000..989bb1b70 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_3d.c @@ -0,0 +1,334 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util.h" + +typedef struct _resize_3d_local_data_t { + int32_t placeholder; +} resize_3d_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t * reshape_inputs[1] = {NULL}; + vsi_nn_tensor_t * reshape_outputs[1] = {NULL}; + + if ( self->nn_param.resize_3d.lcl_data->use_internal_node ) + { + status = vsi_nn_internal_compute_node( self ); + } + else + { + char kernel_name[128]; + vsi_nn_kernel_param_t * param = NULL; + int32_t align_corners = self->nn_param.resize_3d.align_corners; + int32_t half_pixel_centers = self->nn_param.resize_3d.half_pixel_centers; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t new_rank = 4; + uint32_t i = 0; + + if (inputs[0]->attr.dim_num > 3) + { + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1]; + shapes[0][2] = inputs[0]->attr.size[2]; + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + shapes[1][2] = outputs[0]->attr.size[2]; + shapes[0][3] = 1; + shapes[1][3] = 1; + + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + shapes[0][3] = shapes[0][3] * inputs[0]->attr.size[i]; + } + shapes[1][3] = shapes[0][3]; + + reshape_inputs[0] = vsi_nn_reshape_tensor(self->graph, inputs[0], shapes[0], new_rank); + reshape_outputs[0] = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes[1], new_rank); + + if (reshape_inputs[0] == NULL || reshape_outputs[0] == NULL) + { + VSILOGE("reshape tensor failed"); + status = VSI_FAILURE; + goto final; + } + } + else + { + reshape_inputs[0] = inputs[0]; + reshape_outputs[0] = outputs[0]; + } + + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); + vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); + vsi_nn_kernel_param_add_int32( param, "type", self->nn_param.resize_3d.type ); + + switch (self->nn_param.resize_3d.type) + { + case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR: + snprintf(kernel_name, sizeof(kernel_name), + "resize_3d_nearest"); + break; + case VSI_NN_INTERPOLATION_BILINEAR: + snprintf(kernel_name, sizeof(kernel_name), + "resize_3d_bilinear"); + break; + default: + break; + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, &reshape_inputs[0], 1, &reshape_outputs[0], 1, param ); + + if (self->n) { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release(¶m); + } + +final: + vsi_safe_release_tensor( reshape_inputs[0] ); + vsi_safe_release_tensor( reshape_outputs[0] ); + + return status; +} /* op_compute() */ + +static vsi_bool _is_same_shape + ( + vsi_nn_tensor_t * inputs, + vsi_size_t *sizes, + uint32_t dims + ) +{ + uint32_t i = 0; + + if (inputs->attr.dim_num != dims) + return FALSE; + + for (i = 0; i < dims; i++) + { + if (sizes[i] != inputs->attr.size[i]) + return FALSE; + } + + return TRUE; +} + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + if ( self->nn_param.resize_3d.lcl_data->use_internal_node ) + { + return vsi_nn_internal_optimize_node(self, direction ); + } + else + { + int32_t half_pixel_centers = self->nn_param.resize_3d.half_pixel_centers; + vsi_size_t * input_size = inputs[0]->attr.size; + vsi_size_t * output_size = outputs[0]->attr.size; + + if ( (output_size[0] % input_size[0] == 0) && (output_size[1] % input_size[1] == 0) && + half_pixel_centers == TRUE && self->nn_param.resize_3d.type == 
VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR ) + { + self->nn_param.resize_3d.half_pixel_centers = FALSE; + } + + return VSI_SUCCESS; + } +} /* op_optimize() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(RESIZE_3D, 1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + END_IO_TYPE_DECL(RESIZE_3D) + if (!VALIDATE_OP_IO_TYPES(RESIZE_3D, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + float factor = self->nn_param.resize_3d.factor; + vsi_nn_internal_node_t* curr = NULL; + uint32_t i = 0; + vsi_bool ret = TRUE; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + if (factor != 0) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + outputs[0]->attr.size[2] = (uint32_t)(inputs[0]->attr.size[2] * factor); + } + else + { + outputs[0]->attr.size[0] = self->nn_param.resize_3d.size[0]; + outputs[0]->attr.size[1] = self->nn_param.resize_3d.size[1]; + outputs[0]->attr.size[2] = self->nn_param.resize_3d.size[2]; + } + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + + if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num)) + { + self->nn_param.resize.lcl_data->use_internal_node = TRUE; + vsi_nn_internal_init_node_wksp( self ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + ret = vsi_nn_internal_setup_node(self, curr); + } + +final: + return ret; +} /* op_setup() */ + +static vsi_status op_init(vsi_nn_node_t* self) { + vsi_status status = VSI_SUCCESS; + + self->nn_param.resize_3d.lcl_data = + (vsi_nn_resize_3d_local_data*)malloc(sizeof(vsi_nn_resize_3d_local_data)); + if (NULL == self->nn_param.resize_3d.lcl_data) { + VSILOGE("Create resize_3d local data fail."); + status = VSI_FAILURE; + goto final; + } + memset(self->nn_param.resize_3d.lcl_data, 0, sizeof(vsi_nn_resize_3d_local_data)); + + self->nn_param.resize_3d.align_corners = FALSE; + self->nn_param.resize_3d.half_pixel_centers = FALSE; + + +final: + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + if (self->nn_param.resize_3d.lcl_data->use_internal_node) + { + vsi_nn_safe_free(self->nn_param.resize_3d.lcl_data); + vsi_nn_internal_deinit_node_wksp(self); + } + else + { + vsi_nn_safe_free(self->nn_param.resize_3d.lcl_data); + vsi_nn_op_common_deinit(self); + } + + return VSI_SUCCESS; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar 
*/ +DEF_OP_REG + ( + /* op_name */ RESIZE_3D, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c index 50924672f..1a9ad7d77 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c @@ -36,6 +36,7 @@ #include "vsi_nn_log.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -183,7 +184,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_init_node_wksp(self); @@ -201,21 +202,26 @@ static vsi_bool op_setup attr.vtl = TRUE; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_REVERSE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = output_tensor->t; curr->node->nn_param.reverse.axis = self->nn_param.reverse.axis; curr->node->nn_param.reverse.axis_num = self->nn_param.reverse.axis_num; - vsi_nn_internal_setup_node(self, curr); + ret &= vsi_nn_internal_setup_node(self, curr); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = output_tensor->t; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret &= vsi_nn_internal_setup_node(self, curr); } return ret; +final: + return FALSE; } /* op_setup() */ #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c index 38df1523b..2632ed652 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c @@ -89,41 +89,43 @@ static vsi_bool op_check ) { uint32_t input_idx = 0; - do { + { vsi_bool break_early = FALSE; // input_idx = 0 : inputs[0].shape = shape(batch_size, input_size) - if (input_idx >= self->input.num) break; + if (input_idx >= self->input.num) goto continue_point; break_early = (inputs[input_idx]->attr.dim_num != 2); - if (break_early) break; + if (break_early) goto continue_point; input_idx ++; // input_idx = 1 : inputs[1].shape = shape(num_units, input_size) - if (input_idx >= self->input.num) break; + if (input_idx >= self->input.num) goto continue_point; break_early = (inputs[input_idx]->attr.dim_num != 2); - if (break_early) break; + if (break_early) goto continue_point; input_idx ++; // input_idx = 2 : inputs[2].shape = shape(num_units, num_units) - if (input_idx >= self->input.num) break; + if (input_idx >= self->input.num) goto continue_point; break_early = (inputs[input_idx]->attr.dim_num != 2); - if (break_early) break; + if (break_early) goto continue_point; input_idx ++; // input_idx = 3 : inputs[3].shape = shape(num_units) - if (input_idx >= self->input.num) break; + if (input_idx >= self->input.num) goto continue_point; break_early = (inputs[input_idx]->attr.dim_num != 1); - if (break_early) break; + if (break_early) goto continue_point; input_idx ++; // input_idx = 4 : inputs[4].shape = shape(batch_size, 
num_units) - if (input_idx >= self->input.num) break; + if (input_idx >= self->input.num) goto continue_point; break_early = (inputs[input_idx]->attr.dim_num != 2); - if (break_early) break; + if (break_early) goto continue_point; input_idx ++; return TRUE; - } while(0); + } + +continue_point: { BEGIN_IO_TYPE_DECL(RNN, 5, 1) @@ -155,6 +157,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { outputs[0]->attr.size[0] = inputs[4]->attr.size[0]; outputs[0]->attr.size[1] = inputs[4]->attr.size[1]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c index a5f82613a..b2c254fd9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c @@ -46,6 +46,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -56,6 +58,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -68,6 +73,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -98,6 +105,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[RNNCELL_INPUT_H_STATE] = output_tensor->t; } @@ -108,6 +116,7 @@ static vsi_bool setup_op_shapes memcpy( &attr.dtype, &outputs[RNNCELL_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); attr.vtl = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[RNNCELL_OUTPUT_H_STATE] = output_tensor->t; } @@ -131,7 +140,10 @@ static vsi_bool setup_op_shapes outputs[RNNCELL_OUTPUT_OUTPUT]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } + return TRUE; +final: + return FALSE; } static vsi_bool op_setup @@ -207,6 +219,7 @@ static vsi_bool op_setup inputs[RNNCELL_INPUT_BIAS_I], &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs, "Create internal tensor failed", final); if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL) { aux_input_gate_fc_outputs = vsi_nn_rnn_create_tp_fc(self, @@ -215,6 +228,7 @@ static vsi_bool op_setup NULL, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(aux_input_gate_fc_outputs, "Create internal tensor failed", final); } } else @@ -225,6 +239,7 @@ static vsi_bool op_setup &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[RNNCELL_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, @@ -233,9 +248,11 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); /* transpose and reshape output */ input_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, 
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs, "Create internal tensor failed", final); if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL) { /* reshape and transpose input */ @@ -245,6 +262,8 @@ static vsi_bool op_setup input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[RNNCELL_INPUT_AUX_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); + tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, inputs[RNNCELL_INPUT_AUX_INPUT], @@ -252,10 +271,13 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); + /* transpose and reshape output */ aux_input_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(aux_input_gate_fc_outputs, "Create internal tensor failed", final); } } @@ -268,6 +290,7 @@ static vsi_bool op_setup inputs[RNNCELL_INPUT_BIAS_H], &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs, "Create internal tensor failed", final); } else { @@ -277,6 +300,7 @@ static vsi_bool op_setup hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[RNNCELL_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_input_tensor, "Create internal tensor failed", final); tmp = vsi_nn_rnn_create_nn_fc(self, hstate_input_tensor->t, @@ -285,9 +309,12 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); + /* transpose and reshape output */ hstate_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs, "Create internal tensor failed", final); } input_add_hstate_outputs = vsi_nn_rnn_create_tensor_add(self, @@ -295,14 +322,22 @@ static vsi_bool op_setup hstate_gate_fc_outputs->t, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_add_hstate_outputs, "Create internal tensor failed", final); if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL) { + if (aux_input_gate_fc_outputs == NULL || + input_add_hstate_outputs == NULL) + { + return FALSE; + } + gate_fc_outputs = vsi_nn_rnn_create_tensor_add(self, input_add_hstate_outputs->t, aux_input_gate_fc_outputs->t, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(gate_fc_outputs, "Create internal tensor failed", final); } else { @@ -311,6 +346,7 @@ static vsi_bool op_setup /* activation */ curr = vsi_nn_internal_new_node( self, vsi_nn_rnn_get_act_op_type(p->activation), 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.tanh.scale_a = 1.0; curr->node->nn_param.tanh.scale_b = 1.0; curr->inputs[0] = gate_fc_outputs->t; @@ -320,12 +356,15 @@ static vsi_bool op_setup if (outputs[RNNCELL_OUTPUT_H_STATE] != NULL) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[RNNCELL_OUTPUT_OUTPUT]; curr->outputs[0] = outputs[RNNCELL_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); } return 
TRUE; +final: + return FALSE; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c index 12668f0b5..f97dd1c07 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c @@ -157,6 +157,8 @@ static vsi_status op_optimize uint32_t dim; vx_tensor rois_tmp; + VSI_UNREFERENCED(outputs); + rois_tmp = NULL; if( direction == VSI_NN_OPTIMIZE_FORWARD && inputs[1]->attr.dim_num == 2 ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c index 87a714451..6d607b488 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c @@ -37,7 +37,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_link_list.h" #include "utils/vsi_nn_dtype_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) @@ -49,6 +49,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -59,6 +61,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -71,18 +76,20 @@ static vsi_bool op_setup ) { vsi_nn_internal_node_t* curr = NULL; - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp( node ); curr = vsi_nn_internal_new_node( node, VSI_NN_OP_A_TIMES_B_PLUS_C, node->input.num, node->output.num ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->inputs[1] = inputs[1]; curr->inputs[2] = inputs[2]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(node, curr); + ret = vsi_nn_internal_setup_node(node, curr); +final: return ret; } /* op_setup() */ @@ -94,6 +101,9 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c index 99f8e4056..a6e6c8ead 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c @@ -129,6 +129,8 @@ static vsi_bool op_setup uint32_t i = 0; uint32_t indices_dims = inputs[1]->attr.dim_num; + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c index d8c9842e1..462a2cad9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c @@ -30,10 +30,11 @@ #include "vsi_nn_prv.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" -#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util_prv.h" #define _INPUT_NUM (2) #define _OUTPUT_NUM (1) @@ -75,7 +76,32 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); 
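/* Review note (annotation, not part of the patch): scalar attributes are handed
 * to the kernel selector through the vsi_nn_kernel_param dictionary. The life
 * cycle used here and in the other ops touched by this patch is:
 *
 *     vsi_nn_kernel_param_t * param = vsi_nn_kernel_param_create();
 *     vsi_nn_kernel_param_add_int32( param, "block_size", block_size );
 *     vsi_nn_kernel_param_add_int32( param, "coord_dim",  coord_dim );
 *     vsi_nn_kernel_param_add_int32( param, "idx_num",    idx_num );
 *     n = vsi_nn_kernel_selector( self->graph, "scatter_nd",
 *             inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
 *     vsi_nn_kernel_param_release( &param );
 *
 * The change below additionally detects the case where inputs[1] and
 * outputs[0] share a data type but not the same quantization parameters; it
 * then writes the kernel result into a temporary virtual tensor and appends a
 * vxTensorCopyNode() to convert into outputs[0], mirroring the repeat op
 * earlier in this diff. */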
vsi_nn_kernel_param_add_int32( param, "coord_dim", coord_dim ); vsi_nn_kernel_param_add_int32( param, "idx_num", idx_num ); - n = vsi_nn_kernel_selector( self->graph, "scatter_nd", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + + if (vsi_nn_is_same_data_type(inputs[1], outputs[0]) == FALSE || + vsi_nn_is_same_quant_type(inputs[1], outputs[0])) + { + n = vsi_nn_kernel_selector( self->graph, "scatter_nd", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + } + else + { + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* temp_tensors = NULL; + + VSILOGW("scatter_nd is no_range_change operation! \ + Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); + + memcpy( &attr, &outputs[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &inputs[1]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + + vsi_nn_kernel_selector( self->graph, "scatter_nd", inputs, _INPUT_NUM, &temp_tensors, _OUTPUT_NUM, param ); + n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t); + + vsi_safe_release_tensor(temp_tensors); + } + if ( n != NULL ) { self->n = (vx_node)n; @@ -134,6 +160,8 @@ static vsi_bool op_setup uint32_t i = 0; vsi_nn_scatter_nd_param * p = &(self->nn_param.scatter_nd); + VSI_UNREFERENCED(inputs); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { if (p->shape == NULL) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c index 63900eb98..e3e19ade7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c @@ -141,6 +141,8 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ uint32_t i = 0; + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c index 485dcd5ef..7efc8c767 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c @@ -48,68 +48,15 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; - vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; - vsi_size_t* shapes_ptr[_IO_NUM]; - vsi_size_t* shapes_in[_INPUT_NUM]; - vsi_size_t rank_in[_INPUT_NUM]; - uint32_t new_rank = 0; - int32_t i = 0; - vsi_bool ret = FALSE; - vsi_nn_context_t ctx = NULL; if ( NULL == self ) { return VSI_FAILURE; } - ctx = self->graph->ctx; - - for (i = 0; i < _IO_NUM; i++) - { - shapes_ptr[i] = shapes[i]; - } - - for (i = 0; i < _INPUT_NUM; i++) - { - shapes_in[i] = inputs[i]->attr.size; - rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num; - } - - ret = vsi_nn_kernel_optimize_broadcast_shape( - (const vsi_size_t**)shapes_in, rank_in, _INPUT_NUM, - outputs[0]->attr.size, outputs[0]->attr.dim_num, - shapes_ptr, shapes[_INPUT_NUM], &new_rank); - - if ( ret && !ctx->config.support_stream_processor ) - { - for (i = 0; i < _INPUT_NUM; i++) - { - reshape_tensors[i] = vsi_nn_reshape_tensor( self->graph, - inputs[i], shapes[i], new_rank ); - } - - for (i = 0; i < _OUTPUT_NUM; i++) - { - reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( self->graph, - outputs[i], shapes[i + _INPUT_NUM], new_rank ); - } - - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select", - &reshape_tensors[0], _INPUT_NUM, - &reshape_tensors[_INPUT_NUM], _OUTPUT_NUM, NULL ); - - for (i = 0; i < _IO_NUM; i++) - { - vsi_safe_release_tensor( reshape_tensors[i] ); - } - } - else - { - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select", - inputs, _INPUT_NUM, - outputs, _OUTPUT_NUM, NULL ); - } + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, NULL ); if ( self->n ) { @@ -247,6 +194,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + in0_rank = inputs[0]->attr.dim_num; in1_rank = inputs[1]->attr.dim_num; in2_rank = inputs[2]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c index 500e6761e..dc54ba7ad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c @@ -100,6 +100,14 @@ static vsi_bool op_check IO_TYPE(D_I32, D_I32, D_F32) IO_TYPE(D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_NONE, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_NONE, D_F16) + IO_TYPE(D_F16, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_I32, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_I32, D_NONE, D_BOOL8) END_IO_TYPE_DECL(SEQUENCE_MASK) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c index bb41e98ad..f922b8d16 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c @@ -37,6 +37,7 @@ #include "kernel/vsi_nn_kernel.h" 
#include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _ARG_NUM (3) #define _INPUT_NUM (1) @@ -136,6 +137,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); if (self->input.num > 1) { return VSI_SUCCESS; @@ -153,9 +156,10 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_nn_slice_param * p; + vsi_nn_slice_param * p = NULL; vsi_nn_internal_node_t* curr = NULL; - uint32_t i; + uint32_t i = 0; + vsi_bool ret = FALSE; if (self->nn_param.slice.dims == 0) { @@ -187,6 +191,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims; curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims; @@ -199,9 +204,10 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.new_axis_mask = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); - return TRUE; +final: + return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c index c81639929..27431a73f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c @@ -37,6 +37,7 @@ #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_constraint_check.h" #include "vsi_nn_tensor_util_prv.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -45,6 +46,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -123,6 +126,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); if (VSI_NN_OPTIMIZE_BACKWARD == direction) { return VSI_SUCCESS; @@ -174,7 +179,9 @@ static vsi_bool op_setup ) { vsi_nn_internal_node_t* curr = NULL; - if( NULL == self ) + vsi_bool ret = FALSE; + + if ( NULL == self ) { return FALSE; } @@ -202,13 +209,15 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp(self); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_SOFTMAX_INTERNAL, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; curr->node->nn_param.softmax_internal.beta = self->nn_param.softmax.beta; curr->node->nn_param.softmax_internal.axis = self->nn_param.softmax.axis; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); - return TRUE; +final: + return ret; } #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index 0dbe88c87..0d85eb13e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -45,6 +45,8 @@ static vsi_bool _need_split_softmax ) { vsi_bool ret = FALSE; + VSI_UNREFERENCED(self); + if(inputs[0]->attr.dim_num == 2 && inputs[0]->attr.size[1] > MAX_SOFTMAX_BATCH) { ret = TRUE; @@ -250,6 +252,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + 
VSI_UNREFERENCED(outputs); //TODO: Check tensor shapes. return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c index d6e201e5b..71615e740 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c @@ -35,8 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "libnnext/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" +#include "vsi_nn_error.h" #include "vsi_nn_test.h" #include "utils/vsi_nn_constraint_check.h" @@ -103,6 +102,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); if (self->nn_param.space2depth.block_size[0] != self->nn_param.space2depth.block_size[1]) { return vsi_nn_internal_optimize_node(self, direction ); @@ -142,12 +143,13 @@ static vsi_bool op_set_space2depth_internal vsi_nn_op_t type_name ) { - vsi_bool retn = TRUE; + vsi_bool retn = FALSE; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.space2depth_internal.block_size_x = self->nn_param.space2depth.block_size[0]; curr->node->nn_param.space2depth_internal.block_size_y = @@ -156,6 +158,7 @@ static vsi_bool op_set_space2depth_internal curr->outputs[0] = outputs[0]; retn = vsi_nn_internal_setup_node(self, curr); +final: return retn; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c index 9810b2c09..65dc6de93 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c @@ -37,6 +37,7 @@ #include "utils/vsi_nn_link_list.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -45,6 +46,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -178,9 +181,9 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret; - uint32_t i, num; - vsi_size_t average; + vsi_bool ret = FALSE; + uint32_t i = 0, num = 0; + vsi_size_t average = 1; vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; uint32_t axis = self->nn_param.split.axis; @@ -189,8 +192,6 @@ static vsi_bool op_setup vsi_nn_split_param * p = NULL; vsi_nn_internal_node_t* curr = NULL; - ret = TRUE; - average = 1; /* compute the output tensor number */ num = (uint32_t)(self->output.num - 1); while ( NULL == outputs[num] ) @@ -237,6 +238,7 @@ static vsi_bool op_setup p->lcl_data->end_dims[j] = (int32_t)end[j]; } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims; curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims; @@ -249,10 +251,12 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.new_axis_mask = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[i]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } return ret; 
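/* Review note (annotation, not part of the patch): SPLIT is lowered to one
 * STRIDED_SLICE internal node per output. Every dimension other than the
 * split axis keeps its full [0, size) range, while a running offset advances
 * along the axis, roughly:
 *
 *     start[axis] = offset;
 *     end[axis]   = start[axis] + outputs[i]->attr.size[axis];
 *     offset      = end[axis];
 *     // begin/end/stride are then copied into p->lcl_data->*_dims and wired
 *     // into curr->node->nn_param.strided_slice before setup.
 *
 * The added final: label below preserves the FALSE return when
 * vsi_nn_internal_new_node() fails, consistent with the other ops in this
 * patch. */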
+final: + return FALSE; } /* op_setup() */ static vsi_status op_init @@ -309,28 +313,12 @@ static vsi_status op_deinit p = &(self->nn_param.split); - if (p->lcl_data->begin_dims) - { - free(p->lcl_data->begin_dims); - p->lcl_data->begin_dims = NULL; - } - - if (p->lcl_data->end_dims) - { - free(p->lcl_data->end_dims); - p->lcl_data->end_dims = NULL; - } - - if (p->lcl_data->stride_dims) - { - free(p->lcl_data->stride_dims); - p->lcl_data->stride_dims = NULL; - } - - if (p->lcl_data) + if (p && p->lcl_data) { - free(p->lcl_data); - p->lcl_data = NULL; + vsi_nn_safe_free(p->lcl_data->begin_dims); + vsi_nn_safe_free(p->lcl_data->end_dims); + vsi_nn_safe_free(p->lcl_data->stride_dims); + vsi_nn_safe_free(p->lcl_data); } vsi_nn_internal_deinit_node_wksp( self ); @@ -346,6 +334,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c index 3609aad4f..4e0a5e566 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c @@ -35,6 +35,7 @@ #include "kernel/vsi_nn_kernel.h" #include "vsi_nn_tensor_util.h" #include "vsi_nn_internal_node.h" +#include "vsi_nn_error.h" /* Declare number of input and output. @@ -49,6 +50,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -106,6 +109,7 @@ static vsi_bool op_setup vsi_bool shouldSqueeze[VSI_NN_MAX_DIM_NUM] = {FALSE}; uint32_t numDimsSqueezed = 0; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -122,7 +126,7 @@ static vsi_bool op_setup { int32_t rank = self->nn_param.squeeze.axis[i]; - rank = rank < 0 ? rank + inputs[0]->attr.dim_num : rank; + rank = rank < 0 ? 
rank + (int32_t)inputs[0]->attr.dim_num : rank; if ( !shouldSqueeze[rank] ) { @@ -145,13 +149,15 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = outputs[0]->attr.size; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); - return TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_deinit @@ -172,6 +178,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c index 9b59d9920..d59c6f5d1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c @@ -37,7 +37,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_link_list.h" #include "utils/vsi_nn_dtype_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #define _ARG_NUM (1) #define _INPUT_NUM VSI_NN_STACK_MAX_INPUTS @@ -53,6 +53,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -63,6 +65,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -84,7 +89,7 @@ static vsi_bool op_setup vsi_nn_internal_node_t* curr = NULL; vsi_nn_tensor_t *output_rs = NULL; vsi_nn_stack_lcl_data * data = NULL; - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vx_int8 is_scalar = vsi_nn_GetTensorIsScalar(inputs[0]); vsi_nn_internal_init_node_wksp( node ); @@ -122,10 +127,12 @@ static vsi_bool op_setup if (1 == node->input.num) { curr = vsi_nn_internal_new_node( node, VSI_NN_OP_RESHAPE2, 1, 1); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + ret = vsi_nn_internal_setup_node(node, curr); goto final; } @@ -133,17 +140,13 @@ static vsi_bool op_setup input_shape[1] = block_num; curr = vsi_nn_internal_new_node( node, VSI_NN_OP_CONCAT, node->input.num, node->output.num ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); for (i = 0; i < node->input.num; i++) { vsi_nn_tensor_t *input_rs = NULL; /* Malloc ptr */ data = (vsi_nn_stack_lcl_data *)malloc( sizeof(vsi_nn_stack_lcl_data) ); - if( NULL == data ) - { - VSILOGE( "Create stack local data fail." ); - ret = FALSE; - goto final; - } + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(data, curr, "Create buffer failed", final); memset( data, 0, sizeof(vsi_nn_stack_lcl_data) ); input_rs = vsi_nn_reshape_tensor(node->graph, inputs[i], input_shape, 2); @@ -171,16 +174,18 @@ static vsi_bool op_setup /* Malloc ptr */ data = (vsi_nn_stack_lcl_data *)malloc( sizeof(vsi_nn_stack_lcl_data) ); - if( NULL == data ) - { - VSILOGE( "Create stack local data fail." 
); - ret = FALSE; - goto final; - } + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(data, curr, "Create buffer failed", final); memset( data, 0, sizeof(vsi_nn_stack_lcl_data) ); output_rs = vsi_nn_reshape_tensor(node->graph, outputs[0], output_shape, 2); - data->src_in = output_rs; + if (output_rs == NULL) + { + vsi_nn_internal_release_node(&curr); + VSILOGD("Create reshape tensor failed\n"); + vsi_nn_safe_free(data); + goto final; + } + data->src_in = output_rs; /* Store node, ptr */ vsi_nn_LinkListPushStart( (vsi_nn_link_list_t **)&node->nn_param.stack.lcl_data, @@ -188,10 +193,9 @@ static vsi_bool op_setup curr->outputs[0] = output_rs; curr->node->nn_param.concat.axis = axis; + ret = vsi_nn_internal_setup_node(node, curr); final: - vsi_nn_internal_setup_node(node, curr); - return ret; } /* op_setup() */ @@ -203,6 +207,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index 1cf2891ad..ae43c05c8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -749,6 +749,8 @@ static vsi_status op_optimize vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool is_same_quant_type = FALSE; vsi_bool is_same_shape = TRUE; + vsi_size_t input_elements = 0; + vsi_size_t output_elements = 0; /* Only forward run stride_slice's optimize */ if ( direction == VSI_NN_OPTIMIZE_BACKWARD ) @@ -775,38 +777,49 @@ static vsi_status op_optimize VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - if ( NULL == inputs[0]->t ) - { - vsi_nn_TensorReinit( self->graph, inputs[0] ); - } - - /* Create tensor from view */ - memcpy( start, (vsi_size_t*)start_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); - memcpy( end, (vsi_size_t*)stop_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); - in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, inputs[0]); - if ( NULL == in_view_tensor ) - { - VSILOGE( "Create tensor %d from view fail.", i ); - status = VSI_FAILURE; - goto OnError; - } - self->nn_param.strided_slice.lcl2_data->is_optimized = TRUE; is_same_quant_type = _is_same_quant(inputs, outputs); - if ( NULL != outputs[0]->t || is_same_quant_type == FALSE) + input_elements = vsi_nn_GetElementNum( inputs[0] ); + output_elements = vsi_nn_GetElementNum( outputs[0] ); + if (NULL != outputs[0]->t && NULL == inputs[0]->t && + is_same_quant_type && input_elements == output_elements) { - VSILOGI( "stride slice copy tensor."); - // Copy old tensor values to the new address. 
- status = copy_tensor_to_view( self, in_view_tensor, outputs[0], shape, is_same_shape); - if ( VSI_FAILURE == status ) - { - goto OnError; - } + inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t, + (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, + sizeof(inputs[0]->attr.size[0]) ); } else { - outputs[0]->t = in_view_tensor; + if ( NULL == inputs[0]->t ) + { + vsi_nn_TensorReinit( self->graph, inputs[0] ); + } + /* Create tensor from view */ + memcpy( start, (vsi_size_t*)start_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); + memcpy( end, (vsi_size_t*)stop_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); + in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, inputs[0]); + if ( NULL == in_view_tensor ) + { + VSILOGE( "Create tensor %d from view fail.", i ); + status = VSI_FAILURE; + goto OnError; + } + + if ( NULL != outputs[0]->t || is_same_quant_type == FALSE) + { + VSILOGI( "stride slice copy tensor."); + // Copy old tensor values to the new address. + status = copy_tensor_to_view( self, in_view_tensor, outputs[0], shape, is_same_shape); + if ( VSI_FAILURE == status ) + { + goto OnError; + } + } + else + { + outputs[0]->t = in_view_tensor; + } } OnError: @@ -841,32 +854,32 @@ static vsi_status op_deinit vsi_nn_safe_free( params->end_dims ); vsi_nn_safe_free( params->stride_dims ); - if (lcl2_data->cp_node) + if (lcl2_data && lcl2_data->cp_node) { vxReleaseNode( &lcl2_data->cp_node ); } - if (lcl2_data->src_tensor) + if (lcl2_data && lcl2_data->src_tensor) { vxReleaseTensor( &lcl2_data->src_tensor ); } - if (lcl2_data->dst_tensor && !lcl2_data->is_same_shape) + if (lcl2_data && lcl2_data->dst_tensor && !lcl2_data->is_same_shape) { vxReleaseTensor( &lcl2_data->dst_tensor ); } - if (lcl2_data->begin_dims) + if (lcl2_data && lcl2_data->begin_dims) { free(lcl2_data->begin_dims); } - if (lcl2_data->end_dims) + if (lcl2_data && lcl2_data->end_dims) { free(lcl2_data->end_dims); } - if (lcl2_data->stride_dims) + if (lcl2_data && lcl2_data->stride_dims) { free(lcl2_data->stride_dims); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c index b8b4c1e53..080183652 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c @@ -31,6 +31,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" static vsi_status _create_local_tensor ( @@ -129,6 +130,7 @@ static vsi_status op_compute attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; bias_tensor = vsi_nn_CreateTensor(self->graph, &attr); + CHECK_PTR_FAIL_GOTO( bias_tensor, "Create tensor fail.", final ); param.bias = bias_tensor->t; } @@ -145,6 +147,7 @@ static vsi_status op_compute status = VSI_SUCCESS; } +final: if (bias_tensor != NULL) vsi_nn_ReleaseTensor(&bias_tensor); return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c index 812cea379..61a541c79 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c @@ -63,6 +63,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c index 78f350858..ff15f81de 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c @@ -49,7 +49,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_tensor_add_mean_stddev_norm_param * p = NULL; float eps; @@ -113,6 +113,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); + /* TODO: Add code to comput outputs' shape. */ if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c index 3098b6cf8..82f104a58 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c @@ -141,6 +141,8 @@ static vsi_bool op_setup vsi_nn_tensorstackconcat_param *p = NULL; int32_t axis = 0; + VSI_UNREFERENCED(outputs); + if ( NULL == self ) { return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c index 647396fdb..b6fb26ec7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c @@ -41,6 +41,30 @@ Declare number of input and output. */ +static vsi_bool _is_supported_axis(vsi_size_t* multiples, vsi_size_t multiples_num) +{ + vsi_size_t i = 0; + + if ( multiples_num < 4) + { + return TRUE; + } + else if ( multiples_num > 4) + { + return FALSE; + } + + for ( i = 3; i < multiples_num; i++) + { + if (multiples[i] > 1) + { + return FALSE; + } + } + + return TRUE; +} + static vsi_status _tile_op_compute ( const char * kernel_name, @@ -49,18 +73,100 @@ static vsi_status _tile_op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_FAILURE; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t new_rank = 0; + vsi_bool ret = FALSE; + vsi_size_t* multiples = (vsi_size_t*)self->nn_param.tile.multiples; + vsi_nn_tensor_t* temp_tensors[2] = { NULL }; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_nn_tensor_attr_t attr; + + if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + VSILOGW("tile is no_range_change operation! 
\ + Insert DataConvert Operation when the quantization parameters\ + of input and output are inconsistent!"); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - kernel_name, - &inputs[0], 1, - &outputs[0], 1, NULL ); + memcpy( &attr, &outputs[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors[1] = vsi_nn_CreateTensor( self->graph, &attr ); + } + else + { + temp_tensors[1] = outputs[0]; + } - if( self->n ) + ret = vsi_nn_kernel_optimize_tile_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + multiples, inputs[0]->attr.dim_num, + temp_tensors[1]->attr.size, temp_tensors[1]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret) + { + if (_is_supported_axis(shapes[1], new_rank) == FALSE) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\ + shapes[0], (vsi_size_t)new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, temp_tensors[1],\ + shapes[2], (vsi_size_t)new_rank ); + if (reshape_tensors[0] == NULL || reshape_tensors[1] == NULL) + { + VSILOGE("reshape tensor failed!"); + status = VSI_FAILURE; + goto final; + } + + memcpy( &attr, &reshape_tensors[0]->attr, sizeof(attr)); + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = reshape_tensors[1]->attr.size[0]; + attr.size[1] = reshape_tensors[1]->attr.size[1]; + + temp_tensors[0] = vsi_nn_CreateTensor( self->graph, &attr ); + + self->n = (vx_node)vsi_nn_kernel_selector( + self->graph, kernel_name, &reshape_tensors[0], 1, &temp_tensors[0], 1, NULL); + self->n = (vx_node)vsi_nn_kernel_selector( + self->graph, kernel_name, &temp_tensors[0], 1, &reshape_tensors[1], 1, NULL); + + } + else + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\ + shapes[0], (vsi_size_t)new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, temp_tensors[1],\ + shapes[2], (vsi_size_t)new_rank ); + if (reshape_tensors[0] == NULL || reshape_tensors[1] == NULL) + { + VSILOGE("reshape tensor failed!"); + status = VSI_FAILURE; + goto final; + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, kernel_name,\ + &reshape_tensors[0], 1, &reshape_tensors[1], 1, NULL ); + } + } + + if ( self->n ) { status = VSI_SUCCESS; } +final: + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(temp_tensors[0]); + if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + self->n = vxTensorCopyNode( self->graph->g, temp_tensors[1]->t, outputs[0]->t); + vsi_safe_release_tensor(temp_tensors[1]); + } + return status; } /* _tile_op_compute() */ @@ -71,7 +177,7 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - /*TODO: Check tensor shapes. */ + /*TODO: Check tensor shapes. 
*/ vsi_nn_tile_param * p; BEGIN_IO_TYPE_DECL(TILE, 1, 1) @@ -88,6 +194,8 @@ static vsi_bool op_check IO_TYPE(D_I32, D_I32) IO_TYPE(D_U32, D_U32) IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) END_IO_TYPE_DECL(TILE) if (!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c index d797af2cd..ff8c0e0fd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -36,10 +36,59 @@ #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (2) +vsi_nn_tensor_t* _create_permute_node + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t* input_tensor, + vsi_nn_tensor_t* output_tensor, + uint32_t* perm, + uint32_t dim_num, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_tensor_t* tensor0 = NULL; + vsi_nn_tensor_t *output = NULL; + + if (output_tensor) + { + output = output_tensor; + } + else + { + uint32_t i = 0; + vsi_nn_tensor_attr_t attr; + memcpy(&attr, &input_tensor->attr, sizeof(attr)); + attr.vtl = use_virtual_tensor; + for ( i = 0; i < dim_num; i++ ) + { + attr.size[i] = input_tensor->attr.size[perm[i]]; + } + tensor0 = vsi_nn_CreateTensor( self->graph, &attr ); + CHECK_PTR_FAIL_GOTO( tensor0, "Create tensor fail.", final ); + output = tensor0; + } + self->n = vxTensorPermuteNode( + self->graph->g, + input_tensor->t, + output->t, + perm, + dim_num + ); + if (self->n == NULL) + { + vsi_safe_release_tensor(tensor0); + } + +final: + return tensor0; +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -49,18 +98,122 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + int32_t new_axis0 = 0; + int32_t new_axis1 = 0; + int32_t axis = self->nn_param.topk.axis; + int32_t top_k = self->nn_param.topk.k; + vsi_nn_tensor_t * in_tensor = NULL; + vsi_nn_tensor_t * out0_tensor = NULL; + vsi_nn_tensor_t * out1_tensor = NULL; + vsi_bool ret = FALSE; + + ret = vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis0); + + ret = vsi_nn_kernel_optimize_softmax_shape( + outputs[0]->attr.size, outputs[0]->attr.dim_num, axis, + shapes[1], &rank_out, &new_axis1); param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "top_k", self->nn_param.topk.k ); + vsi_nn_kernel_param_add_int32( param, "top_k", top_k ); + + if (ret) + { + uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t perm_out[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_nn_tensor_t* input_tensor = NULL; + vsi_nn_tensor_t* outputs_tensor[2] = {NULL}; + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[1], rank_in ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[1], shapes[1], rank_in ); + + axis = new_axis0; + + if (axis != 0) + { + uint32_t i = 0; + uint32_t index = 0; + + vsi_nn_tensor_attr_t attr0, attr1; + memcpy(&attr0, &reshape_tensors[1]->attr, sizeof(attr0)); + memcpy(&attr1, 
&reshape_tensors[2]->attr, sizeof(attr1)); + + attr0.vtl = TRUE; + attr1.vtl = TRUE; + attr0.size[index] = (vsi_size_t)top_k; + attr1.size[index] = (vsi_size_t)top_k; + perm_in[index ++] = (uint32_t)axis; + for ( i = 0; i < rank_in; i++ ) + { + if ((int32_t)i == axis) + continue; + attr0.size[index] = shapes[1][i]; + attr1.size[index] = shapes[1][i]; + perm_in[index ++] = i; + } + + perm_out[axis] = 0; + for ( i = 1, index = 0; i < rank_in; i++ ) + { + if ((int32_t)index == axis) + { + index ++; + } + perm_out[index ++] = i; + } + + out0_tensor = vsi_nn_CreateTensor( self->graph, &attr0 ); + CHECK_PTR_FAIL_GOTO( out0_tensor, "Create tensor fail.", final ); + out1_tensor = vsi_nn_CreateTensor( self->graph, &attr1 ); + CHECK_PTR_FAIL_GOTO( out1_tensor, "Create tensor fail.", final ); + + in_tensor = _create_permute_node(self, reshape_tensors[0], NULL, perm_in, rank_in, TRUE); + CHECK_PTR_FAIL_GOTO( in_tensor, "Create internal tensor fail.", final ); + + input_tensor = in_tensor; + outputs_tensor[0] = out0_tensor; + outputs_tensor[1] = out1_tensor; + } + else + { + input_tensor = reshape_tensors[0]; + outputs_tensor[0] = reshape_tensors[1]; + outputs_tensor[1] = reshape_tensors[2]; + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk", + &input_tensor, _INPUT_NUM, + outputs_tensor, _OUTPUT_NUM, param ); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk", - inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + if (axis != 0) + { + _create_permute_node(self, outputs_tensor[0], reshape_tensors[1], perm_out, rank_in, TRUE); + _create_permute_node(self, outputs_tensor[1], reshape_tensors[2], perm_out, rank_in, TRUE); + } + } - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } +final: + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[2] ); + vsi_safe_release_tensor( in_tensor ); + vsi_safe_release_tensor( out0_tensor ); + vsi_safe_release_tensor( out1_tensor ); + return status; } /* op_compute() */ @@ -107,29 +260,38 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ uint32_t i; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { vsi_nn_topk_param * p; p = &(self->nn_param.topk); + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - outputs[0]->attr.size[0] = p->k; - for (i = 1; i < inputs[0]->attr.dim_num; i++) + outputs[0]->attr.size[p->axis] = p->k; + for (i = 0; i < inputs[0]->attr.dim_num; i++) { + if ((int32_t)i == p->axis) + { + continue; + } outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } } - if( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) { vsi_nn_topk_param * p; p = &(self->nn_param.topk); outputs[1]->attr.dim_num = inputs[0]->attr.dim_num; - outputs[1]->attr.size[0] = p->k; - for (i = 1; i < inputs[0]->attr.dim_num; i++) + outputs[1]->attr.size[p->axis] = p->k; + for (i = 0; i < inputs[0]->attr.dim_num; i++) { + if ((int32_t)i == p->axis) + { + continue; + } outputs[1]->attr.size[i] = inputs[0]->attr.size[i]; } } @@ -137,6 +299,17 @@ static vsi_bool op_setup return TRUE; } /* op_setup() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.topk.axis = 0; + + return status; +} /* op_init() */ + #ifdef __cplusplus extern "C" { #endif @@ -144,7 +317,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ TOPK, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c index a6d526633..ece932e6e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c @@ -35,9 +35,9 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" +#include "vsi_nn_error.h" static vsi_bool setup_op_shapes ( @@ -80,6 +80,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[RNN_INPUT_H_STATE] = output_tensor->t; } @@ -91,6 +92,7 @@ static vsi_bool setup_op_shapes attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[RNN_OUTPUT_H_STATE] = output_tensor->t; } @@ -112,6 +114,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -121,6 +125,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -131,6 +137,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -143,6 +152,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -168,6 +179,8 @@ static vsi_bool op_setup vsi_size_t batch_size = 0; vsi_size_t time_step = 0; uint32_t i = 0; + vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -193,21 +206,28 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[RNN_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); input_tensor = output_tensor->t; } /* split input tensor */ split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + CHECK_PTR_FAIL_GOTO( split_output_tensors, "Create buffer fail.", final ); memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); rnncell_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + CHECK_PTR_FAIL_GOTO( rnncell_reshape_output_tensors, "Create buffer fail.", final ); memset( rnncell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, + (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); last_step_h_state = inputs[RNN_INPUT_H_STATE]; + for( i = 0; i < time_step; i++ ) { vsi_nn_tensor_t* reshape_output = NULL; @@ -217,26 +237,30 @@ static vsi_bool op_setup /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); reshape_output = output_tensor->t; /* rnncell output */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[RNN_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_out0 = output_tensor->t; /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[RNN_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_out1 = output_tensor->t; curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; if ( reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) { - int32_t k = 0; + size_t k = 0; for (k = 0; k < _cnt_of_array( curr_param->internal_dtype ); k++) { if (curr_param->internal_dtype[k].vx_type == VSI_NN_TYPE_NONE) @@ -274,6 +298,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[RNN_OUTPUT_OUTPUT]->attr.dtype, 
use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } @@ -281,6 +306,7 @@ static vsi_bool op_setup if (outputs[RNN_OUTPUT_H_STATE] != NULL) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = last_step_h_state; curr->outputs[0] = outputs[RNN_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); @@ -288,13 +314,14 @@ static vsi_bool op_setup /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { curr->inputs[i] = rnncell_reshape_output_tensors[i]; } curr->outputs[0] = tensor; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); if( !curr_param->time_major ) { @@ -303,10 +330,11 @@ static vsi_bool op_setup tensor, outputs[RNN_OUTPUT_OUTPUT], use_virtual_tensor); } +final: vsi_nn_safe_free( split_output_tensors ); vsi_nn_safe_free( rnncell_reshape_output_tensors ); - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c index 7e57e3223..35d84a5f8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c @@ -34,8 +34,8 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (VSI_NN_UNSTACK_MAX_OUTPUTS) @@ -47,6 +47,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -58,6 +60,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -68,6 +72,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -94,6 +101,7 @@ static vsi_bool op_setup uint32_t i = 0, j = 0; uint32_t rank = inputs[0]->attr.dim_num; int8_t is_scalar = (rank - 1) == 0 ? 
TRUE : FALSE; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp( self ); @@ -172,10 +180,13 @@ static vsi_bool op_setup memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); input_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_input_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_input_size, curr, "Create internal buffer failed", final); reshape_input_size[0] = block_size; reshape_input_size[1] = tensor_num; reshape_input_size[2] = block_num; @@ -186,23 +197,28 @@ static vsi_bool op_setup curr->outputs[0] = input_tensor->t; vsi_nn_internal_setup_node( self, curr ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, tensor_num ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, tensor_num * sizeof(uint32_t)); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, tensor_num ); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(slices, curr, "Create internal buffer failed", final); curr->node->nn_param.split.axis = 1; curr->node->nn_param.split.slices = slices; curr->node->nn_param.split.slices_num = tensor_num; curr->inputs[0] = input_tensor->t; output_tensors = (vsi_nn_internal_tensor_t**)malloc(tensor_num * sizeof(vsi_nn_internal_tensor_t*)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( output_tensors, curr, "Create tensor fail.", final ); + for (i = 0; i < tensor_num; i++) { slices[i] = 1; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &outputs[i]->attr.dtype, use_virtual_tensor); output_tensors[i] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensors[i], "Create internal tensor failed", final); curr->outputs[i] = output_tensors[i]->t; } - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); for (i = 0; i < tensor_num; i++) { @@ -210,10 +226,12 @@ static vsi_bool op_setup output_size = (vsi_size_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(output_size, curr, "Create internal buffer failed", final); memcpy(output_size, outputs[i]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = output_size; curr->node->nn_param.reshape2.dim_num = outputs[i]->attr.dim_num; curr->inputs[0] = output_tensors[i]->t; @@ -221,9 +239,10 @@ static vsi_bool op_setup vsi_nn_internal_setup_node( self, curr ); } +final: vsi_nn_safe_free(output_tensors); - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c index 1923b26a6..36bbdbc34 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c @@ -35,7 +35,6 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "ops/vsi_nn_op_upsample.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include 
"utils/vsi_nn_constraint_check.h" @@ -144,17 +143,20 @@ static vsi_status op_compute vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; uint32_t new_rank = 0; - vsi_bool ret; + vsi_bool ret = FALSE; vsi_nn_kernel_param_t * param = NULL; - int32_t scale_x = (int32_t)self->nn_param.upsample.scale[0]; - int32_t scale_y = (int32_t)self->nn_param.upsample.scale[1]; + int32_t scale_x = 0; + int32_t scale_y = 0; if( NULL == self ) { return VSI_FAILURE; } - param =vsi_nn_kernel_param_create(); + scale_x = (int32_t)self->nn_param.upsample.scale[0]; + scale_y = (int32_t)self->nn_param.upsample.scale[1]; + + param = vsi_nn_kernel_param_create(); ret = vsi_nn_upsample_optimize_shape(self, (vsi_ssize_t*)inputs[0]->attr.size, (vsi_ssize_t*)inputs[1]->attr.size, @@ -164,7 +166,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "scale_x", scale_x ); vsi_nn_kernel_param_add_int32( param, "scale_y", scale_y ); - if( ret ) + if ( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], shapes[0], new_rank ); @@ -180,7 +182,7 @@ static vsi_status op_compute vsi_nn_ReleaseTensor( &reshape_tensors[2] ); } - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c index 6bb917586..4b7dd3f61 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" typedef struct _upsamplescale_local_data_t { int32_t placeholder; @@ -56,8 +57,8 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - int32_t stride = self->nn_param.upsamplescale.stride; - float scale = self->nn_param.upsamplescale.scale; + int32_t stride = 0; + float scale = 0; vsi_nn_kernel_param_t * param = NULL; if( NULL == self ) @@ -65,12 +66,15 @@ static vsi_status op_compute return VSI_FAILURE; } + stride = self->nn_param.upsamplescale.stride; + scale = self->nn_param.upsamplescale.scale; + if (stride == 1 || vsi_nn_abs(scale - 1.0f) == _EPSILON) { return vsi_nn_internal_compute_node( self ); } - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32( param, "stride", stride ); vsi_nn_kernel_param_add_float32( param, "scale", scale ); @@ -82,7 +86,7 @@ static vsi_status op_compute vsi_nn_kernel_param_release( ¶m ); - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } @@ -141,6 +145,9 @@ static vsi_status op_optimize int32_t stride = self->nn_param.upsamplescale.stride; float scale = self->nn_param.upsamplescale.scale; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON) { return vsi_nn_internal_optimize_node( self, direction ); @@ -163,30 +170,34 @@ static vsi_bool op_setup float scale = self->nn_param.upsamplescale.scale; int32_t i = 0; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp(self); if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (stride == 1) { 
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LINEAR, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.linear.a = scale; curr->node->nn_param.linear.b = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (vsi_nn_abs(scale - 1.0f) == _EPSILON) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESIZE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.resize.type = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR; curr->node->nn_param.resize.align_corners = FALSE; curr->node->nn_param.resize.half_pixel_centers = FALSE; @@ -195,7 +206,7 @@ static vsi_bool op_setup curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else { @@ -206,9 +217,12 @@ static vsi_bool op_setup outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + + ret = TRUE; } - return TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_init @@ -216,6 +230,8 @@ static vsi_status op_init vsi_nn_node_t* self ) { + VSI_UNREFERENCED(self); + return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c index a8a2a7e0b..f4dcb531e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c @@ -44,6 +44,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -69,6 +71,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c b/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c index 7d1b9cf09..6e0ec8d03 100644 --- a/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c +++ b/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c @@ -348,7 +348,7 @@ static vx_status resize_binlinear } } - return VX_SUCCESS; + return VSI_SUCCESS; } #endif @@ -455,14 +455,15 @@ static void _convolve_same float *input, uint32_t input_size, double *kernel, - uint32_t kernel_size, + int32_t kernel_size, float *output ) { - uint32_t pad,pad_input_size; - uint32_t i,k,offset; + uint32_t pad = 0, pad_input_size = 0; + uint32_t i = 0, offset = 0; + int32_t k = 0; float *pad_input = NULL; - double sum; + double sum = 0; uint32_t pad_input_sizef,input_sizef; if(NULL == input || NULL == kernel || NULL == output) @@ -536,6 +537,9 @@ static void set_cols ) { uint32_t w; + + VSI_UNREFERENCED(height); + if(NULL == data || cols == NULL) { return ; @@ -947,6 +951,7 @@ static vsi_nn_con_candidate_t *_get_connection_candidate { con_candidate = (vsi_nn_con_candidate_t *) vsi_nn_LinkListNewNode(sizeof(vsi_nn_con_candidate_t), _init_candidate); + CHECK_PTR_FAIL_GOTO( con_candidate, "null point.", final ); sum++; con_candidate->data.i = i; @@ -963,6 +968,8 @@ static vsi_nn_con_candidate_t *_get_connection_candidate } *candidate_sum = sum; + +final: return con_candidate_list; } @@ -1276,6 +1283,8 @@ static vsi_nn_subset_t *_compute_subset vsi_nn_subset_t *subset_list = NULL, *subset = NULL; uint32_t *deleteIdx = NULL; + 
VSI_UNREFERENCED(all_connection_num); + if(NULL == all_connection || NULL == candidate || NULL == special_k || @@ -1319,6 +1328,8 @@ static vsi_nn_subset_t *_compute_subset { sig_subset= (vsi_nn_subset_t *) vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)subset_list, j); + CHECK_PTR_FAIL_GOTO( sig_subset, "null point.", final ); + if(sig_subset->data.idx[indexA] == partAs[i] || sig_subset->data.idx[indexB] == partBs[i]) { @@ -1338,6 +1349,8 @@ static vsi_nn_subset_t *_compute_subset int32_t ii = partBs[i]; sig_connect = (vsi_nn_connection_t *) vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i); + CHECK_PTR_FAIL_GOTO( sig_connect, "get point fail.", final ); + sig_subset->data.idx[indexB] = (float)ii; sig_subset->data.idx[20 - 1] += 1; sig_subset->data.idx[20 - 2] += @@ -1362,6 +1375,8 @@ static vsi_nn_subset_t *_compute_subset vsi_nn_subset_t *j2_iter = j2_subset; sig_connect = (vsi_nn_connection_t *) vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i); + CHECK_PTR_FAIL_GOTO( sig_connect, "get point fail.", final ); + for(ii=0; ii<(20-2); ii++) { j1_iter->data.idx[ii] += j2_iter->data.idx[ii] + 1; @@ -1380,6 +1395,8 @@ static vsi_nn_subset_t *_compute_subset int32_t ii = partBs[i]; sig_connect = (vsi_nn_connection_t *) vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i); + CHECK_PTR_FAIL_GOTO( sig_connect, "get point fail.", final ); + sum = candidate[ii].score + sig_connect->data.score; j1_subset->data.idx[indexB] = (float)ii; j1_subset->data.idx[20 - 1] += 1; @@ -1413,7 +1430,7 @@ static vsi_nn_subset_t *_compute_subset subset = (vsi_nn_subset_t *) vsi_nn_LinkListNewNode(sizeof(vsi_nn_subset_t), _init_subset); - + CHECK_PTR_FAIL_GOTO( subset, "null point.", final ); memcpy(&subset->data, row, sizeof(float) * 20); vsi_nn_LinkListPushEnd( @@ -1433,6 +1450,7 @@ static vsi_nn_subset_t *_compute_subset memset(deleteIdx, -1, sizeof(uint32_t) * num); subset = subset_list; + CHECK_PTR_FAIL_GOTO( subset, "null point.", final ); for(i=0,j=0; idata.idx[20 - 1]; @@ -1445,28 +1463,13 @@ static vsi_nn_subset_t *_compute_subset } for(i=0; idata.idx[i]); - } - subset = (vsi_nn_subset_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)subset); - n++; - } - #endif - final: if(deleteIdx)free(deleteIdx); return subset_list; @@ -1499,6 +1502,7 @@ static vsi_nn_connection_t **_compute_all_connetion score_mid = (float *)malloc(sizeof(float) * height * width * score_mid_depth); CHECK_PTR_FAIL_GOTO( score_mid, "Create buffer fail.", final ); connection_all = (vsi_nn_connection_t **)malloc(sizeof(vsi_nn_connection_t *) * mapIdx_len); + CHECK_PTR_FAIL_GOTO( connection_all, "Create buffer fail.", final ); special_k = (int32_t *)malloc(sizeof(int32_t) * mapIdx_len); CHECK_PTR_FAIL_GOTO( special_k, "Create buffer fail.", final ); @@ -1836,6 +1840,7 @@ vsi_status vsi_nn_CMUPose_Post_Process _fill_paf_avg(net_out, config, paf_avg); all_peaks = _compute_all_peaks(heatmap_avg, config, &peak_counter, &peak_list_num); + CHECK_PTR_FAIL_GOTO( all_peaks, "Create buffer fail.", final ); #if 0 for(n=0; nnode_num; i++) { node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)i ); //printf("i[%u] op[%s]\n", i, vsi_nn_OpGetName(node->op)); - if(node->op == VSI_NN_OP_PROPOSAL) + if (node && node->op == VSI_NN_OP_PROPOSAL) { memcpy(¶m->iminfo, &node->nn_param.proposal.im_info, sizeof(vsi_nn_proposal_im_info)); tensor = vsi_nn_GetTensor(graph,node->output.tensors[0]); + CHECK_PTR_FAIL_GOTO( tensor, "Get tensor fail.", final ); + param->rois_num = (uint32_t)tensor->attr.size[1]; } } @@ -164,6 
+165,7 @@ static vsi_status _fill_fasterrcnn_param param->classes_num = VSI_NN_FASTERRCNN_CLASSES_NUM; param->classes = FASTER_RCNN_CLASSES; +final: return status; } /* _fill_fasterrcnn_param() */ @@ -572,6 +574,7 @@ static vsi_status _fasterrcnn_post_process { box = (vsi_nn_fasterrcnn_box_t *) vsi_nn_LinkListNewNode(sizeof(vsi_nn_fasterrcnn_box_t), _init_box); + CHECK_PTR_FAIL_GOTO( box, "Create box fail.", final ); box->score = dets[keep[k]*5+4]; box->class_id = i; box->x1 = dets[keep[k]*5+0]; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c b/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c index 85d862d23..27a3c45c7 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c @@ -34,8 +34,11 @@ static vsi_nn_binary_tree_t * _new_node node = (vsi_nn_binary_tree_t *)malloc( sizeof( vsi_nn_binary_tree_t ) ); + if (node) + { + memset( node, 0, sizeof( vsi_nn_binary_tree_t ) ); + } - memset( node, 0, sizeof( vsi_nn_binary_tree_t ) ); return node; } /* _new_node() */ @@ -181,7 +184,7 @@ void vsi_nn_BinaryTreeRemoveNode vsi_nn_binary_tree_key_t key ) { - if( NULL == root && NULL != *root ) + if ( NULL != root ) { return; } diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 4ce42c95e..d696e8cd5 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -465,6 +465,7 @@ static _op_param_gen_t s_op_gen[] = /* INVERSE_SIGMOID */ NULL, /* GRID_SAMPLE */ NULL, /* LPNORM */ NULL, + /* RESIZE_3D */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); @@ -548,6 +549,10 @@ void vsi_nn_GenGraphCCode node_id = i; } node = vsi_nn_GetNode( graph, node_id ); + if (node == NULL) + { + continue; + } _write_code( "node[%u] = vsi_nn_AppendNode( graph, %#x, NULL );", i, node->op ); for( j = 0; j < node->input.num; j ++ ) @@ -567,7 +572,7 @@ void vsi_nn_GenGraphCCode } } // write node params - if( node->op < _cnt_of_array( s_op_gen ) ) + if( node->op < (vsi_nn_op_t)_cnt_of_array( s_op_gen ) ) { if( NULL != s_op_gen[node->op] ) { diff --git a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c index 95f5cc7fb..22ab7bb47 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c @@ -77,6 +77,8 @@ static const char* _get_qtype_name(vsi_nn_qnt_type_e type) case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: return "ASYM"; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: return "SYM"; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: return "SYMM PC"; + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: return "FP8"; + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: return "FP8 PC"; default: VSILOGE("Unknown quant type: %d\n", type); break; @@ -162,7 +164,9 @@ vsi_bool validate_op_io_types { vsi_bool matched = FALSE; - if(self && self->attr.enable_op_constraint_check) { + VSI_UNREFERENCED(name); + + if(self && self->attr.enable_op_constraint_check && op_constraint_reg) { uint32_t i = 0; int32_t j = 0; int32_t reg_tensor_num = op_constraint_reg->reg_input_num + op_constraint_reg->reg_output_num; @@ -218,14 +222,20 @@ char* generate_op_io_types_desc char* desc = NULL; for(i = 0; i < inputs_num; i++) { - if(inputs[i]) { + if (inputs[i] && + _get_qtype_name(inputs[i]->attr.dtype.qnt_type) && + _get_dtype_name(inputs[i]->attr.dtype.vx_type)) + { total_sz += 
snprintf(NULL, 0, "%s %s, ", _get_qtype_name(inputs[i]->attr.dtype.qnt_type), _get_dtype_name(inputs[i]->attr.dtype.vx_type)); } } for(i = 0; i < outputs_num; i++) { - if(outputs[i]) { + if (outputs[i] && + _get_qtype_name(outputs[i]->attr.dtype.qnt_type) && + _get_dtype_name(outputs[i]->attr.dtype.vx_type)) + { total_sz += snprintf(NULL, 0, "%s %s, ", _get_qtype_name(outputs[i]->attr.dtype.qnt_type), _get_dtype_name(outputs[i]->attr.dtype.vx_type)); @@ -234,17 +244,24 @@ char* generate_op_io_types_desc total_sz += 1; /* terminator */ desc = (char*)malloc(sizeof(char) * total_sz); + CHECK_PTR_FAIL_GOTO( desc, "Create buffer fail.", final ); memset(desc, 0x00, sizeof(char) * total_sz); for(i = 0; i < inputs_num; i++) { - if(inputs[i] && total_sz >= used_sz) { + if (inputs[i] && total_sz >= used_sz && + _get_qtype_name(inputs[i]->attr.dtype.qnt_type) && + _get_dtype_name(inputs[i]->attr.dtype.vx_type)) + { used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ", _get_qtype_name(inputs[i]->attr.dtype.qnt_type), _get_dtype_name(inputs[i]->attr.dtype.vx_type)); } } for(i = 0; i < outputs_num; i++) { - if(outputs[i] && total_sz >= used_sz) { + if (outputs[i] && total_sz >= used_sz && + _get_qtype_name(outputs[i]->attr.dtype.qnt_type) && + _get_dtype_name(outputs[i]->attr.dtype.vx_type)) + { used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ", _get_qtype_name(outputs[i]->attr.dtype.qnt_type), _get_dtype_name(outputs[i]->attr.dtype.vx_type)); @@ -255,6 +272,7 @@ char* generate_op_io_types_desc desc[used_sz - 2] = '\0'; } +final: return desc; } diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c b/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c index f64464962..dfabeed95 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c @@ -4,17 +4,22 @@ #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) void * vsi_nn_dlopen( const char *file, int mode ) { + VSI_UNREFERENCED(file); + VSI_UNREFERENCED(mode); return NULL; } int vsi_nn_dlclose( void *handle ) { + VSI_UNREFERENCED(handle); return -1; } __declspec(noinline) void* vsi_nn_dlsym( void *handle, const char *name ) { + VSI_UNREFERENCED(handle); + VSI_UNREFERENCED(name); return NULL; } diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index 18575b716..ac4aa2ab1 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -116,6 +116,92 @@ static VSI_INLINE_API void _convert_float_to_bfloat16 } } /* _convert_float_to_bfloat16 */ +static VSI_INLINE_API vsi_bool _convert_quant_float8_e4m3_to_float + ( + const uint8_t * buffer, + size_t size, + const float scale, + float * out_buffer + ) +{ + uint32_t i = 0; + if( !buffer || !out_buffer ) + { + return FALSE; + } + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = fp8_e4m3_to_fp32( (uint8_t)buffer[i], scale ); + } + + return TRUE; +} /* _convert_quant_float8_e4m3_to_float */ + +static VSI_INLINE_API vsi_bool _convert_float_to_quant_float8_e4m3 + ( + const float * buffer, + size_t size, + const float scale, + uint8_t * out_buffer + ) +{ + uint32_t i = 0; + if( !buffer || !out_buffer ) + { + return FALSE; + } + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = fp32_to_fp8_e4m3( buffer[i], scale ); + } + + return TRUE; +} /* _convert_float_to_quant_float8_e4m3 */ + +static VSI_INLINE_API vsi_bool _convert_quant_float8_e5m2_to_float + ( + const uint8_t * buffer, + size_t size, + const float scale, + float * 
out_buffer + ) +{ + uint32_t i = 0; + + if( !buffer || !out_buffer ) + { + return FALSE; + } + + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = fp8_e5m2_to_fp32( (uint8_t)buffer[i], scale ); + } + + return TRUE; +} /* _convert_quant_float8_e5m2_to_float */ + +static VSI_INLINE_API vsi_bool _convert_float_to_quant_float8_e5m2 + ( + const float * buffer, + size_t size, + const float scale, + uint8_t * out_buffer + ) +{ + uint32_t i = 0; + if( !buffer || !out_buffer ) + { + return FALSE; + } + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = fp32_to_fp8_e5m2( buffer[i], scale ); + } + + return TRUE; +} /* _convert_float_to_quant_float8_e5m2 */ + #define DEF_DTYPE_CONVERT_QUANTIZE( SRC_NAME, SRC_DTYPE, ROUND, MIN, MAX ) \ vsi_bool vsi_nn_dtype_convert_quantize_##SRC_NAME##_to_float \ ( \ @@ -177,6 +263,15 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel int8_t * out_buffer ) { + VSI_UNREFERENCED(size); + VSI_UNREFERENCED(shape); + VSI_UNREFERENCED(rank); + VSI_UNREFERENCED(scale); + VSI_UNREFERENCED(scale_size); + VSI_UNREFERENCED(zero_point); + VSI_UNREFERENCED(zero_point_size); + VSI_UNREFERENCED(channel_dim); + if( !buffer || !out_buffer ) { return FALSE; @@ -195,6 +290,15 @@ vsi_bool vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float float * out_buffer ) { + VSI_UNREFERENCED(size); + VSI_UNREFERENCED(shape); + VSI_UNREFERENCED(rank); + VSI_UNREFERENCED(scale); + VSI_UNREFERENCED(scale_size); + VSI_UNREFERENCED(zero_point); + VSI_UNREFERENCED(zero_point_size); + VSI_UNREFERENCED(channel_dim); + if( !buffer || !out_buffer ) { return FALSE; @@ -270,6 +374,12 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm case I8: return vsi_nn_dtype_convert_float_to_quantize_symm8( buffer, size, scale, zero_point, (int8_t*)out_buffer ); + case FP8_E4M3: + return _convert_float_to_quant_float8_e4m3( + buffer, size, scale, (uint8_t*)out_buffer ); + case FP8_E5M2: + return _convert_float_to_quant_float8_e5m2( + buffer, size, scale, (uint8_t*)out_buffer ); case I16: return vsi_nn_dtype_convert_float_to_quantize_symm16( buffer, size, scale, zero_point, (int16_t*)out_buffer ); @@ -423,6 +533,12 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float case U8: return vsi_nn_dtype_convert_quantize_asymm8_to_float( (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); + case FP8_E4M3: + return _convert_quant_float8_e4m3_to_float( + (const uint8_t *)buffer, size, scale, out_buffer ); + case FP8_E5M2: + return _convert_quant_float8_e5m2_to_float( + (const uint8_t *)buffer, size, scale, out_buffer ); case U16: return vsi_nn_dtype_convert_quantize_asymm16_to_float( (const uint16_t*)buffer, size, scale, zero_point, out_buffer); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c index 6547f463a..07249e7c4 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c @@ -408,12 +408,15 @@ vsi_bool vsi_nn_QuantCheck VSILOGE("input_fl[%d] + weight_fl[%d] != bias_fl[%d]", input->attr.dtype.fl, weight->attr.dtype.fl, - bias->attr.dtype.fl); + bias ? 
bias->attr.dtype.fl : 0); } break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: - if (weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: + if (weight->attr.dtype.qnt_type == + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC || + weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { ret = vsi_nn_QuantAffinePerchannelCheck(input, weight, bias); if(ret == FALSE) @@ -429,7 +432,7 @@ vsi_bool vsi_nn_QuantCheck VSILOGE("input_scale[%.12lf] * weight_scale[%.12lf] != bias_scale[%.12lf]", input->attr.dtype.scale, weight->attr.dtype.scale, - bias->attr.dtype.scale); + bias ? bias->attr.dtype.scale : 0); } } break; @@ -468,6 +471,7 @@ vsi_bool vsi_nn_DtypeCompare break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: { const float diff = (float)1e-5; if (dtype0->zero_point != dtype1->zero_point) @@ -484,6 +488,7 @@ vsi_bool vsi_nn_DtypeCompare } case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: { const float diff = (float)1e-5; int32_t i = 0; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c b/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c index b576fc1e6..8a8288d86 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c @@ -47,7 +47,11 @@ static _binary_tree_t * _new_node node = (_binary_tree_t *)malloc( sizeof( _binary_tree_t ) ); - memset( node, 0, sizeof( _binary_tree_t ) ); + if (node) + { + memset( node, 0, sizeof( _binary_tree_t ) ); + } + return node; } /* _new_node() */ @@ -395,6 +399,7 @@ void vsi_nn_hashmap_add { iter = (vsi_nn_hashmap_item_t *)vsi_nn_LinkListNewNode( sizeof( vsi_nn_hashmap_item_t ), NULL ); + VSI_ASSERT( iter ); key_size = strlen( hash_key ) + 1; iter->hash_key = (char*)malloc( sizeof(char) * key_size ); VSI_ASSERT( iter->hash_key ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_link_list.c b/src/tim/vx/internal/src/utils/vsi_nn_link_list.c index 053e6e9b5..a2401aaf3 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_link_list.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_link_list.c @@ -27,6 +27,7 @@ #include "vsi_nn_prv.h" #include "utils/vsi_nn_link_list.h" #include "vsi_nn_types.h" +#include "vsi_nn_error.h" static vsi_nn_link_list_t * _walk_to_start ( @@ -239,6 +240,7 @@ vsi_nn_link_list_t * vsi_nn_LinkListNewNode ) { vsi_nn_link_list_t *node = (vsi_nn_link_list_t *)malloc(sz); + CHECK_PTR_FAIL_GOTO( node, "Create node fail.", final ); memset(node, 0, sz); if(init) @@ -246,6 +248,7 @@ vsi_nn_link_list_t * vsi_nn_LinkListNewNode init(node); } +final: return node; } /* vsi_nn_LinkListNewNode() */ diff --git a/src/tim/vx/internal/src/utils/vsi_nn_math.c b/src/tim/vx/internal/src/utils/vsi_nn_math.c index b2aae0586..260646da9 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_math.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_math.c @@ -360,8 +360,11 @@ struct r123array4x32 _philox4x32round(struct r123array4x32 ctr, struct r123array uint32_t hi1; uint32_t lo0 = mulhilo32(PHILOX_M4x32_0, ctr.v[0], &hi0); uint32_t lo1 = mulhilo32(PHILOX_M4x32_1, ctr.v[2], &hi1); - struct r123array4x32 out = {{hi1^ctr.v[1]^key.v[0], lo1, - hi0^ctr.v[3]^key.v[1], lo0}}; + struct r123array4x32 out = { { 0, 0, 0, 0 } }; + out.v[0] = hi1^ctr.v[1]^key.v[0]; + out.v[1] = lo1; + out.v[2] = hi0^ctr.v[3]^key.v[1]; + out.v[3] = lo0; return out; 
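+    /* The four output words above are now filled in by explicit assignments after
+     * zero-initialization instead of a compound initializer; the computed values
+     * (hi1^ctr.v[1]^key.v[0], lo1, hi0^ctr.v[3]^key.v[1], lo0) are unchanged. */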
} diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index e6a766feb..82d1aaaf1 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -306,7 +306,7 @@ vsi_size_t vsi_nn_GetStrideSizeBySize type_bits = vsi_nn_TypeGetBits( type); stride[0] = type_bits / BITS_PER_BYTE; total_bytes = stride[0]; - if( type_bits < BITS_PER_BYTE ) + if( type_bits < BITS_PER_BYTE && type_bits != 0 ) { total_bytes = 1; if( size[0] % (BITS_PER_BYTE / type_bits) == 0 ) @@ -375,6 +375,8 @@ float vsi_nn_DataAsFloat32 val = (float)((int8_t*)data)[0]; break; case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_FLOAT8_E4M3: + case VSI_NN_TYPE_FLOAT8_E5M2: val = (float)data[0]; break; case VSI_NN_TYPE_INT16: @@ -600,6 +602,8 @@ void vsi_nn_ComputePadWithPadType vsi_size_t * out_pad ) { + VSI_UNREFERENCED(in_dim_num); + VSI_UNREFERENCED(rounding); vsi_nn_compute_padding(in_shape, ksize, stride, NULL, pad_type, out_pad); } /* vsi_nn_ComputePadWithPadType() */ @@ -651,6 +655,8 @@ void vsi_nn_ComputePadWithPadTypeForConv1D vsi_size_t * out_pad ) { + VSI_UNREFERENCED(in_dim_num); + VSI_UNREFERENCED(rounding); vsi_nn_compute_padding_conv1d(in_shape, ksize, stride, NULL, pad_type, out_pad); } /* vsi_nn_ComputePadWithPadTypeForConv1D() */ @@ -708,9 +714,10 @@ vsi_bool vsi_nn_CreateTensorGroup vsi_size_t end[VSI_NN_MAX_DIM_NUM]; vsi_nn_tensor_attr_t attr; - if( NULL == graph || NULL == in_tensor + if ( NULL == graph || NULL == in_tensor || NULL == out_tensors || 0 == group_number - || 0 == in_tensor->attr.size[axis] ) + || axis >= VSI_NN_MAX_DIM_NUM || + 0 == in_tensor->attr.size[axis] ) { VSILOGW( "Create tensor group fail." ); return FALSE; @@ -733,13 +740,14 @@ vsi_bool vsi_nn_CreateTensorGroup end[2] = in_tensor->attr.size[2]; end[3] = in_tensor->attr.size[3]; end[axis] = 0; - for( i = 0; i < group_number; i ++ ) { start[axis] = end[axis]; end[axis] += sz; #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT - if ( attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC ) + if (attr.dtype.qnt_type == + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC || + attr.dtype.qnt_type == VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { attr.dtype.scales = in_tensor->attr.dtype.scales + sz * i; attr.dtype.scale_dim = (int32_t)sz; @@ -835,6 +843,7 @@ int32_t vsi_nn_Mkdir int32_t mode ) { + VSI_UNREFERENCED(mode); if(NULL == path) { return -1; @@ -906,6 +915,10 @@ uint8_t * vsi_nn_MallocAlignedBuffer sz = sizeof(aligned_header) + mem_size + align_start_size + align_block_size + END_GUARD_SIZE; raw_addr = (uint8_t *)malloc( sz * sizeof( uint8_t ) ); + if (raw_addr == NULL) + { + return NULL; + } memset(raw_addr, 0, sizeof( uint8_t ) * sz); p = raw_addr + sizeof(aligned_header); @@ -1175,6 +1188,7 @@ vsi_bool vsi_nn_is_same_quant_type( break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: { const float diff = (float)1e-5; if (src_dtype->zero_point != dst_dtype->zero_point) @@ -1190,6 +1204,7 @@ vsi_bool vsi_nn_is_same_quant_type( } case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: { const float diff = (float)1e-5; int32_t i = 0; @@ -1340,6 +1355,7 @@ float vsi_nn_get_tensor_scale break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: scale = tensor->attr.dtype.scale; break; default: @@ -1359,6 +1375,7 @@ 
int32_t vsi_nn_get_tensor_zero_point switch (tensor->attr.dtype.qnt_type) { case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: zero_point = 0; break; case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: @@ -1408,6 +1425,14 @@ void vsi_nn_get_tensor_clamp_min_max *clampMin = - zero_point; *clampMax = 65535 - zero_point; } + else if (vx_type == VSI_NN_TYPE_FLOAT8_E4M3) { + *clampMin = -448; + *clampMax = 448; + } + else if (vx_type == VSI_NN_TYPE_FLOAT8_E5M2) { + *clampMin = -57344; + *clampMax = 57344; + } else { uint32_t f32_min = 0xff800000; diff --git a/src/tim/vx/internal/src/vip/virtual_device.cpp b/src/tim/vx/internal/src/vip/virtual_device.cpp index 88a146a83..2efa849cc 100644 --- a/src/tim/vx/internal/src/vip/virtual_device.cpp +++ b/src/tim/vx/internal/src/vip/virtual_device.cpp @@ -30,7 +30,7 @@ namespace vip { Device::Device(uint32_t id) { id_ = id; graphqueue_ = std::make_unique (); - worker_ = std::make_unique ();; + worker_ = std::make_unique (); ThreadInit(); } @@ -63,6 +63,9 @@ bool Device::ThreadExit() { bool Device::GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data) { bool status = false; + idle_mtx_.lock(); + submit_num_++; + idle_mtx_.unlock(); status = graphqueue_->Submit(graph, func, data); return status; } @@ -72,8 +75,10 @@ bool Device::GraphRemove(const vsi_nn_graph_t* graph) { } void Device::WaitThreadIdle() { - ThreadExit(); - ThreadInit(); + std::unique_lock lock(idle_mtx_); + while (submit_num_ > 0) { + cv_.wait(lock); + } } Worker::Worker() { @@ -108,6 +113,11 @@ void Device::HandleQueue() { break; } worker_->Handle(item); // run graph + + idle_mtx_.lock(); + submit_num_--; + idle_mtx_.unlock(); + cv_.notify_one(); } } diff --git a/src/tim/vx/internal/src/vip/virtual_device_private.h b/src/tim/vx/internal/src/vip/virtual_device_private.h index ed4c6bb68..b0e39a0cc 100644 --- a/src/tim/vx/internal/src/vip/virtual_device_private.h +++ b/src/tim/vx/internal/src/vip/virtual_device_private.h @@ -28,8 +28,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -99,6 +99,9 @@ class Device { std::array threads_; std::unique_ptr graphqueue_; std::unique_ptr worker_; + std::condition_variable cv_; + std::mutex idle_mtx_; + int submit_num_ = 0; }; } // namespace vip diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index 7d7636fd1..99a5e7938 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -151,6 +151,13 @@ static vsi_status vsi_nn_initOptions options->enable_stream_processor = atoi(env_s); } + env_s = NULL; + options->enable_rgb88_planar_nhwc = 0; + if (vsi_nn_getEnv("VSI_NN_FORCE_RGB888_OUT_NHWC", &env_s) && env_s) + { + options->enable_rgb88_planar_nhwc = atoi(env_s); + } + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/vsi_nn_daemon.c b/src/tim/vx/internal/src/vsi_nn_daemon.c index a5b279712..4887368ef 100644 --- a/src/tim/vx/internal/src/vsi_nn_daemon.c +++ b/src/tim/vx/internal/src/vsi_nn_daemon.c @@ -28,11 +28,13 @@ _INITIALIZER( daemon_start ) { + //VSILOGD("OVXLIB init ... "); vsi_nn_kernel_backend_init(); } /* _daemon_start() */ _DEINITIALIZER( daemon_shutdown ) { + //VSILOGD("OVXLIB shutdown ... 
"); vsi_nn_kernel_backend_deinit(); } /* vsi_nn_daemen_shutdown() */ diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index bbfdabcba..c9eed9cd8 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -194,10 +194,10 @@ static vsi_status update_max_node_io vsi_nn_node_id_t *node_list ) { - uint32_t i,max_io; - vsi_status status; + uint32_t i = 0,max_io = 0; + vsi_status status = VSI_FAILURE; vsi_nn_node_id_t node_id; - vsi_nn_node_t *node; + vsi_nn_node_t *node = NULL; status = VSI_SUCCESS; max_io = VSI_NN_MAX_IO_NUM; /* default max node io */ @@ -205,11 +205,12 @@ static vsi_status update_max_node_io { node_id = node_list[i]; node = vsi_nn_GetNode( graph, node_id ); - if(node->input.num > max_io) + + if (node && node->input.num > max_io) { max_io = node->input.num; } - if(node->output.num > max_io) + if (node && node->output.num > max_io) { max_io = node->output.num; } @@ -250,6 +251,8 @@ static vsi_status optimize_node_backward /* Get inputs, outputs. */ node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + vsi_nn_GetTensors( graph, node->input.tensors, node->input.num, inputs ); vsi_nn_GetTensors( graph, node->output.tensors, @@ -301,6 +304,8 @@ static vsi_status optimize_node_forward /* Get inputs, outputs. */ node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + vsi_nn_GetTensors( graph, node->input.tensors, node->input.num, inputs ); vsi_nn_GetTensors( graph, node->output.tensors, @@ -353,6 +358,8 @@ static vsi_status compute_node /* Get inputs, outputs. */ node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + vsi_nn_GetTensors( graph, node->input.tensors, node->input.num, inputs ); vsi_nn_GetTensors( graph, node->output.tensors, @@ -458,6 +465,8 @@ static vsi_status setup_node /* Get inputs, outputs. */ node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + vsi_nn_GetTensors( graph, node->input.tensors, node->input.num, inputs ); vsi_nn_GetTensors( graph, node->output.tensors, @@ -525,6 +534,8 @@ static vsi_status set_graph_precision memset( outputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); /* Get inputs, outputs. 
*/ node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + vsi_nn_GetTensors( graph, node->input.tensors, node->input.num, inputs ); vsi_nn_GetTensors( graph, node->output.tensors, @@ -560,6 +571,9 @@ vsi_nn_graph_t * vsi_nn_CreateGraph vsi_nn_graph_t * graph; graph = NULL; + VSI_UNREFERENCED(max_tensor_num); + VSI_UNREFERENCED(max_node_num); + VSILOGI( "%s", vsi_nn_GetVersion() ); if( NULL == ctx ) @@ -1002,6 +1016,70 @@ vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle return _add_tensor(graph, id, attr, data); } +vsi_nn_tensor_id_t vsi_nn_AddTensorFromView +( + vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t parent_id, + vsi_size_t* start, + vsi_size_t* end +) +{ + uint32_t i = 0; + vx_tensor view_vxt = NULL; + vsi_nn_tensor_t* parent_tensor = NULL; + vsi_nn_tensor_t* new_tensor =NULL; + vsi_nn_tensor_id_t id = VSI_NN_TENSOR_ID_NA; + vsi_nn_tensor_attr_t attr; + + memset(&attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); + parent_tensor = vsi_nn_GetTensor(graph, parent_id); + if (NULL == parent_tensor) + { + VSILOGE("Create view tensor failed, parent tensor is invalid."); + id = VSI_NN_TENSOR_ID_NA; + goto final; + } + + /* new tensor's all attribuites are inherited from parent tensor except 'size' */ + attr = parent_tensor->attr; + for (i = 0; i < attr.dim_num; i++) + { + attr.size[i] = end[i] - start[i]; + } + id = _add_tensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL); + if (VSI_NN_TENSOR_ID_NA == id) + { + VSILOGE("Create view tensor failed, new tensor could not be created."); + goto final; + } + + new_tensor = vsi_nn_GetTensor(graph, id); + if (new_tensor && new_tensor->t) + { + vxReleaseTensor(&(new_tensor->t)); + } + else + { + VSILOGE("Create view tensor failed, new tensor or vxTensor is NULL."); + id = VSI_NN_TENSOR_ID_NA; + goto final; + } + + view_vxt = vsi_nn_CreateViewTensor(graph, start, end, parent_tensor); + if ( NULL != view_vxt) + { + new_tensor->t = view_vxt; + } + else + { + VSILOGE("Create view tensor failed, view vxTensor could not be created."); + id = VSI_NN_TENSOR_ID_NA; + goto final; + } +final: + return id; +} + vsi_nn_tensor_id_t vsi_nn_AttachTensorToGraph ( vsi_nn_graph_t * graph, @@ -1184,6 +1262,8 @@ vsi_nn_node_t * vsi_nn_AddExternalNode vsi_nn_node_id_t id; vsi_nn_op_proc_t * node_proc; + VSI_UNREFERENCED(node_id); + node_proc = (vsi_nn_op_proc_t*)proc; if( NULL == graph ) @@ -1210,12 +1290,25 @@ vsi_nn_node_t * vsi_nn_AddExternalNode node->output.num = node_proc->output_num; node->output.tensors = (vsi_nn_tensor_id_t *) malloc( node_proc->output_num * sizeof( vsi_nn_tensor_id_t ) ); + if ( NULL == node->output.tensors ) + { + VSILOGE("Create output tensor id %s. fail", vsi_nn_OpGetName(op)); + vsi_nn_safe_free(node); + return NULL; + } vsi_nn_InitTensorsId( node->output.tensors, node_proc->output_num ); /* init input struct */ node->input.num = node_proc->input_num; node->input.tensors = (vsi_nn_tensor_id_t *) malloc( node_proc->input_num * sizeof( vsi_nn_tensor_id_t ) ); + if ( NULL == node->input.tensors ) + { + VSILOGE("Create input tensor id %s. 
fail", vsi_nn_OpGetName(op)); + vsi_nn_safe_free(node->output.tensors); + vsi_nn_safe_free(node); + return NULL; + } vsi_nn_InitTensorsId( node->input.tensors, node_proc->input_num ); node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; node->attr.enable_op_constraint_check = TRUE; @@ -1259,11 +1352,16 @@ vsi_bool vsi_nn_SetGraphInputs vsi_bool ret; ret = FALSE; - if( NULL == graph || tensor_num == 0 ) + if( NULL == graph ) { return ret; } + if ( tensor_num == 0 ) + { + return TRUE; + } + graph->input.tensors = (vsi_nn_tensor_id_t *)malloc( tensor_num * sizeof( vsi_nn_tensor_id_t ) ); @@ -1317,10 +1415,10 @@ vsi_nn_node_id_t * vsi_nn_SortGraphNode vsi_nn_graph_t * graph ) { - uint32_t i,j; - uint32_t count; - vsi_bool dirty; - vsi_bool all_tensor_processed; + uint32_t i = 0,j = 0; + uint32_t count = 1; + vsi_bool dirty = TRUE; + vsi_bool all_tensor_processed = FALSE; vsi_bool * tensors = NULL; vsi_nn_node_id_t * nodes = NULL; vsi_nn_node_id_t * sorted_nodes = NULL; @@ -1344,21 +1442,18 @@ vsi_nn_node_id_t * vsi_nn_SortGraphNode /* Init variables. */ tensors = (vsi_bool *)malloc( graph->tensor_num * sizeof( vsi_bool ) ); - - if( NULL == tensors ) - { - goto _SortGraphNodeFinally; - } + CHECK_PTR_FAIL_GOTO( tensors, "Create buffer fail.", final ); + memset(tensors, 0, graph->tensor_num * sizeof( vsi_bool )); sorted_nodes = (vsi_nn_node_id_t *)malloc( graph->node_num * sizeof( vsi_nn_node_id_t ) ); + CHECK_PTR_FAIL_GOTO( sorted_nodes, "Create buffer fail.", final ); + memset(sorted_nodes, 0, graph->node_num * sizeof( vsi_nn_node_id_t )); + nodes = (vsi_nn_node_id_t *)malloc( graph->node_num * sizeof( vsi_nn_node_id_t ) ); - - if( NULL == sorted_nodes || NULL == nodes) - { - goto _SortGraphNodeFinally; - } + CHECK_PTR_FAIL_GOTO( nodes, "Create buffer fail.", final ); + memset(sorted_nodes, 0, graph->node_num * sizeof( vsi_nn_node_id_t )); for( i = 0; i < graph->tensor_num; i++ ) { @@ -1396,6 +1491,8 @@ vsi_nn_node_id_t * vsi_nn_SortGraphNode { node_id = nodes[i]; node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + all_tensor_processed = TRUE; for( j = 0; j < node->input.num; j ++ ) { @@ -1439,17 +1536,17 @@ vsi_nn_node_id_t * vsi_nn_SortGraphNode } } while( count > 0 ); - if( count != 0 ) +final: + + /* Release memory. */ + vsi_nn_safe_free( tensors ); + vsi_nn_safe_free( nodes ); + + if ( count != 0 ) { - free( sorted_nodes ); - sorted_nodes = NULL; + vsi_nn_safe_free( sorted_nodes ); } -_SortGraphNodeFinally: - - /* Release memory. 
*/ - free( tensors ); - free( nodes ); return sorted_nodes; } /* vsi_nn_SortGraphNode() */ @@ -1479,7 +1576,8 @@ uint32_t vsi_nn_GetNodesByUids for( j = 0; j < graph->node_num; j++ ) { node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)j ); - if( node_uids[i] == node->uid ) + + if ( node && node_uids[i] == node->uid ) { nodes[sz] = (vsi_nn_node_id_t)j; sz ++; @@ -1496,6 +1594,7 @@ uint32_t vsi_nn_GetNodesByUids } sz = graph->node_num; } + return sz; } /* vsi_nn_GetNodesByUids() */ @@ -1536,6 +1635,8 @@ void vsi_nn_DumpGraphNodeOutputsEx vsi_nn_node_t * node; vsi_nn_tensor_t * tensor; + VSI_UNREFERENCED(data_fmt); + if(vsi_nn_CheckFilePath(path) == FALSE) { return ; @@ -1576,6 +1677,7 @@ void vsi_nn_DumpGraphNodeOutputsEx for( i = 0; i < node_num; i++ ) { node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)i ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); if( node->internal_node_wksp ) /* dump internal nodes if any */ { @@ -1611,7 +1713,9 @@ void vsi_nn_DumpGraphNodeOutputsEx } } } - free( nodes ); + +final: + vsi_nn_safe_free( nodes ); } /* vsi_nn_DumpGraphNodeOutputsEx */ void vsi_nn_PrintGraph @@ -1728,6 +1832,7 @@ void vsi_nn_DumpGraphToJson /* tensor only 1 input node */ in_node = vsi_nn_GetNode(graph, table[0].node); + CHECK_PTR_FAIL_GOTO( in_node, "Get node fail.", final ); if(j == node->input.num - 1) { fprintf(fp, "\"@uid_%u:out%u\" ", in_node->uid, table[0].index); @@ -1847,6 +1952,7 @@ void vsi_nn_DumpGraphToJson fprintf(fp, "\t}\n}\n"); +final: vsi_nn_ReleaseTensorRelevance(graph, tensor_ref); fclose(fp); } /* vsi_nn_DumpGraphToJson() */ @@ -1959,7 +2065,8 @@ vsi_status vsi_nn_setup_binary_graph_inputs_outputs { vsi_nn_node_t* node = vsi_nn_GetNode(graph, i); uint32_t numParams = 0; - if (node->op == VSI_NN_OP_NBG) + + if (node && node->op == VSI_NN_OP_NBG) { status = vxQueryNode( node->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams)); @@ -1968,13 +2075,14 @@ vsi_status vsi_nn_setup_binary_graph_inputs_outputs vx_parameter param = 0; vx_enum type = 0; param = vxGetParameterByIndex(node->n, j); - status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (type == VX_TYPE_SCALAR) - { - num_of_graph_real_inputs++; - } if (param != NULL) { + status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + num_of_graph_real_inputs++; + } + vxReleaseParameter(¶m); param = NULL; } @@ -1997,44 +2105,50 @@ vsi_status vsi_nn_setup_binary_graph_inputs_outputs for (k = 0; k < graph->node_num; k++) { vsi_nn_node_t* node = vsi_nn_GetNode(graph, k); - if (node->op == VSI_NN_OP_NBG) + + if (node && node->op == VSI_NN_OP_NBG) { vx_parameter param = 0; vx_reference ref = 0; vx_enum type = 0; uint32_t scalar_index = j; param = vxGetParameterByIndex(node->n, scalar_index); - status = vxQueryParameter(param, - VX_PARAMETER_TYPE, - &type, - sizeof(vx_enum)); + if (param != NULL) { - vxReleaseParameter(¶m); - param = NULL; - } - if (type != VX_TYPE_SCALAR) - { - break; - } - for (p = scalar_index; p < scalar_index+4; p++) - { - param = vxGetParameterByIndex(node->n, p); status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (type == VX_TYPE_SCALAR) + vxReleaseParameter(¶m); + param = NULL; + + if (type != VX_TYPE_SCALAR) { - vxQueryParameter(param, - VX_PARAMETER_REF, - &ref, - sizeof(vx_reference)); - graph_inputs[j++] = ref; - vxReleaseReference(&ref); + break; } + } + + for (p = scalar_index; p < scalar_index+4; p++) + { + param = vxGetParameterByIndex(node->n, p); + if (param != NULL) { + 
status = vxQueryParameter(param, + VX_PARAMETER_TYPE, + &type, + sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + vxQueryParameter(param, + VX_PARAMETER_REF, + &ref, + sizeof(vx_reference)); + graph_inputs[j++] = ref; + vxReleaseReference(&ref); + } + vxReleaseParameter(&param); } } @@ -2146,6 +2260,8 @@ void vsi_nn_get_tensor_consumers for(i = 0; i < graph->node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + for(j = 0; j < node->input.num; j++) { if(node->input.tensors[j] == tensor_id) @@ -2159,6 +2275,8 @@ void vsi_nn_get_tensor_consumers } } } + +final: if(count != NULL) { *count = nodes_count; @@ -2177,6 +2295,8 @@ void vsi_nn_get_tensor_provider for(i = 0; i < graph->node_num; i++) { cur_node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( cur_node, "Get node fail.", final ); + for(j = 0; j < cur_node->output.num; j++) { if(cur_node->output.tensors[j] == tensor_id) @@ -2186,6 +2306,9 @@ void vsi_nn_get_tensor_provider } } } + +final: + return; } /* vsi_nn_get_tensor_provider() */ vsi_status vsi_nn_SetGraphPreloadSize @@ -2198,6 +2321,10 @@ vsi_status vsi_nn_SetGraphPreloadSize vsi_status status; status = VSI_FAILURE; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(attr); + VSI_UNREFERENCED(size); + #if(defined(VX_PRELOAD_CONST_TENSOR_SUPPORT) && VX_PRELOAD_CONST_TENSOR_SUPPORT) if(graph && graph->g) { @@ -2259,6 +2386,8 @@ vsi_status vsi_nn_SetGraphPriority ) { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(priority); #ifdef VX_GRAPH_PREEMPTION_SUPPORT if(graph && graph->g) { diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index 05b2d2fc1..aafc89038 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -27,7 +27,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" -#include "vsi_nn_test.h" +#include "vsi_nn_error.h" static vsi_bool _is_asymm_int8_norm_tensor @@ -88,6 +88,8 @@ static vsi_status _add_forward_node uint32_t i = 0; uint32_t j = 0; + VSI_UNREFERENCED(graph); + /* Reconnect node tensors */ for(i = 0; i < nodes_count; i++) { @@ -117,8 +119,10 @@ static vsi_status _add_backward_node { uint32_t i = 0; + VSI_UNREFERENCED(graph); + /* Reconnect node output tensors */ - for(i = 0; i < (int32_t)last_node->output.num; i++) + for(i = 0; i < last_node->output.num; i++) { if(last_node->output.tensors[i] == output) { @@ -188,10 +192,13 @@ static void _get_graph_input_asymm_int8_norm_tensor for(i = 0; i < graph->node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + for(j = 0; j < node->input.num; j++) { vsi_nn_tensor_id_t id = node->input.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + if (_is_asymm_int8_norm_tensor(tensor)) { if(tensor_ids != NULL) @@ -211,6 +218,7 @@ static void _get_graph_input_asymm_int8_norm_tensor } } +final: if(count != NULL) { *count = tensor_count; @@ -236,10 +244,13 @@ static void _get_graph_output_asymm_int8_norm_tensor for(i = 0; i < graph->node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + for(j = 0; j < node->output.num; j++) { vsi_nn_tensor_id_t id = node->output.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + if (_is_asymm_int8_norm_tensor(tensor)) { if(tensor_ids != NULL) @@ -251,6 +262,7 @@ static void
_get_graph_output_asymm_int8_norm_tensor } } +final: if(count != NULL) { *count = tensor_count; @@ -280,11 +292,16 @@ static vsi_status _add_graph_dataconvert_for_int8 if(input_count != 0) { input_ids = (vsi_nn_tensor_id_t *)malloc(sizeof(vsi_nn_tensor_id_t) * input_count); + CHECK_PTR_FAIL_GOTO( input_ids, "Create tensor id fail.", final ); + memset(input_ids, 0, sizeof(vsi_nn_tensor_id_t) * input_count); + _get_graph_input_asymm_int8_norm_tensor(graph, NULL, input_ids, &input_valid_count); if ( input_valid_count > 0 ) { input_nodes = (vsi_nn_node_t***)malloc(sizeof(vsi_nn_node_t**) * input_valid_count); + CHECK_PTR_FAIL_GOTO( input_nodes, "Create node fail.", final ); + memset(input_nodes, 0, sizeof(vsi_nn_node_t**) * input_valid_count); } for ( i = 0; i < input_valid_count; i++) @@ -295,6 +312,9 @@ static vsi_status _add_graph_dataconvert_for_int8 if(nodes_count > 0) { input_nodes[i] = (vsi_nn_node_t**)malloc(sizeof(vsi_nn_node_t*)*nodes_count); + CHECK_PTR_FAIL_GOTO( input_nodes[i], "Create node fail.", final ); + memset(input_nodes[i], 0, sizeof(vsi_nn_node_t*) * nodes_count); + vsi_nn_get_tensor_consumers(graph, input_ids[i], input_nodes[i], NULL); *dirty = TRUE; @@ -307,9 +327,14 @@ static vsi_status _add_graph_dataconvert_for_int8 if(output_count > 0) { output_ids = (vsi_nn_tensor_id_t*)malloc(sizeof(vsi_nn_tensor_id_t) * output_count); + CHECK_PTR_FAIL_GOTO( output_ids, "Create tensor id fail.", final ); + memset(output_ids, 0, sizeof(vsi_nn_tensor_id_t) * output_count); + _get_graph_output_asymm_int8_norm_tensor(graph, NULL, output_ids); output_nodes = (vsi_nn_node_t**)malloc(sizeof(vsi_nn_node_t*) * output_count); + CHECK_PTR_FAIL_GOTO( output_nodes, "Create node fail.", final ); + memset(output_nodes, 0, sizeof(vsi_nn_node_t*) * output_count); for ( i = 0; i < output_count; i++) { @@ -325,33 +350,25 @@ static vsi_status _add_graph_dataconvert_for_int8 uint32_t nodes_count = 0; vsi_nn_get_tensor_consumers(graph, input_ids[i], NULL, &nodes_count); - if(nodes_count != 0) + if (nodes_count > 0) { vsi_nn_tensor_id_t id = input_ids[i]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - vsi_nn_tensor_id_t output; + vsi_nn_tensor_id_t output = 0; - memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); - attr.dtype.vx_type = VSI_NN_TYPE_UINT8; - attr.dtype.zero_point += 128; - attr.vtl = TRUE; - output = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); - - _add_dataconvert_node(graph, dataconvert_idx ++, VSI_NN_OPTIMIZE_FORWARD, - input_nodes[i], nodes_count, id, output); - } - if (input_nodes[i] != NULL) - { - free(input_nodes[i]); - input_nodes[i] = NULL; + if (tensor) + { + memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.dtype.zero_point += 128; + attr.vtl = TRUE; + output = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); + + _add_dataconvert_node(graph, dataconvert_idx ++, VSI_NN_OPTIMIZE_FORWARD, + input_nodes[i], nodes_count, id, output); + } } } - - if(input_nodes) - { - free(input_nodes); - input_nodes = NULL; - } } if ( output_count > 0 ) @@ -360,35 +377,36 @@ static vsi_status _add_graph_dataconvert_for_int8 { vsi_nn_tensor_id_t id = output_ids[i]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - vsi_nn_tensor_id_t input; - - memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); - attr.dtype.vx_type = VSI_NN_TYPE_UINT8; - attr.dtype.zero_point += 128; - attr.vtl = TRUE; - input = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); + 
vsi_nn_tensor_id_t input = 0; - _add_dataconvert_node(graph, dataconvert_idx ++, VSI_NN_OPTIMIZE_BACKWARD, - &output_nodes[i], 1, input, id); + if (tensor) + { + memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.dtype.zero_point += 128; + attr.vtl = TRUE; + input = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); + + _add_dataconvert_node(graph, dataconvert_idx ++, VSI_NN_OPTIMIZE_BACKWARD, + &output_nodes[i], 1, input, id); + } } + } - if(output_nodes) +final: + for ( i = 0; i < input_valid_count; i++) + { + if (input_nodes) { - free(output_nodes); - output_nodes = NULL; + vsi_nn_safe_free(input_nodes[i]); } } + vsi_nn_safe_free(input_nodes); - if (input_ids) - { - free(input_ids); - input_ids = NULL; - } - if (output_ids) - { - free(output_ids); - output_ids = NULL; - } + vsi_nn_safe_free(output_nodes); + + vsi_nn_safe_free(input_ids); + vsi_nn_safe_free(output_ids); return status; } /* _add_graph_dataconvert_for_int8() */ @@ -402,7 +420,7 @@ static vsi_status _add_graph_data_convert vsi_status status = VSI_FAILURE; status = _add_graph_dataconvert_for_int8(graph, dirty); - TEST_CHECK_STATUS(status, final); + CHECK_STATUS_FAIL_GOTO(status, final); final: return status; @@ -510,7 +528,7 @@ vsi_status vsi_nn_CopyDataToRawTensor } else { - status = vsi_nn_copy_tensor_patch(tensor, &attr, data, VX_WRITE_ONLY); + status = vsi_nn_copy_tensor_patch(tensor, &attr, data, VX_WRITE_ONLY, NULL, NULL); } _try_set_const_raw_tensor(tensor, attr); @@ -537,11 +555,11 @@ static vx_tensor _create_const_raw_tensor params.num_of_dims = attr.dim_num; for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_vxsize[i] = -1 == attr.size[i] ? -1 : (vx_size)attr.size[i]; + size_vxsize[i] = (vsi_size_t)-1 == attr.size[i] ? (vx_size)-1 : (vx_size)attr.size[i]; } for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_u32[i] = -1 == attr.size[i] ? -1 : (vx_uint32)attr.size[i]; + size_u32[i] = (vsi_size_t)-1 == attr.size[i] ? (vx_uint32)-1 : (vx_uint32)attr.size[i]; } #ifdef VSI_40BIT_VA_SUPPORT params.sizes = size_vxsize; @@ -558,14 +576,19 @@ static vx_tensor _create_const_raw_tensor params.quant_data.dfp.fixed_point_pos = (uint8_t)attr.dtype.fl; break; case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: params.quant_data.affine.scale = attr.dtype.scale; params.quant_data.affine.zeroPoint = (int32_t)attr.dtype.zero_point; break; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT // This is a hack that driver doesn't support const scale scales = (float *)malloc(sizeof(float) * attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); zeroPoints = (int32_t *)malloc(sizeof(attr.dtype.zero_points[0]) * attr.dtype.zero_points_dim); + CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final ); memcpy(scales, attr.dtype.scales, attr.dtype.scale_dim * sizeof(float)); memcpy(zeroPoints, attr.dtype.zero_points, attr.dtype.zero_points_dim * sizeof(attr.dtype.zero_points[0])); params.quant_data.affinePerChannel.channelDim = attr.dtype.channel_dim; @@ -575,7 +598,8 @@ static vx_tensor _create_const_raw_tensor params.quant_data.affinePerChannel.zeroPointCount = attr.dtype.zero_points_dim; break; #else - VSILOGE( "can't support qnt_type VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC." 
); + VSILOGE( "can't support qnt_type VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC" + "or VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8." ); #endif default: break; @@ -622,7 +646,7 @@ static vx_tensor _create_const_raw_tensor vx_size stride_size_vxsize[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(attr.size); i++) { - size[i] = -1 == attr.size[i] ? -1 : (vx_size)attr.size[i]; + size[i] = (vsi_size_t)-1 == attr.size[i] ? (vx_size)-1 : (vx_size)attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { @@ -630,6 +654,7 @@ static vx_tensor _create_const_raw_tensor } addr = vxCreateTensorAddressing(graph->ctx->c, size, stride_size_vxsize, (vx_size)attr.dim_num); + CHECK_PTR_FAIL_GOTO( addr, "Create tensor address fail.", final ); } #else { @@ -637,14 +662,16 @@ static vx_tensor _create_const_raw_tensor uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(attr.size); i++) { - size_32bit[i] = -1 == attr.size[i] ? -1 : (uint32_t)attr.size[i]; + size_32bit[i] = (vsi_size_t)-1 == attr.size[i] ? (uint32_t)-1 : (uint32_t)attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { - stride_size_32bit[i] = -1 == stride_size[i] ? -1 : (uint32_t)stride_size[i]; + stride_size_32bit[i] = (vsi_size_t)-1 == stride_size[i] ? \ + (uint32_t)-1 : (uint32_t)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, size_32bit, stride_size_32bit, (vx_uint8)attr.dim_num); + CHECK_PTR_FAIL_GOTO( addr, "Create tensor address fail.", final ); } #endif #ifdef VX_13_NN_COMPATIBLITY @@ -687,18 +714,12 @@ static vx_tensor _create_const_raw_tensor } final: - if( NULL == tensor ) + if ( NULL == tensor ) { VSILOGE( "Create vx tensor fail." ); } - if( scales ) - { - free( scales ); - } - if (zeroPoints) - { - free( zeroPoints ); - } + vsi_nn_safe_free(scales); + vsi_nn_safe_free(zeroPoints); return tensor; } /* _create_const_raw_tensor() */ @@ -745,20 +766,23 @@ static void _convert_const_I8toU8 { uint8_t * data = NULL; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - vsi_nn_tensor_attr_t *attr = &tensor->attr; + vsi_nn_tensor_attr_t *attr = NULL; vsi_size_t sz = 0; vsi_size_t i = 0; + CHECK_PTR_FAIL_GOTO( tensor, "Get tensor fail.", final ); + attr = &tensor->attr; + sz = vsi_nn_GetElementNum( tensor ); data = vsi_nn_ConvertTensorToData( graph, tensor ); - if( NULL == data ) + if ( NULL == data ) { VSILOGE( "Convert data fail." 
); return ; } - for( i = 0; i < sz; i++ ) + for ( i = 0; i < sz; i++ ) { data[i] = data[i] ^ 0x80; } @@ -769,6 +793,7 @@ static void _convert_const_I8toU8 if ( tensor->t ) vxReleaseTensor(&tensor->t); tensor->t = vsi_nn_CreateRawTensorFromData(graph, data, attr); +final: vsi_nn_safe_free( data ); }/* _convert_const_I8toU8() */ @@ -777,7 +802,7 @@ static vsi_status _convert_graph_const_tensor vsi_nn_graph_t* graph ) { - vsi_status status = VSI_SUCCESS; + vsi_status status = VSI_FAILURE; uint32_t node_num = graph->node_num; vsi_nn_node_t* node = NULL; uint32_t i = 0; @@ -786,6 +811,8 @@ static vsi_status _convert_graph_const_tensor for(i = 0; i < node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + for(j = 0; j < node->input.num; j++) { vsi_nn_tensor_id_t id = node->input.tensors[j]; @@ -797,7 +824,9 @@ static vsi_status _convert_graph_const_tensor } } } + status = VSI_SUCCESS; +final: return status; } /* _convert_graph_const_tensor() */ @@ -829,23 +858,26 @@ static vsi_status _convert_graph_virtual_tensor for(i = 0; i < node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + for(j = 0; j < node->input.num; j++) { - vsi_nn_tensor_id_t id = node->input.tensors[j]; - vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + vsi_nn_tensor_id_t id = node->input.tensors[j]; + vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - status = _convert_virtual_tensor_attr(tensor); + status = _convert_virtual_tensor_attr(tensor); } for(j = 0; j < node->output.num; j++) { - vsi_nn_tensor_id_t id = node->output.tensors[j]; - vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + vsi_nn_tensor_id_t id = node->output.tensors[j]; + vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - status = _convert_virtual_tensor_attr(tensor); + status = _convert_virtual_tensor_attr(tensor); } } +final: return status; } /* _convert_graph_virtual_tensor() */ @@ -857,13 +889,13 @@ static vsi_status _graph_optimization_convert_int8_to_uint8 { vsi_status status = VSI_FAILURE; status = _convert_graph_virtual_tensor(graph); - TEST_CHECK_STATUS(status, final); + CHECK_STATUS_FAIL_GOTO(status, final); status = _convert_graph_const_tensor(graph); - TEST_CHECK_STATUS(status, final); + CHECK_STATUS_FAIL_GOTO(status, final); status = _add_graph_data_convert(graph, dirty); - TEST_CHECK_STATUS(status, final); + CHECK_STATUS_FAIL_GOTO(status, final); final: return status; @@ -875,13 +907,15 @@ vsi_status vsi_nn_OptimizeGraph vsi_bool *dirty ) { - vsi_status status = VSI_SUCCESS; + vsi_status status = VSI_FAILURE; uint32_t i = 0; vsi_bool nbg_flag = FALSE; vsi_nn_node_t* node = NULL; for(i = 0; i < graph->node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + if(node->op == VSI_NN_OP_NBG) { nbg_flag = TRUE; @@ -889,10 +923,12 @@ vsi_status vsi_nn_OptimizeGraph } } + status = VSI_SUCCESS; + if (!nbg_flag && graph->ctx->options.enable_asymi8_to_u8) { status = _graph_optimization_convert_int8_to_uint8(graph, dirty); - TEST_CHECK_STATUS(status, final); + CHECK_STATUS_FAIL_GOTO(status, final); } final: diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index 24265a11b..ff5b1cce0 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -41,9 +41,9 @@ /********************************************************** * MACROS 
**********************************************************/ -#define LINKLIST_APPEND( _HEAD, _ITEM ) do { \ +#define LINKLIST_APPEND( _HEAD, _ITEM ) { \ vsi_nn_LinkListPushEnd((vsi_nn_link_list_t **)&(_HEAD), \ - (vsi_nn_link_list_t *)(_ITEM) ); } while( 0 ) + (vsi_nn_link_list_t *)(_ITEM) ); } #define WKSP(_NODE_PTR) ((vsi_nn_internal_node_wksp_t *) \ ((_NODE_PTR)->internal_node_wksp)) @@ -214,6 +214,7 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor { case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: scale = input_attr->dtype.scale; break; @@ -235,6 +236,7 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor { case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: attr.dtype.scale = weight_attr->dtype.scale * scale; attr.dtype.zero_point = 0; attr.dtype.qnt_type = weight_attr->dtype.qnt_type; @@ -702,22 +704,48 @@ vsi_status vsi_nn_internal_optimize_node { vsi_status status = VSI_SUCCESS; vsi_nn_internal_node_t* curr = NULL; + int32_t n = 0; curr = WKSP(node)->nodes; - while( NULL != curr ) + n = (int32_t)vsi_nn_LinkListGetNodeNumber((vsi_nn_link_list_t *)WKSP(node)); + + if (direction == VSI_NN_OPTIMIZE_BACKWARD) { - VSILOGD("Optimize node uid[%u] sub_uid[%u] op[%s]", - node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); + int32_t i = 0; - status = vsi_nn_OpOptimize( curr->node->op, curr->node, - curr->inputs, curr->outputs, direction ); - if( VSI_SUCCESS != status ) + for ( i = n - 1; i >= 0; i-- ) { - VSILOGE("op_optimize fail %d", curr->node->op); - break; + curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)WKSP(node), i); + VSILOGD("Optimize backward for node uid[%u] sub_uid[%u] op[%s]", + node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); + + status = vsi_nn_OpOptimize( curr->node->op, curr->node, + curr->inputs, curr->outputs, direction ); + if ( VSI_SUCCESS != status ) + { + VSILOGE("op_optimize backward fail %d", curr->node->op); + break; + } + } + } + else + { + while( NULL != curr ) + { + VSILOGD("Optimize forward for node uid[%u] sub_uid[%u] op[%s]", + node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); - curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)curr ); + status = vsi_nn_OpOptimize( curr->node->op, curr->node, + curr->inputs, curr->outputs, direction ); + if( VSI_SUCCESS != status ) + { + VSILOGE("op_optimize forward fail %d", curr->node->op); + break; + } + + curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)curr ); + } } return status; diff --git a/src/tim/vx/internal/src/vsi_nn_kernel_prv.h b/src/tim/vx/internal/src/vsi_nn_kernel_prv.h index fa01a5e37..76b1cc01f 100644 --- a/src/tim/vx/internal/src/vsi_nn_kernel_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_kernel_prv.h @@ -55,6 +55,12 @@ vsi_bool vsi_nn_is_sp_supported_broadcast vsi_nn_tensor_t* output ); +vsi_bool vsi_nn_kernel_optimize_element_shape_with_max_rank + ( + const vsi_size_t* shape_x, const vsi_size_t rank_x, + vsi_size_t* out_shape_x, vsi_size_t* out_rank_x, vsi_size_t max_rank + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/src/vsi_nn_node.c b/src/tim/vx/internal/src/vsi_nn_node.c index f13e80b67..4ffd68769 100644 --- a/src/tim/vx/internal/src/vsi_nn_node.c +++ b/src/tim/vx/internal/src/vsi_nn_node.c @@ -48,7 +48,7 @@ vsi_nn_node_t * vsi_nn_NewNode if(NULL == graph || FALSE == 
vsi_nn_OpIsValid(op)) { VSILOGE("Create node %s. fail", vsi_nn_OpGetName(op)); - return NULL; + goto final; } node = (vsi_nn_node_t *)malloc( sizeof( vsi_nn_node_t ) ); @@ -73,23 +73,41 @@ vsi_nn_node_t * vsi_nn_NewNode node->output.num = (uint32_t)output_num; node->output.tensors = (vsi_nn_tensor_id_t *) malloc( output_num * sizeof( vsi_nn_tensor_id_t ) ); + if (NULL == node->output.tensors) + { + goto final; + } vsi_nn_InitTensorsId( node->output.tensors, (uint32_t)output_num ); /* init input struct */ node->input.num = (uint32_t)input_num; node->input.tensors = (vsi_nn_tensor_id_t *) malloc( input_num * sizeof( vsi_nn_tensor_id_t ) ); + if (NULL == node->input.tensors) + { + goto final; + } vsi_nn_InitTensorsId( node->input.tensors, (uint32_t)input_num ); node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; node->attr.enable_op_constraint_check = TRUE; } else { - return NULL; + goto final; } node->uid = VSI_NN_NODE_UID_NA; + return node; +final: + if (node) + { + vsi_nn_safe_free(node->output.tensors); + vsi_nn_safe_free(node->input.tensors); + } + vsi_nn_safe_free(node); + + return NULL; } /* vsi_nn_NewNode() */ /* @@ -214,6 +232,8 @@ vsi_status vsi_nn_update_node_attr { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(node); + #if(defined(VX_PRELOAD_CONST_TENSOR_SUPPORT) && VX_PRELOAD_CONST_TENSOR_SUPPORT) if(node) { diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index b3e2ef191..b7f8b706e 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -205,13 +205,14 @@ static _node_template s_template[] = /* MAXUNPOOL */ NULL, /* REVERSESEQUENCE */ NULL, /* LPNORM */ NULL, + /* RESIZE_3D */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); void vsi_nn_apply_node_attr_template ( vsi_nn_node_t * node ) { - if( node->op >= _cnt_of_array( s_template ) ) + if( node->op >= (vsi_nn_op_t)_cnt_of_array( s_template ) ) { VSILOGW( "Unsupport operation id %d.", node->op ); return; diff --git a/src/tim/vx/internal/src/vsi_nn_ops.c b/src/tim/vx/internal/src/vsi_nn_ops.c index 8ca7df26e..b706240c6 100644 --- a/src/tim/vx/internal/src/vsi_nn_ops.c +++ b/src/tim/vx/internal/src/vsi_nn_ops.c @@ -298,6 +298,9 @@ void vsi_nn_OpGetIoNum ) { const vsi_nn_op_proc_t * proc; + + VSI_UNREFERENCED(node); + proc = vsi_nn_OpGetProc( op ); if( NULL != proc ) { diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index 63c80f112..265d9221d 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -170,26 +170,46 @@ static void _set_preproc_node_rect_params static void _set_preproc_node_norm_params ( vsi_nn_node_t* node, - vsi_nn_preprocess_mean_and_scale_t* mean_and_scale, - vsi_nn_tensor_attr_t* attr + vsi_nn_preprocess_type_e type, + void* mean_and_scale ) { int32_t i = 0; if(mean_and_scale != NULL) { - for(i = 0; i < mean_and_scale->channel_len; i++) + if (type == VSI_NN_PREPROCESS_MEAN_AND_SCALE) { - node->nn_param.pre_process.norm.mean[i] = mean_and_scale->channel_mean[i]; + vsi_nn_preprocess_mean_and_scale_t* means_and_single_scale = + (vsi_nn_preprocess_mean_and_scale_t*)mean_and_scale; + node->nn_param.pre_process.norm2.scale[0] = means_and_single_scale->scale; + node->nn_param.pre_process.norm2.scale[1] = means_and_single_scale->scale; + node->nn_param.pre_process.norm2.scale[2] = 
means_and_single_scale->scale; + for(i = 0; i < means_and_single_scale->channel_len; i++) + { + node->nn_param.pre_process.norm.mean[i] = means_and_single_scale->channel_mean[i]; + } + } + else if (type == VSI_NN_PREPROCESS_MEANS_AND_SCALES) + { + vsi_nn_preprocess_means_and_scales_t* means_and_scales = + (vsi_nn_preprocess_means_and_scales_t*)mean_and_scale; + for (i = 0; i < means_and_scales->scale_len; i++) + { + node->nn_param.pre_process.norm2.scale[i] = means_and_scales->scale[i]; + } + for(i = 0; i < means_and_scales->channel_len; i++) + { + node->nn_param.pre_process.norm.mean[i] = means_and_scales->channel_mean[i]; + } + } - node->nn_param.pre_process.norm.scale = mean_and_scale->scale; } else { - for(i = 0; i < (int32_t)attr->dim_num - 1; i++) + for(i = 0; i < 3; i++) { node->nn_param.pre_process.norm.mean[i] = 0; + node->nn_param.pre_process.norm2.scale[i] = 1.0f; } - node->nn_param.pre_process.norm.scale = 1.0f; } } /* _set_preproc_node_norm_params() */ @@ -268,7 +288,7 @@ static void _set_preproc_node_input_attr if(*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY) { - if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC && input_size != NULL) { input_attr->size[0] = input_size->w; input_attr->size[1] = input_size->h; @@ -453,7 +473,7 @@ vsi_status vsi_nn_add_single_preproc_node vsi_nn_node_t* node = NULL; vsi_nn_preprocess_image_size_t* input_size = NULL; vsi_nn_preprocess_crop_t* crop = NULL; - vsi_nn_preprocess_mean_and_scale_t* mean_and_scale = NULL; + void* mean_and_scale = NULL; vsi_nn_preprocess_permute_t* permute = NULL; vsi_nn_preprocess_image_resize_t* image_resize = NULL; vsi_nn_preprocess_dtype_convert_t* data_convert = NULL; @@ -462,6 +482,7 @@ vsi_status vsi_nn_add_single_preproc_node vsi_nn_tensor_id_t preproc_inputs[3] = {0}; vsi_nn_tensor_id_t preproc_output; vsi_nn_tensor_t* org_norm_tensor = NULL; + vsi_nn_preprocess_type_e mean_and_scale_type = VSI_NN_PREPROCESS_MEAN_AND_SCALE; uint32_t node_input_num = 1; int32_t reverse_channel = 0; uint32_t i = 0; @@ -501,6 +522,11 @@ vsi_status vsi_nn_add_single_preproc_node else if(preprocess[idx].type == VSI_NN_PREPROCESS_IMAGE_SIZE) input_size = (vsi_nn_preprocess_image_size_t*)preprocess[idx].param; + else if(preprocess[idx].type == VSI_NN_PREPROCESS_MEANS_AND_SCALES) + { + mean_and_scale = (vsi_nn_preprocess_means_and_scales_t*)preprocess[idx].param; + mean_and_scale_type = VSI_NN_PREPROCESS_MEANS_AND_SCALES; + } else { VSILOGE("preprocess[%d] type is not support, please have a check!", idx); @@ -509,13 +535,20 @@ vsi_status vsi_nn_add_single_preproc_node } } - if(source_layout == NULL) + if (source_layout == NULL) { VSILOGE("Preprocess source layout need to be set!"); status = VSI_FAILURE; TEST_CHECK_STATUS(status, final); } + if (source_format == NULL) + { + VSILOGE("Preprocess source format needs to be set!"); + status = VSI_FAILURE; + TEST_CHECK_STATUS(status, final); + } + /* Add preprocess node */ if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || @@ -530,6 +563,7 @@ vsi_status vsi_nn_add_single_preproc_node } node = vsi_nn_AddNode(graph, VSI_NN_OP_PRE_PROCESS, node_input_num, 1, NULL); + TEST_CHECK_PTR(node, final); node->uid = (uint32_t)(VSI_NN_PREPROC_NODE_UID_BASE) + input_idx; /* Set preprocess node parameters */ @@ -537,7 +571,7 @@ vsi_status vsi_nn_add_single_preproc_node TEST_CHECK_STATUS(status, final);
_set_preproc_node_rect_params(node, crop, input_size, source_format); - _set_preproc_node_norm_params(node, mean_and_scale, &org_norm_tensor->attr); + _set_preproc_node_norm_params(node, mean_and_scale_type, mean_and_scale); if(permute != NULL) { @@ -698,7 +732,17 @@ vsi_status vsi_nn_add_single_postproc_node } /* Reconnect node tensors */ + if (NULL == node->input.tensors) + { + status = VSI_FAILURE; + goto final; + } node->input.tensors[0] = postproc_input; + if (NULL == node->output.tensors) + { + status = VSI_FAILURE; + goto final; + } node->output.tensors[0] = postproc_output; for(i = 0; i < last_node->output.num; i++) { @@ -800,7 +844,7 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam uint32_t num_of_graph_outputs; uint32_t num_of_graph_real_outputs; vx_reference* graph_outputs = NULL; - vsi_nn_tensor_t* tensor; + vsi_nn_tensor_t* tensor = NULL; vsi_nn_node_t** nodes = NULL; vsi_nn_node_t* node = NULL; vsi_nn_node_id_t* processed_node_id_list = NULL; @@ -866,11 +910,13 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam } } } + vsi_nn_safe_free(nodes); } } graph_inputs = (vx_reference*)malloc(num_of_graph_real_inputs * sizeof(vx_reference)); TEST_CHECK_PTR( graph_inputs, final ); + memset(graph_inputs, 0, num_of_graph_inputs * sizeof(vx_reference)); memset(processed_node_id_list, 0, num_of_graph_inputs * sizeof(vsi_nn_node_id_t)); processed_idx = 0; for (i = 0, j=0; i < num_of_graph_inputs; i++) @@ -879,6 +925,7 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam vsi_bool enabled = FALSE; uint32_t nodes_count = 0; tensor = vsi_nn_GetTensor(graph, graph->input.tensors[i]); + TEST_CHECK_PTR( tensor, final ); vsi_nn_get_tensor_consumers(graph, graph->input.tensors[i], NULL, &nodes_count); if (nodes_count != 0) { @@ -937,19 +984,22 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam vx_enum data_type = 0; param = vxGetParameterByIndex(prenode, p); - vxQueryParameter(param, - VX_PARAMETER_TYPE, - &type, - sizeof(vx_enum)); - vxQueryParameter(param, - VX_PARAMETER_DIRECTION, - &direction, - sizeof(vx_enum)); - if (direction != VX_INPUT) continue; - vxQueryParameter(param, - VX_PARAMETER_REF, - &ref, - sizeof(vx_reference)); + if (param) + { + vxQueryParameter(param, + VX_PARAMETER_TYPE, + &type, + sizeof(vx_enum)); + vxQueryParameter(param, + VX_PARAMETER_DIRECTION, + &direction, + sizeof(vx_enum)); + if (direction != VX_INPUT) continue; + vxQueryParameter(param, + VX_PARAMETER_REF, + &ref, + sizeof(vx_reference)); + } if (type == VX_TYPE_TENSOR) { graph_inputs[j++] = ref; @@ -986,6 +1036,7 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam } } } + vsi_nn_safe_free(nodes); } } num_of_graph_outputs = graph->output.num; @@ -1003,6 +1054,8 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam } graph_outputs = (vx_reference*)malloc(num_of_graph_real_outputs * sizeof(vx_reference)); TEST_CHECK_PTR( graph_outputs, final ); + memset(graph_outputs, 0, num_of_graph_real_outputs * sizeof(vx_reference)); + for (i = 0, j = 0; i < num_of_graph_outputs; i++) { tensor = vsi_nn_GetTensor(graph, graph->output.tensors[i]); @@ -1063,7 +1116,7 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph uint32_t i, j; uint32_t numParams = 0; int32_t scalar_value[4] = {0}; - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; uint32_t input_idx = enabled_crop_input_idx; scalar_value[0] = (int32_t)((crop_w << 15) / dst_w); scalar_value[1] = (int32_t)((crop_h << 15) / dst_h); @@ -1073,7 +1126,7 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph for (i = 0; i < graph->node_num; i++) { 
vsi_nn_node_t* node = vsi_nn_GetNode(graph, i); - if (node->op == VSI_NN_OP_NBG) + if (node && node->op == VSI_NN_OP_NBG) { vx_parameter param = 0; vx_enum type = 0; @@ -1081,16 +1134,19 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph uint32_t scalar_idx = 0; uint32_t scalar_value_idx = 0; int32_t temp_value = 0; - status = vxQueryNode(node->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams)); + status |= vxQueryNode(node->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams)); for (j = 0; j < numParams; j++) { param = vxGetParameterByIndex(node->n, j); - status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (type == VX_TYPE_SCALAR) + + if (param) { - scalar_idx = j; - break; + status |= vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + scalar_idx = j; + break; + } } } while (input_idx > 0) { @@ -1099,12 +1155,15 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph for (j = tensor_idx; j < numParams; j++) { param = vxGetParameterByIndex(node->n, j); - status = vxQueryParameter( - param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (type == VX_TYPE_SCALAR) + if (param) { - scalar_idx = j; - break; + status |= vxQueryParameter( + param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + scalar_idx = j; + break; + } } } input_idx--; @@ -1113,12 +1172,15 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph { temp_value = scalar_value[scalar_value_idx++]; param = vxGetParameterByIndex(node->n, j); - status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (type == VX_TYPE_SCALAR) + if (param) { - status = vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); - status = vxWriteScalarValue((vx_scalar)ref, &temp_value); - status = vxSetParameterByIndex(node->n, j, ref); + status |= vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + status |= vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); + status |= vxWriteScalarValue((vx_scalar)ref, &temp_value); + status |= vxSetParameterByIndex(node->n, j, ref); + } } } diff --git a/src/tim/vx/internal/src/vsi_nn_rnn.c b/src/tim/vx/internal/src/vsi_nn_rnn.c index 2a3baabaa..545f7dcb6 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn.c @@ -31,6 +31,7 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_rnn_prv.h" #include "vsi_nn_internal_node.h" +#include "vsi_nn_error.h" /********************************************************** * MACROS @@ -54,6 +55,12 @@ static vsi_status internal_buffer_init vsi_size_t data_size = 0; uint8_t* data = NULL; + if( NULL == tensor ) + { + VSILOGE("input tensor is NULL."); + return status; + } + if( TRUE == tensor->attr.vtl ) { VSILOGE("Internal tensors cannot be dumpped."); @@ -72,7 +79,7 @@ static vsi_status internal_buffer_init stride = vsi_nn_TypeGetBytes( tensor->attr.dtype.vx_type ); data = (uint8_t *)malloc(data_size); - if( NULL == buffer ) + if ( NULL == data ) { VSILOGE("Out of memoery."); goto error; } @@ -136,6 +143,11 @@ static vsi_status internal_buffer_copy_to_tensor } tensor = vsi_nn_GetTensor( graph, tensorid ); + if ( NULL == tensor ) + { + VSILOGE("tensor is NULL."); + return status; + } request_data_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type ); if( request_data_size != buffer->data_size ) { @@ -167,6 +179,7 @@ static vsi_status internal_buffer_copy_from_tensor } tensor =
vsi_nn_GetTensor( graph, tensorid ); + CHECK_PTR_FAIL_GOTO( tensor, "Get tensor fail.", final ); request_data_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type ); if( request_data_size != buffer->data_size ) { @@ -181,6 +194,7 @@ static vsi_status internal_buffer_copy_from_tensor status = VSI_SUCCESS; } +final: vsi_nn_safe_free( data ); return status; @@ -366,6 +380,8 @@ vsi_status vsi_nn_rnn_InitWksp memcpy( &cur_conn->connection, &connections[i], sizeof( connections[i] ) ); output_tensor = vsi_nn_GetTensor( graph, cur_conn->connection.output ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Get tensor fail.", OnError ); + for( j = 0; j < VSI_NN_MAX_RNN_CONNECTION_INPUTS; j++ ) { if( VSI_NN_TENSOR_ID_NA == cur_conn->connection.inputs[j] ) @@ -374,6 +390,8 @@ vsi_status vsi_nn_rnn_InitWksp } /* make sure input tensors have the same size and dtype with output tensor */ input_tensor = vsi_nn_GetTensor( graph, cur_conn->connection.inputs[j] ); + CHECK_PTR_FAIL_GOTO( input_tensor, "Get tensor fail.", OnError ); + if( output_tensor->attr.dim_num != input_tensor->attr.dim_num || output_tensor->attr.dtype.vx_type != input_tensor->attr.dtype.vx_type || 0 != memcmp(output_tensor->attr.size, input_tensor->attr.size, @@ -399,6 +417,8 @@ vsi_status vsi_nn_rnn_InitWksp if( cur_conn->connection_inputs_count == 1 ) { input_tensor = vsi_nn_GetTensor( graph, cur_conn->connection.inputs[0] ); + CHECK_PTR_FAIL_GOTO( input_tensor, "Get tensor fail.", OnError ); + if( output_tensor && output_tensor->attr.is_created_from_handle && input_tensor && input_tensor->attr.is_created_from_handle ) { @@ -421,7 +441,7 @@ vsi_status vsi_nn_rnn_InitWksp OnError: vsi_nn_safe_free( cur_conn ); - return status; + return VSI_FAILURE; } /* vsi_nn_rnn_InitWksp() */ vsi_status vsi_nn_rnn_ResetBuffers diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c index 9466d3d60..44ab53eee 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c @@ -33,6 +33,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_util.h" #include "vsi_nn_rnn_helper.h" +#include "vsi_nn_error.h" vsi_bool vsi_nn_rnn_find_best_kernel_size ( @@ -121,9 +122,12 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_in_size, tmp_inode, "Create internal buffer failed", final); reshape_in_size[3] = input->attr.size[1]; reshape_in_size[2] = input->attr.size[0] / (kernel_h * kernel_w); @@ -145,13 +149,17 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc c = tensor1->t->attr.size[1]; reshape_size[2] = tensor1->t->attr.size[3]; - reshape_size[1] = -1; + reshape_size[1] = (vsi_size_t)-1; reshape_size[0] = tensor1->t->attr.size[0]; tensor0 = vsi_nn_rnn_create_reshape(self, tensor1->t, NULL, reshape_size, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor0, "Create internal tensor failed", final); tensor2 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + 
CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 3 * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create internal buffer failed", final); permute_in_perm[0] = 2; permute_in_perm[1] = 1; permute_in_perm[2] = 0; @@ -174,6 +182,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc tensor1 = NULL; } +final: return tensor1; } @@ -196,6 +205,9 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc vsi_nn_tensor_t* tensor = input; vsi_bool ret = FALSE; + VSI_UNREFERENCED(kernel_h); + VSI_UNREFERENCED(kernel_w); + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); @@ -208,13 +220,18 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc c = tensor->attr.size[1]; reshape_size[2] = tensor->attr.size[3]; - reshape_size[1] = -1; + reshape_size[1] = (vsi_size_t)-1; reshape_size[0] = tensor->attr.size[0]; tensor0 = vsi_nn_rnn_create_reshape(self, tensor, NULL, reshape_size, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor0, "Create internal tensor failed", final); tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 3 * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create internal buffer failed", final); permute_in_perm[0] = 2; permute_in_perm[1] = 1; @@ -231,13 +248,17 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc reshape_size[1] = c; reshape_size[0] = tensor1->t->attr.size[0]; tensor0 = vsi_nn_rnn_create_reshape(self, tensor1->t, NULL, reshape_size, 4, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor0, "Create internal tensor failed", final); tensor = tensor0->t; } tensor2 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); reshape_in_size = (vsi_size_t *)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_in_size, tmp_inode, "Create internal buffer failed", final); reshape_in_size[1] = tensor->attr.size[3]; reshape_in_size[0] = tensor->attr.size[2]; @@ -252,6 +273,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc tensor2 = NULL; } +final: return tensor2; } @@ -272,6 +294,10 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 uint32_t* permute_in_perm = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; vsi_nn_tensor_t* tensor = input; + vsi_bool ret = FALSE; + + VSI_UNREFERENCED(kernel_h); + VSI_UNREFERENCED(kernel_w); memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); @@ -285,13 +311,17 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 c = tensor->attr.size[1]; reshape_size[2] = tensor->attr.size[3]; - reshape_size[1] = -1; + reshape_size[1] = (vsi_size_t)-1; reshape_size[0] = tensor->attr.size[0]; tensor0 = 
vsi_nn_rnn_create_reshape(self, tensor, NULL, reshape_size, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor0, "Create internal tensor failed", final); tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 3 * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create internal buffer failed", final); permute_in_perm[0] = 2; permute_in_perm[1] = 1; @@ -308,12 +338,15 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 reshape_size[1] = c; reshape_size[0] = tensor1->t->attr.size[0]; tensor0 = vsi_nn_rnn_create_reshape(self, tensor1->t, NULL, reshape_size, 4, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor0, "Create internal tensor failed", final); tensor = tensor0->t; } tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_in_size, tmp_inode, "Create internal buffer failed", final); reshape_in_size[1] = tensor->attr.size[3]; reshape_in_size[0] = tensor->attr.size[2]; @@ -322,9 +355,10 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 tmp_inode->node->nn_param.reshape2.dim_num = 2; tmp_inode->inputs[0] = tensor; tmp_inode->outputs[0] = output; - vsi_nn_internal_setup_node(self, tmp_inode); + ret = vsi_nn_internal_setup_node(self, tmp_inode); - return TRUE; +final: + return ret; } vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc @@ -351,12 +385,16 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc /* create zero bias for NN/TP */ tensor1 = vsi_nn_internal_create_zero_bias_tensor( self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); + CHECK_PTR_FAIL_GOTO( tensor1, "Create tensor fail.", final ); + tensor = tensor1->t; } vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.fcl.axis = 0; tmp_inode->node->nn_param.fcl.weights = (uint32_t)weight->attr.size[1]; @@ -370,6 +408,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc tensor2 = NULL; } +final: return tensor2; } @@ -400,15 +439,19 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc /* create zero bias for NN/TP */ tensor1 = vsi_nn_internal_create_zero_bias_tensor( self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); + CHECK_PTR_FAIL_GOTO( tensor1, "Create tensor fail.", final ); tensor = tensor1->t; } vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); reshaped_weight_tensor = vsi_nn_rnn_prepare_weight_for_nn_fc(self, weight, kernel_h, kernel_w); + CHECK_PTR_FAIL_GOTO(reshaped_weight_tensor, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.conv2d.ksize[0] 
= kernel_w; tmp_inode->node->nn_param.conv2d.ksize[1] = kernel_h; tmp_inode->node->nn_param.conv2d.stride[0] = 1; @@ -432,6 +475,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc tensor2 = NULL; } +final: return tensor2; } @@ -459,6 +503,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_prepare_weight_for_nn_fc memcpy( &attr.dtype, &weight->attr.dtype, sizeof(attr.dtype)); memcpy( &attr.size, &reshaped_weight_shape, sizeof(attr.size)); reshaped_weight_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(reshaped_weight_tensor, "Create internal tensor failed", final); vsi_nn_ReshapeTensor( self->graph, weight, reshaped_weight_tensor->t, reshaped_weight_shape, 4 ); @@ -468,6 +513,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_prepare_weight_for_nn_fc vsi_nn_SetTensorAttr(reshaped_weight_tensor->t, VSI_NN_TENSOR_ATTR_CONST); } +final: return reshaped_weight_tensor; } @@ -499,15 +545,20 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc_relu /* create zero bias for NN/TP */ tensor1 = vsi_nn_internal_create_zero_bias_tensor( self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); + + CHECK_PTR_FAIL_GOTO( tensor1, "Create tensor fail.", final ); tensor = tensor1->t; } vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); reshaped_weight_tensor = vsi_nn_rnn_prepare_weight_for_nn_fc(self, weight, kernel_h, kernel_w); + CHECK_PTR_FAIL_GOTO(reshaped_weight_tensor, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV_RELU, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.conv2d.ksize[0] = kernel_w; tmp_inode->node->nn_param.conv2d.ksize[1] = kernel_h; tmp_inode->node->nn_param.conv2d.stride[0] = 1; @@ -536,6 +587,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc_relu tensor2 = NULL; } +final: return tensor2; } @@ -556,8 +608,10 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_add memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->inputs[0] = input1; tmp_inode->inputs[1] = input2; @@ -567,6 +621,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_add { tensor1 = NULL; } + +final: return tensor1; } @@ -612,8 +668,10 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_activation memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, vsi_nn_rnn_get_act_op_type(act_type), 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->inputs[0] = input; tmp_inode->node->nn_param.tanh.scale_a = 1.0f; @@ -625,6 +683,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_activation tensor1 = NULL; } +final: return tensor1; } @@ -649,11 +708,14 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = 
vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, curr, "Create internal buffer failed", final); permute_in_perm[0] = 0; permute_in_perm[1] = 2; permute_in_perm[2] = 1; @@ -676,10 +738,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major output_tensor = NULL; } +final: return output_tensor; } -void vsi_nn_rnn_split_input_tensor +vsi_status vsi_nn_rnn_split_input_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t * input, @@ -688,6 +751,7 @@ void vsi_nn_rnn_split_input_tensor vsi_bool use_virtual_tensor ) { + vsi_status status = VSI_FAILURE; uint32_t* slices = NULL; vsi_nn_internal_node_t* curr = NULL; vsi_nn_tensor_attr_t attr; @@ -696,7 +760,9 @@ void vsi_nn_rnn_split_input_tensor memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, time_step ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, time_step * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(slices, curr, "Create internal buffer failed", final); curr->node->nn_param.split.axis = 2; /* timestep axis */ curr->node->nn_param.split.slices_num = time_step; curr->inputs[0] = input; @@ -707,13 +773,18 @@ void vsi_nn_rnn_split_input_tensor slices[i] = 1; vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(output_tensor, curr, "Create internal tensor failed", final); curr->outputs[i] = output_tensor->t; output[i] = output_tensor->t; } vsi_nn_internal_setup_node( self, curr ); + + status = VSI_SUCCESS; +final: + return status; } -void vsi_nn_rnn_data_check_aligned +vsi_status vsi_nn_rnn_data_check_aligned ( vsi_nn_node_t * self, vsi_nn_tensor_t ** input, @@ -721,6 +792,7 @@ void vsi_nn_rnn_data_check_aligned vsi_bool use_virtual_tensor ) { + vsi_status status = VSI_FAILURE; vsi_nn_internal_node_t* curr = NULL; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; @@ -736,8 +808,10 @@ void vsi_nn_rnn_data_check_aligned { vsi_nn_internal_init_tensor_attr(&attr, &input[i]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = input[i]; curr->outputs[0] = output_tensor->t; vsi_nn_internal_setup_node( self, curr ); @@ -747,6 +821,10 @@ void vsi_nn_rnn_data_check_aligned ofst += tensor_size; } + + status = VSI_SUCCESS; +final: + return status; } vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output @@ -767,11 +845,14 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output /* reshape for split output */ vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + 
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_split_size = (vsi_size_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - reshape_split_size[0] = -1; + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_split_size, curr, "Create internal buffer failed", final); + reshape_split_size[0] = (vsi_size_t)-1; reshape_split_size[1] = batch_size; curr->node->nn_param.reshape2.size = reshape_split_size; @@ -784,6 +865,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output output_tensor = NULL; } +final: return output_tensor; } @@ -806,11 +888,14 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_cell_output /* reshape output to 3-dims */ vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_grucell_output_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - reshape_grucell_output_size[0] = -1; + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_grucell_output_size, curr, "Create internal buffer failed", final); + reshape_grucell_output_size[0] = (vsi_size_t)-1; reshape_grucell_output_size[1] = batch_size; reshape_grucell_output_size[2] = 1; @@ -824,6 +909,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_cell_output output_tensor = NULL; } +final: return output_tensor; } @@ -845,8 +931,10 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_binary_operator memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, op, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.multiply.scale = 1.0f; tmp_inode->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; tmp_inode->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; @@ -859,6 +947,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_binary_operator output_tensor = NULL; } +final: return output_tensor; } @@ -876,9 +965,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_concat_impl vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* tmp_tensor = NULL; vsi_nn_internal_node_t* inode = NULL; - int tensor_count = 1; + int32_t tensor_count = 1; vsi_bool ret = FALSE; + VSI_UNREFERENCED(axis); + va_start(args, tensor); FOREACH_ARGS(args, next, vsi_nn_tensor_t*) @@ -893,8 +984,10 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_concat_impl attr.vtl = use_virtual_tensor; attr.is_const = FALSE; tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONCAT, tensor_count, 1); + CHECK_PTR_FAIL_GOTO(inode, "Create internal node failed", final); inode->inputs[0] = tensor; tensor_count = 0; va_start(args, tensor); @@ -912,6 +1005,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_concat_impl tmp_tensor = NULL; } +final: return tmp_tensor; } @@ -938,9 +1032,11 @@ vsi_nn_internal_tensor_t** vsi_nn_create_split } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, slices_num ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal 
node failed", final); if(!slices) { slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, slices_num * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(slices, curr, "Create internal buffer failed", final); num_per_output = (uint32_t)(tensor->attr.size[axis] / slices_num); for( i = 0; i < slices_num; i++ ) { @@ -949,6 +1045,7 @@ vsi_nn_internal_tensor_t** vsi_nn_create_split } output_tensors = (vsi_nn_internal_tensor_t**)vsi_nn_internal_new_node_param(curr, slices_num * sizeof(vsi_nn_internal_tensor_t*)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(output_tensors, curr, "Create internal buffer failed", final); curr->node->nn_param.split.axis = axis; curr->node->nn_param.split.slices_num = slices_num; curr->node->nn_param.split.slices = slices; @@ -959,10 +1056,12 @@ vsi_nn_internal_tensor_t** vsi_nn_create_split for( i = 0; i < slices_num; i++ ) { output_tensors[i] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(output_tensors[i], curr, "Create internal tensor failed", final); curr->outputs[i] = output_tensors[i]->t; } vsi_nn_internal_setup_node( self, curr ); +final: return output_tensors; } @@ -982,7 +1081,9 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape vsi_bool ret = FALSE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_in_size, curr, "Create internal buffer failed", final); memcpy(reshape_in_size, size, dim_num * sizeof(vsi_size_t)); curr->node->nn_param.reshape2.size = reshape_in_size; curr->node->nn_param.reshape2.dim_num = (uint32_t)dim_num; @@ -999,6 +1100,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &input_tensor->attr.dtype, use_virtual_tensor); tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(tensor0, curr, "Create internal tensor failed", final); curr->outputs[0] = tensor0->t; } ret = vsi_nn_internal_setup_node(self, curr); @@ -1007,7 +1109,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape tensor0 = NULL; } - +final: return tensor0; } @@ -1027,8 +1129,10 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute vsi_bool ret = FALSE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, curr, "Create internal buffer failed", final); for (i = 0; i < dim_num; i++) { @@ -1047,6 +1151,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute vsi_nn_tensor_attr_t attr; vsi_nn_internal_init_tensor_attr(&attr, &input_tensor->attr.dtype, use_virtual_tensor); tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(tensor0, curr, "Create internal tensor failed", final); curr->outputs[0] = tensor0->t; } ret = vsi_nn_internal_setup_node(self, curr); @@ -1055,6 +1160,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute tensor0 = NULL; } +final: return tensor0; } @@ -1072,6 +1178,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_copy vsi_bool ret = FALSE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", 
final); curr->inputs[0] = input_tensor; if(!dtype) { @@ -1087,6 +1194,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_copy vsi_nn_tensor_attr_t attr; vsi_nn_internal_init_tensor_attr(&attr, dtype, use_virtual_tensor); tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(tensor0, curr, "Create internal tensor failed", final); curr->outputs[0] = tensor0->t; } ret = vsi_nn_internal_setup_node(self, curr); @@ -1095,5 +1203,6 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_copy tensor0 = NULL; } +final: return tensor0; } \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 0710a624e..5f7cb47c5 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -40,6 +40,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_dtype_util_prv.h" #include "utils/vsi_nn_tensor_op.h" +#include "vsi_nn_error.h" static vsi_bool _try_set_const_tensor ( @@ -119,6 +120,8 @@ static void print_tensor ext_attr[count] = 0; break; case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: count = snprintf( &ext_attr[0], _EXT_ATTR_BUF_SZ, "ASM zp=%3d, scale=%.6f", tensor->attr.dtype.zero_point, tensor->attr.dtype.scale ); @@ -126,6 +129,7 @@ static void print_tensor break; #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: count = snprintf( &ext_attr[0], _EXT_ATTR_BUF_SZ, "SYM PERCHANNEL axis=%d, count=%d", tensor->attr.dtype.channel_dim, tensor->attr.dtype.scale_dim ); @@ -258,15 +262,15 @@ static vsi_bool _auto_cal_shape vsi_size_t * dim_num ) { - vsi_bool ret; + vsi_bool ret; vsi_ssize_t neg_idx; - vsi_size_t i; - vsi_size_t total_size; + vsi_size_t i = 0; + vsi_size_t total_size = 1; ret = TRUE; neg_idx = -1; total_size = vsi_nn_ShapeProduct( input_shape, input_dim ); - if (-1 == *dim_num) + if ((vsi_size_t)-1 == *dim_num) { *dim_num = 1; shape[0] = total_size; @@ -283,7 +287,7 @@ { VSILOGE( "Wrong shape '%"VSI_SSIZE_T_SPECIFIER"' ", (vsi_ssize_t)shape[i] ); ret = FALSE; - break; + goto final; } shape[i] = input_shape[i]; } @@ -297,17 +301,16 @@ { VSILOGE( "Wrong shape '%"VSI_SSIZE_T_SPECIFIER"' ", (vsi_ssize_t)shape[i] ); ret = FALSE; - break; + goto final; } } - if( FALSE == ret ) - { - shape[neg_idx] = -1; - } - else if(neg_idx != -1) + + if (-1 != neg_idx) { - shape[neg_idx] = (uint32_t)total_size; + shape[neg_idx] = (vsi_size_t)total_size; } + +final: return ret; } /* _auto_cal_shape() */ @@ -328,15 +331,21 @@ static vsi_bool _init_tensor size_t i = 0; ret = TRUE; + if (tensor->attr.dim_num > VSI_NN_MAX_DIM_NUM) + { + VSILOGE( "tensor rank greater than %d.", VSI_NN_MAX_DIM_NUM ); + return FALSE; + } + memset( &params, 0, sizeof( vx_tensor_create_params_t ) ); params.num_of_dims = tensor->attr.dim_num; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + for(i = 0; i < tensor->attr.dim_num; i++) { - size_vxsize[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; + size_vxsize[i] = (vsi_size_t)-1 == tensor->attr.size[i] ? (vx_size)-1 : (vx_size)tensor->attr.size[i]; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + for(i = 0; i < tensor->attr.dim_num; i++) { - size_u32[i] = -1 == tensor->attr.size[i] ? -1 : (vx_uint32)tensor->attr.size[i]; + size_u32[i] = (vsi_size_t)-1 == tensor->attr.size[i] ?
(vx_uint32)-1 : (vx_uint32)tensor->attr.size[i]; } #ifdef VSI_40BIT_VA_SUPPORT params.sizes = size_vxsize; @@ -354,11 +363,13 @@ static vsi_bool _init_tensor break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE; params.quant_data.affine.scale = tensor->attr.dtype.scale; params.quant_data.affine.zeroPoint = (int32_t)tensor->attr.dtype.zero_point; break; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT #ifdef VX_QUANT_AFFINE_SCALE_PER_CHANNEL params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_CHANNEL; @@ -367,6 +378,7 @@ static vsi_bool _init_tensor #endif // This is a hack that driver doesn't support const scales scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); params.quant_data.affinePerChannel.channelDim = tensor->attr.dtype.channel_dim; params.quant_data.affinePerChannel.scaleCount = tensor->attr.dtype.scale_dim; @@ -378,6 +390,7 @@ static vsi_bool _init_tensor // it's symmetric quantized tensor. Fake a zp information filled with zero to meet low-level's // requirement null_zp = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( null_zp, "Create buffer fail.", final ); memset(null_zp, 0, sizeof(int32_t) * tensor->attr.dtype.scale_dim); params.quant_data.affinePerChannel.zeroPoint = null_zp; params.quant_data.affinePerChannel.zeroPointCount= tensor->attr.dtype.scale_dim; @@ -395,10 +408,12 @@ static vsi_bool _init_tensor #endif // This is a hack that driver doesn't support const scales scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim); + CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final ); memcpy(zeroPoints, tensor->attr.dtype.zero_points, tensor->attr.dtype.zero_points_dim * sizeof(int32_t)); @@ -472,14 +487,17 @@ static vsi_bool _init_tensor vx_size stride_size_vxsize[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(tensor->attr.size); i++) { - size_vxsize2[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; + size_vxsize2[i] = (vsi_size_t)-1 == tensor->attr.size[i] ? \ + (vx_size)-1 : (vx_size)tensor->attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { - stride_size_vxsize[i] = -1 == stride_size[i] ? -1 : (vx_size)stride_size[i]; + stride_size_vxsize[i] = (vsi_size_t)-1 == stride_size[i] ? \ + (vx_size)-1 : (vx_size)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, size_vxsize2, stride_size_vxsize, (vx_size)tensor->attr.dim_num); + CHECK_PTR_FAIL_GOTO( addr, "Create tensor address fail.", final ); } #else { @@ -487,14 +505,17 @@ static vsi_bool _init_tensor uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(tensor->attr.size); i++) { - size_32bit[i] = -1 == tensor->attr.size[i] ? -1 : (uint32_t)tensor->attr.size[i]; + size_32bit[i] = (vsi_size_t)-1 == tensor->attr.size[i] ? 
\ + (uint32_t)-1 : (uint32_t)tensor->attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { - stride_size_32bit[i] = -1 == stride_size[i] ? -1 : (uint32_t)stride_size[i]; + stride_size_32bit[i] = (vsi_size_t)-1 == stride_size[i] ? \ + (uint32_t)-1 : (uint32_t)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, size_32bit, stride_size_32bit, (uint8_t)tensor->attr.dim_num); + CHECK_PTR_FAIL_GOTO( addr, "Create tensor address fail.", final ); } #endif #ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL @@ -970,6 +991,9 @@ vsi_size_t vsi_nn_CopyTensorToBuffer vsi_size_t sz; vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM]; vsi_status status; + + VSI_UNREFERENCED(graph); + if( NULL == tensor || NULL == buffer ) { return 0; @@ -977,7 +1001,7 @@ vsi_size_t vsi_nn_CopyTensorToBuffer sz = 0; status = VSI_FAILURE; - status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, buffer, VX_READ_ONLY); + status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, buffer, VX_READ_ONLY, NULL, NULL); if(VSI_SUCCESS == status) { sz = vsi_nn_GetStrideSize( &tensor->attr, stride_size ); @@ -996,7 +1020,7 @@ float * vsi_nn_ConvertTensorToFloat32Data vsi_size_t elements; vsi_size_t i; vsi_size_t stride; - float *data; + float *data = NULL; if(NULL == graph || NULL == tensor) { @@ -1008,7 +1032,7 @@ float * vsi_nn_ConvertTensorToFloat32Data data = NULL; data = (float *)malloc(elements * sizeof(float)); - + CHECK_PTR_FAIL_GOTO( data, "Create buffer fail.", final ); if( tensor->attr.is_created_from_handle ) { #ifdef VSI_INVALIDATE_HANDLE_SUPPORT @@ -1031,7 +1055,14 @@ float * vsi_nn_ConvertTensorToFloat32Data else { tensor_data = vsi_nn_ConvertTensorToData(graph, tensor); + if ( tensor_data == NULL ) + { + VSILOGE("tensor_data is NULL."); + vsi_nn_safe_free(data); + return NULL; + } } + for(i = 0; i < elements; i++) { status = dtype_to_float32(&tensor_data[stride * i], &data[i], &tensor->attr.dtype); @@ -1043,6 +1074,7 @@ float * vsi_nn_ConvertTensorToFloat32Data } } +final: if( !tensor->attr.is_created_from_handle ) { vsi_nn_safe_free( tensor_data ); @@ -1061,6 +1093,9 @@ uint8_t * vsi_nn_ConvertTensorToData vsi_size_t buf_sz; vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM]; vsi_status status; + + VSI_UNREFERENCED(graph); + if( NULL == tensor ) { return NULL; @@ -1074,6 +1109,12 @@ uint8_t * vsi_nn_ConvertTensorToData if( buf_sz > 0 ) { data = (uint8_t *)malloc( buf_sz ); + if (data == NULL) + { + VSILOGE("Create buffer fail"); + + return NULL; + } } if( data && tensor->attr.is_created_from_handle ) { @@ -1100,13 +1141,14 @@ uint8_t * vsi_nn_ConvertTensorToData { if( NULL != data ) { - status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, data, VX_READ_ONLY); + status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, data, VX_READ_ONLY, NULL, NULL); } if(VSI_SUCCESS != status) { VSILOGE("Read tensor data fail"); free(data); data = NULL; + return NULL; } } if(tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT4 || @@ -1114,12 +1156,16 @@ uint8_t * vsi_nn_ConvertTensorToData { vsi_size_t dest_size = vsi_nn_GetElementNum(tensor); new_data = (uint8_t*)malloc(dest_size); - status = vsi_nn_Unpack4bitData(tensor, data, new_data, tensor->attr.dtype.vx_type); - if(data) + if (new_data == NULL) { - free(data); - data = NULL; + VSILOGE("Create buffer fail"); + vsi_nn_safe_free(data); + + return NULL; } + + status = vsi_nn_Unpack4bitData(tensor, data, new_data, tensor->attr.dtype.vx_type); + vsi_nn_safe_free(data); return new_data; } else @@ -1149,6 +1195,9 @@ uint8_t * vsi_nn_ConvertRawTensorToData vsi_size_t buf_sz; 
vsi_status status; vsi_nn_tensor_attr_t attr; + + VSI_UNREFERENCED(addr); + if( NULL == tensor || NULL == context ) { return NULL; @@ -1175,7 +1224,7 @@ uint8_t * vsi_nn_ConvertRawTensorToData { return data; } - status = vsi_nn_copy_tensor_patch(tensor, &attr, data, VX_READ_ONLY); + status = vsi_nn_copy_tensor_patch(tensor, &attr, data, VX_READ_ONLY, NULL, NULL); if( VSI_SUCCESS != status ) { VSILOGE("Read tensor data fail"); @@ -1205,6 +1254,8 @@ uint8_t * vsi_nn_ConvertRawTensorToData2 vsi_size_t buf_sz; vsi_status status; + VSI_UNREFERENCED(addr); + if( NULL == tensor || NULL == context ) { return NULL; @@ -1229,6 +1280,7 @@ uint8_t * vsi_nn_ConvertRawTensorToData2 break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: status = vxQueryTensor(tensor, VX_TENSOR_ZERO_POINT, &(attr->dtype.zero_point), sizeof(int32_t)); status = vxQueryTensor(tensor, VX_TENSOR_SCALE, @@ -1250,7 +1302,7 @@ uint8_t * vsi_nn_ConvertRawTensorToData2 { return data; } - status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_READ_ONLY); + status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_READ_ONLY, NULL, NULL); if( VSI_SUCCESS != status ) { VSILOGE("Read tensor data fail"); @@ -1407,7 +1459,8 @@ void vsi_nn_SaveDataToText write_data = vsi_nn_DataAsFloat32( &data[stride * i], type ); if( type == VSI_NN_TYPE_UINT8 || type == VSI_NN_TYPE_INT8 || - type == VSI_NN_TYPE_UINT4 || type == VSI_NN_TYPE_INT4 ) + type == VSI_NN_TYPE_UINT4 || type == VSI_NN_TYPE_INT4 || + type == VSI_NN_TYPE_FLOAT8_E4M3 || type == VSI_NN_TYPE_FLOAT8_E5M2 ) { count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%d%s", (int32_t)write_data, seperator ); @@ -1549,6 +1602,10 @@ vsi_status vsi_nn_CopyDataToTensor ) { vsi_status status = VSI_FAILURE; + uint8_t* new_data = NULL; + + VSI_UNREFERENCED(graph); + if( NULL == data || NULL == tensor ) { return status; @@ -1581,24 +1638,22 @@ vsi_status vsi_nn_CopyDataToTensor if( tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT4 || tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT4 ) { - uint8_t* new_data = NULL; vsi_size_t dest_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type); new_data = (uint8_t*)malloc( dest_size ); + CHECK_PTR_FAIL_GOTO( new_data, "Create buffer fail.", final ); status = vsi_nn_Pack4bitData(tensor, (uint8_t*)data, new_data); - status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, new_data, VX_WRITE_ONLY ); - if( new_data ) - { - free( new_data ); - new_data = NULL; - } + status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, new_data, VX_WRITE_ONLY, NULL, NULL ); } else { - status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, data, VX_WRITE_ONLY ); + status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, data, VX_WRITE_ONLY, NULL, NULL ); } } +final: + vsi_nn_safe_free(new_data); + return status; } /* vsi_nn_CopyDataToTensor() */ @@ -1780,6 +1835,12 @@ vsi_nn_tensor_t *vsi_nn_reshape_tensor { return NULL; } + + if (dim_num > VSI_NN_MAX_DIM_NUM) + { + VSILOGE( "tensor rank greater than %d.", VSI_NN_MAX_DIM_NUM ); + return NULL; + } /* New a ovxlib tensor struct */ memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); memcpy(&attr, &input->attr, sizeof(vsi_nn_tensor_attr_t)); @@ -1813,6 +1874,13 @@ vsi_bool vsi_nn_ReshapeTensor { vsi_bool ret; vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = {0}; + + if (dim_num > VSI_NN_MAX_DIM_NUM) + { + VSILOGE( "tensor rank greater than %d.", VSI_NN_MAX_DIM_NUM ); + return FALSE; + } + 
memcpy(new_shape, shape, sizeof(vsi_size_t) * dim_num); ret = TRUE; @@ -1913,6 +1981,12 @@ vx_tensor vsi_nn_safe_reshape_tensor vsi_size_t size_of_shape_element ) { + if (sizes > VSI_NN_MAX_DIM_NUM) + { + VSILOGE( "tensor rank greater than %d.", VSI_NN_MAX_DIM_NUM ); + return NULL; + } + if(sizeof(vx_size) == size_of_shape_element) { vx_size* num_of_dims_vxsize = (vx_size*)num_of_dims; @@ -1924,7 +1998,8 @@ vx_tensor vsi_nn_safe_reshape_tensor vsi_size_t i = 0; for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - new_shape_int32[i] = -1 == num_of_dims_vxsize[i] ? -1 : (int32_t)num_of_dims_vxsize[i]; + new_shape_int32[i] = (vx_size)-1 == num_of_dims_vxsize[i] ? \ + (int32_t)-1 : (int32_t)num_of_dims_vxsize[i]; } return vxReshapeTensor( tensor, new_shape_int32, (uint32_t)sizes ); } @@ -1939,7 +2014,7 @@ vx_tensor vsi_nn_safe_reshape_tensor vsi_size_t i = 0; for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - new_shape_vxsize[i] = -1 == num_of_dims_int32[i] ? -1 : (vx_size)num_of_dims_int32[i]; + new_shape_vxsize[i] = -1 == num_of_dims_int32[i] ? (vx_size)-1 : (vx_size)num_of_dims_int32[i]; } return vxReshapeTensor( tensor, new_shape_vxsize, (vx_size)sizes ); } @@ -1970,7 +2045,7 @@ void vsi_nn_PermuteTensor uint32_t i; vsi_status status; - if( NULL == tensor || NULL == perm || 0 == dim_num ) + if( NULL == tensor || NULL == perm || 0 == dim_num || dim_num > VSI_NN_MAX_DIM_NUM ) { VSILOGE( "Wrong perm parameters." ); return; @@ -2231,8 +2306,10 @@ void vsi_nn_ReleaseTensorRelevance ) { uint32_t i; - if(NULL == tensor_ref || NULL == graph) + if (NULL == tensor_ref || NULL == graph) { + vsi_nn_safe_free(tensor_ref); + return ; } @@ -2250,11 +2327,7 @@ void vsi_nn_ReleaseTensorRelevance } } - if(tensor_ref) - { - free(tensor_ref); - tensor_ref = NULL; - } + vsi_nn_safe_free(tensor_ref); } /* vsi_nn_ReleaseTensorRelevance() */ vsi_nn_tensor_rel_t *vsi_nn_CreateTensorRelevance @@ -2286,6 +2359,11 @@ vsi_nn_tensor_rel_t *vsi_nn_CreateTensorRelevance for(j = 0; j < graph->node_num; j++) { node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)j ); + if (node == NULL) + { + continue; + } + for(k = 0; k < node->output.num; k++) { if(node->output.tensors[k] == i) @@ -2423,6 +2501,7 @@ vsi_status vsi_nn_vxGetTensorAttr break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: status = vxQueryTensor(tensor, VX_TENSOR_ZERO_POINT, &(attr->dtype.zero_point), sizeof(int32_t)); TEST_CHECK_STATUS( status, final ); @@ -2468,7 +2547,7 @@ uint8_t *vsi_nn_vxCopyTensorToData } } - status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_READ_ONLY); + status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_READ_ONLY, NULL, NULL); if(VSI_SUCCESS != status) { VSILOGE("Copy tensor to data fail"); @@ -2498,7 +2577,7 @@ vsi_status vsi_nn_vxCopyDataToTensor memset(stride_size, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); vsi_nn_GetStrideSize(attr, stride_size); - status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_WRITE_ONLY); + status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_WRITE_ONLY, NULL, NULL); if(VSI_SUCCESS != status) { VSILOGE("Copy data to tensor fail"); @@ -2619,24 +2698,39 @@ vsi_status vsi_nn_copy_tensor_patch vx_tensor tensor, vsi_nn_tensor_attr_t *attr, void * user_ptr, - vsi_enum usage + vsi_enum usage, + vsi_size_t* start, + vsi_size_t* end ) { - vsi_size_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM]; + vsi_size_t tmp_start[VSI_NN_MAX_DIM_NUM],tmp_end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM]; vsi_status 
status = VSI_FAILURE; - uint32_t i; + if(NULL == tensor || NULL == user_ptr) { VSILOGE("Invalid parameter"); return status; } vsi_nn_GetStrideSize(attr, stride); - memset(start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + if (NULL == start) + { + memset(tmp_start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + } + else + { + memcpy(tmp_start, start, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + } + + if (NULL == end) { - end[i] = attr->size[i]; + memcpy(tmp_end, attr->size, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); } - status = vsi_nn_copy_tensor_veiw_patch(tensor, attr, user_ptr, start, end, stride, usage, 0); + else + { + memcpy(tmp_end, end, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + } + + status = vsi_nn_copy_tensor_veiw_patch(tensor, attr, user_ptr, tmp_start, tmp_end, stride, usage, 0); return status; } /* vsi_nn_copy_tensor_patch() */ @@ -2673,7 +2767,9 @@ void vsi_nn_reshuffle_weight_data int32_t item_size = vsi_nn_TypeGetBytes(weights->attr.dtype.vx_type); weight_data = vsi_nn_ConvertTensorToData(graph, weights); + CHECK_PTR_FAIL_GOTO( weight_data, "Create weight_data fail.", final ); buffer = (uint8_t*)malloc(item_size * slice_size * weight_size_c * weight_size_b); + CHECK_PTR_FAIL_GOTO( buffer, "Create buffer fail.", final ); memset(buffer, 0x00, item_size * slice_size * weight_size_c * weight_size_b); memcpy(buffer, weight_data, item_size * slice_size * weight_size_c * weight_size_b); #if 0 // transpose whnc to whcn if need @@ -2717,6 +2813,8 @@ } } vsi_nn_CopyDataToTensor( graph, weights, weight_data ); + +final: vsi_nn_Free( buffer ); vsi_nn_safe_free( weight_data ); } @@ -2806,6 +2904,7 @@ vsi_status vsi_nn_SwapHandle ) { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(is_new_ptr_malloc_by_ovxlib); if (!tensor) { return VSI_FAILURE; } @@ -3021,15 +3120,17 @@ static vsi_bool _init_dummy_tensor size_t i = 0; ret = TRUE; + VSI_UNREFERENCED(graph); + memset( &params, 0, sizeof( vx_tensor_create_params_t ) ); params.num_of_dims = tensor->attr.dim_num; for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_vxsize[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; + size_vxsize[i] = (vsi_size_t)-1 == tensor->attr.size[i] ? (vx_size)-1 : (vx_size)tensor->attr.size[i]; } for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_u32[i] = -1 == tensor->attr.size[i] ? -1 : (vx_uint32)tensor->attr.size[i]; + size_u32[i] = (vsi_size_t)-1 == tensor->attr.size[i] ?
(vx_uint32)-1 : (vx_uint32)tensor->attr.size[i]; } #ifdef VSI_40BIT_VA_SUPPORT params.sizes = size_vxsize; @@ -3047,11 +3148,13 @@ static vsi_bool _init_dummy_tensor break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE; params.quant_data.affine.scale = tensor->attr.dtype.scale; params.quant_data.affine.zeroPoint = (int32_t)tensor->attr.dtype.zero_point; break; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT #ifdef VX_QUANT_AFFINE_SCALE_PER_CHANNEL params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_CHANNEL; @@ -3060,6 +3163,7 @@ static vsi_bool _init_dummy_tensor #endif // This is a hack that driver doesn't support const scales scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); params.quant_data.affinePerChannel.channelDim = tensor->attr.dtype.channel_dim; params.quant_data.affinePerChannel.scaleCount = tensor->attr.dtype.scale_dim; @@ -3071,6 +3175,7 @@ static vsi_bool _init_dummy_tensor // it's symmetric quantized tensor. Fake a zp information filled with zero to meet low-level's // requirement null_zp = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( null_zp, "Create buffer fail.", final ); memset(null_zp, 0, sizeof(int32_t) * tensor->attr.dtype.scale_dim); params.quant_data.affinePerChannel.zeroPoint = null_zp; params.quant_data.affinePerChannel.zeroPointCount= tensor->attr.dtype.scale_dim; @@ -3092,6 +3197,7 @@ static vsi_bool _init_dummy_tensor tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim); + CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final ); memcpy(zeroPoints, tensor->attr.dtype.zero_points, tensor->attr.dtype.zero_points_dim * sizeof(int32_t)); diff --git a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h index d46138fa2..1937569fc 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h @@ -97,6 +97,16 @@ vsi_bool vsi_nn_is_stream_process_supported_types size_t input_num ); +vsi_bool vsi_nn_is_same_data_type( + vsi_nn_tensor_t * src, + vsi_nn_tensor_t * dst + ); + +vsi_bool vsi_nn_is_same_quant_type( + vsi_nn_tensor_t * src, + vsi_nn_tensor_t * dst + ); + #ifdef __cplusplus } #endif
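Note on the reworked vsi_nn_copy_tensor_patch above: the two new start/end parameters bound the copied region per dimension, and every call site touched by this patch passes NULL for both, which keeps the previous whole-tensor behaviour (start defaults to all zeros, end defaults to attr->size). Below is a minimal usage sketch, assuming the caller already holds a vx_tensor together with its vsi_nn_tensor_attr_t and the ovxlib tensor headers are included; the helper name and the choice of clipped dimension are illustrative, not part of this change.

/* Hypothetical helper (illustration only): copy just the first `rows` entries
 * of dimension 1 into caller-provided memory via the new start/end window. */
static vsi_status copy_first_rows
    (
    vx_tensor tensor,
    vsi_nn_tensor_attr_t * attr,
    void * buffer,
    vsi_size_t rows
    )
{
    vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 };
    vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 };
    vsi_size_t i = 0;

    /* Start at the origin and cover the full extent of every dimension,
     * mirroring what the NULL/NULL default path does internally. */
    for( i = 0; i < VSI_NN_MAX_DIM_NUM; i++ )
    {
        end[i] = attr->size[i];
    }
    /* Clip the second dimension to the requested number of rows. */
    if( attr->dim_num > 1 && rows < end[1] )
    {
        end[1] = rows;
    }

    return vsi_nn_copy_tensor_patch( tensor, attr, buffer, VX_READ_ONLY, start, end );
}

Because the implementation copies VSI_NN_MAX_DIM_NUM entries from both arrays before handing them to vsi_nn_copy_tensor_veiw_patch, the start and end buffers must be VSI_NN_MAX_DIM_NUM long even when the tensor has fewer dimensions.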