diff --git a/prebuilt-sdk/x86_64_linux/VERSION b/prebuilt-sdk/x86_64_linux/VERSION
index 79d5c1795..91123ad1e 100644
--- a/prebuilt-sdk/x86_64_linux/VERSION
+++ b/prebuilt-sdk/x86_64_linux/VERSION
@@ -1 +1 @@
-6.4.14_CL650117A_D650117_A648302_R647402_T648811_O646970
\ No newline at end of file
+6.4.15_CL690884A_D690855_A690484_R690194_T690259_O688896
\ No newline at end of file
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h
index 48f824f65..c49800a9f 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h
@@ -1340,6 +1340,21 @@ VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeCallback(vx_node node, vx_nodecom
*/
VX_API_ENTRY vx_nodecomplete_f VX_API_CALL vxRetrieveNodeCallback(vx_node node);
+/*! \brief Assigns a query callback to a node.
+ * If a callback is already assigned to this node, this function must return an error;
+ * the user may clear the existing callback by passing a NULL pointer as the callback.
+ * \param [in] node The reference to the node.
+ * \param [in] callback The callback to associate with completion of this
+ * specific node.
+ * \warning This must be used with extreme caution as it can \e ruin
+ * optimizations in the power/performance efficiency of a graph.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS Callback assigned; any other value indicates failure.
+ * \retval VX_ERROR_INVALID_REFERENCE node is not a valid \ref vx_node reference.
+ * \ingroup group_node_callback
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeQueryCallback(vx_node node, vx_nodequery_f callback);
+
/*! \brief Sets the node target to the provided value. A success invalidates the graph
* that the node belongs to (\ref vxVerifyGraph must be called before the next execution)
* \param [in] node The reference to the \ref vx_node object.
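A minimal usage sketch for the vxAssignNodeQueryCallback entry point added above. Only the NULL (clear) path is shown, because the vx_nodequery_f callback signature is not part of this excerpt; the helper name and the assumption that VX/vx_api.h is already included are illustrative.

/* Hedged sketch: clear any previously assigned query callback on a node and
 * propagate the status. Passing NULL is documented above as clearing the callback. */
static vx_status clear_node_query_callback(vx_node node)
{
    vx_status status = vxAssignNodeQueryCallback(node, NULL);
    if (status != VX_SUCCESS)
    {
        /* VX_ERROR_INVALID_REFERENCE means the node handle is not a valid vx_node. */
        return status;
    }
    return VX_SUCCESS;
}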
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
index d35396074..8a2ac76b1 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
@@ -503,6 +503,40 @@ enum vx_kernel_e {
VX_KERNEL_NN_BATCH_GEMM_RELU_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x33,
+ VX_KERNEL_NN_FUSED_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x34,
+
+ VX_KERNEL_NN_CONVOLUTION_RELU_POOLING_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x35,
+
+ VX_KERNEL_NN_LAYER_NORMALIZATION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x36,
+
+ VX_KERNEL_NN_INSTANCE_NORMALIZATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x37,
+
+ VX_KERNEL_NN_GROUP_NORMALIZATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x38,
+
+ VX_KERNEL_NN_LOGICAL_OPS_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x39,
+
+ VX_KERNEL_NN_LOGICAL_NOT_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x40,
+
+ VX_KERNEL_NN_RELATIONAL_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x41,
+
+ VX_KERNEL_NN_TENSOR_REDUCE_MAX = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x42,
+
+ VX_KERNEL_NN_MAXIMUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x43,
+
+ VX_KERNEL_NN_MINIMUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x44,
+
+ VX_KERNEL_NN_TENSOR_SELECT_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x45,
+
+ VX_KERNEL_NN_REDUCE_SUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x46,
+
+ VX_KERNEL_NN_GRU_CELL_ACTIVATION_Z_H_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x47,
+
+ VX_KERNEL_NN_GRU_CELL_H_TIMES_ACTIVATION_R_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x48,
+
+ VX_KERNEL_NN_GRU_CELL_RESET_AFTER_ACTIVATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x49,
+
+ VX_KERNEL_NN_LSTM_ACTIVATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x50,
+
VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */
};
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
index f3f019113..ec5d069ed 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
@@ -214,7 +214,7 @@ VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can suppor
1: support
*/
#ifndef VX_STREAM_PROCESSOR_SUPPORT
-#define VX_STREAM_PROCESSOR_SUPPORT 0
+#define VX_STREAM_PROCESSOR_SUPPORT 1
#endif
/*
@@ -258,5 +258,144 @@ VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can suppor
#define VX_ACTIVATION_EXT2_SUPPORT 1
#endif
+/*
+ VX_TENSORVIEW_ON_ANY_DIM is used to declare that ovxlib can optimize concat nodes on any dimension (not only channel) into tensor views when possible.
+ [value]
+ 0: disable
+ 1: enable
+*/
+#ifndef VX_TENSORVIEW_ON_ANY_DIM
+#define VX_TENSORVIEW_ON_ANY_DIM 0
+#endif
+
+/*
+VX_DEPTH2SPACE_CRD_MODE_SUPPORT is used to declare that DEPTH2SPACE can support CRD mode
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_DEPTH2SPACE_CRD_MODE_SUPPORT
+#define VX_DEPTH2SPACE_CRD_MODE_SUPPORT 1
+#endif
+
+/*
+ VX_LAYER_NORMALIZATION_VX_SUPPORT is used to declare that the driver supports the layer normalization layer.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_LAYER_NORMALIZATION_VX_SUPPORT
+#define VX_LAYER_NORMALIZATION_VX_SUPPORT 1
+#endif
+
+/*
+ VX_INSTANCE_NORMALIZATION_VX_SUPPORT is used to declare that the driver supports the instance normalization layer.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_INSTANCE_NORMALIZATION_VX_SUPPORT
+#define VX_INSTANCE_NORMALIZATION_VX_SUPPORT 1
+#endif
+
+/*
+ VX_GROUP_NORMALIZATION_VX_SUPPORT is used to declare that the driver supports the group normalization layer.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_GROUP_NORMALIZATION_VX_SUPPORT
+#define VX_GROUP_NORMALIZATION_VX_SUPPORT 1
+#endif
+
+/*
+ VX_LOGICAL_VX_SUPPORT is used to declare that the driver supports the logical ops layers.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_LOGICAL_VX_SUPPORT
+#define VX_LOGICAL_VX_SUPPORT 1
+#endif
+
+/*
+ VX_RELATIONAL_OPS_VX_SUPPORT is used to declare that the driver supports the relational ops layers.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_RELATIONAL_OPS_VX_SUPPORT
+#define VX_RELATIONAL_OPS_VX_SUPPORT 1
+#endif
+
+/*
+ VX_REDUCE_MAX_VX_SUPPORT is used to declare that the driver supports the reduce max layer.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_REDUCE_MAX_VX_SUPPORT
+#define VX_REDUCE_MAX_VX_SUPPORT 1
+#endif
+
+/*
+ VX_REDUCE_MEAN_VX_SUPPORT is used to declare that the driver supports the reduce mean layer.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_REDUCE_MEAN_VX_SUPPORT
+#define VX_REDUCE_MEAN_VX_SUPPORT 1
+#endif
+
+/*
+ VX_REDUCE_SUM_VX_SUPPORT is used to declare that the driver supports the reduce sum layer.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_REDUCE_SUM_VX_SUPPORT
+#define VX_REDUCE_SUM_VX_SUPPORT 1
+#endif
+
+/*
+ VX_MAX_MIN_IMUM_VX_SUPPORT is used to declare that the driver supports the maximum and minimum layers.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_MAX_MIN_IMUM_VX_SUPPORT
+#define VX_MAX_MIN_IMUM_VX_SUPPORT 1
+#endif
+
+/*
+ VX_TENSOR_SELECT_VX_SUPPORT is used to declare that the driver supports the tensor select layer.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_TENSOR_SELECT_VX_SUPPORT
+#define VX_TENSOR_SELECT_VX_SUPPORT 1
+#endif
+
+/*
+ VX_GRU_CELL_VX_SUPPORT is used to declare that the driver supports the gru cell layer.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_GRU_CELL_VX_SUPPORT
+#define VX_GRU_CELL_VX_SUPPORT 1
+#endif
+
+/*
+ VX_LSTM_ACTIVATION_SUPPORT is used to declare that the driver supports the lstm activation layer.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_LSTM_ACTIVATION_SUPPORT
+#define VX_LSTM_ACTIVATION_SUPPORT 1
+#endif
#endif /* __VX_KHR_COMPATIBLE_H__ */
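These capability macros are intended for compile-time feature gating. The sketch below shows the usual consumption pattern; the fallback path named in the comment is an illustrative assumption, not something taken from this header.

#include "VX/vx_khr_compatible.h"

#if VX_LAYER_NORMALIZATION_VX_SUPPORT
/* The driver exposes a native layer normalization node, so the op can be
 * mapped directly onto vxLayerNormalizationLayer. */
#else
/* Hypothetical fallback: compose the op from elementwise/reduction kernels. */
#endif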
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h
index a43a37ec2..49472870d 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h
@@ -395,6 +395,17 @@ enum vx_tensor_lifetime_type_e
VX_TENSOR_LIFE_TIME_DYNAMIC,
};
+/*! \brief Specifies the depth-to-space mode
+ * \ingroup group_cnn
+ */
+enum vx_nn_depth_to_space_mode_e
+{
+ /*! \brief DCR(default) for depth-column-row order re-arrangement */
+ VX_NN_DEPTH_TO_SPACE_DCR = 0x0,
+ /*! \brief CRD for column-row-depth order re-arrangement */
+ VX_NN_DEPTH_TO_SPACE_CRD,
+};
+
typedef struct _vx_nn_convolution_3d_params_t
{
vx_int32 padding_w_left; /*!< \brief Number of elements added at each side in the left of w dimension of the input. */
@@ -972,6 +983,16 @@ typedef struct _vx_nn_mean_params_t
vx_int32 keep_dims; /*!< \brief Keep dims, if positive, retains reduced dims with length 1 */
} vx_nn_mean_params_t;
+/*! \brief Input parameter for reduce sum layer
+* \ingroup group_cnn
+*\version 0.5
+*/
+typedef struct _vx_nn_sum_params_t
+{
+ vx_tensor axis; /*!< \brief 1D axis tensor of reduce dims */
+ vx_int32 keep_dims; /*!< \brief Keep dims, if positive, retains reduced dims with length 1 */
+} vx_nn_sum_params_t;
+
/*! \brief Input parameter for tensor squeeze layer
* \ingroup group_cnn
*\version 0.5
@@ -1254,6 +1275,12 @@ typedef struct _vx_nn_reorg_params_ext2_t
vx_int32 *axis;
} vx_nn_reorg_params_ext2_t;
+typedef struct _vx_nn_reorg_params_ext3_t
+{
+ vx_nn_reorg_params_ext2_t base; /*!< \brief vx_nn_reorg_params \ref vx_nn_reorg_params_t */
+    vx_enum mode; /*!< \brief [Optional] Only for DEPTH2SPACE; see \ref vx_nn_depth_to_space_mode_e */
+} vx_nn_reorg_params_ext3_t;
+
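A hedged sketch of how the new ext3 reorg parameter struct selects the CRD re-arrangement. The contents of the nested ext2/base fields depend on the full vx_nn_reorg_params_t definition, which is outside this excerpt, and the helper name is illustrative.

/* Hedged sketch: request CRD ordering for a DEPTH_TO_SPACE reorg node. */
static void select_crd_mode(vx_nn_reorg_params_ext3_t *reorg_params)
{
    /* base (ext2) fields are filled elsewhere, exactly as for a plain DEPTH_TO_SPACE reorg. */
    reorg_params->mode = VX_NN_DEPTH_TO_SPACE_CRD; /* column-row-depth order; DCR remains the default. */
}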
/*! \brief [Graph] Creates a Reorgnization Layer Node, Enhancement of vxReorgLayer, Support both DEPTH to SPACE and SPACE to DEPTH.
* \param [in] graph The reference to the parent graph.
* \param [in] input The input tensor data to reorg.
@@ -1911,6 +1938,21 @@ VX_API_ENTRY vx_node VX_API_CALL vxRPNLayer(
vx_tensor score_output
);
+/*! \brief Input parameters for a lstm activation operation.
+ * \ingroup group_cnn
+ * \version 0.3
+ */
+typedef struct _vx_nn_lstm_activation_params_t
+{
+ vx_int32 is_ln;
+ vx_int32 is_cifg;
+ vx_int32 is_proj;
+ vx_int32 is_hybrid;
+ vx_int32 is_peephole;
+ vx_int32 recurrent_activation;
+ vx_float32 forget_bias;
+} vx_nn_lstm_activation_params_t;
+
/*! \brief Input parameters for a lstm operation.
* \ingroup group_cnn
* \version 0.3
@@ -2115,6 +2157,28 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorMeanNode(
vx_size size_of_mean_param,
vx_tensor outputs);
+/*! \brief [Graph] Creates a reduce sum layer node.
+* \details
+* Computes the sum of elements across dimensions of a tensor.
+*
+* \param [in] graph The handle to the graph.
+* \param [in] input An n-D tensor, specifying the input.
+* \param [in] sum_params Parameters \ref vx_nn_sum_params_t .
+* \param [in] size_of_sum_param [static] The size of the vx_nn_sum_params_t.
+* \param [out] output An n-D tensor of the same type as input.
+* \return vx_node.
+* \returns A node reference \ref vx_node. Any possible errors preventing a
+* successful creation should be checked using \ref vxGetStatus.
+* \ingroup group_tensor
+* \version 0.5
+*/
+VX_API_ENTRY vx_node VX_API_CALL vxReduceSumNode(
+ vx_graph graph,
+ vx_tensor inputs,
+ const vx_nn_sum_params_t *sum_params,
+ vx_size size_of_sum_param,
+ vx_tensor outputs);
+
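A hedged usage sketch for the reduce sum node declared above. Tensor creation and axis-data upload are elided; the graph and tensor handles, and the helper name, are assumptions for illustration only.

/* axis_tensor: 1D tensor listing the dimensions to reduce (see vx_nn_sum_params_t). */
static vx_node make_reduce_sum(vx_graph graph, vx_tensor input, vx_tensor axis_tensor, vx_tensor output)
{
    vx_nn_sum_params_t sum_params;
    sum_params.axis = axis_tensor;
    sum_params.keep_dims = 1; /* positive: retain reduced dims with length 1 */
    return vxReduceSumNode(graph, input, &sum_params, sizeof(sum_params), output);
}

As the doc comment notes, any creation failure is reported through vxGetStatus on the returned node reference.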
/*! \brief [Graph] Creates squeeze layer node.
* \details
* Remove dimensions of size 1 from the input tensor.
@@ -2287,6 +2351,282 @@ VX_API_ENTRY vx_node VX_API_CALL vxConv3dLayer(vx_graph graph, vx_tensor inputs,
*/
VX_API_ENTRY vx_node VX_API_CALL vxDeconv3dLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_deconvolution_3d_params_t *convolution_params, vx_size size_of_deconv_params, vx_tensor outputs);
+/*! \brief [Graph] Creates a layer normalization Node.
+ * \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1.
+ * \param [in] graph The handle to the graph.
+ * \param [in] eps [static] Float 32. Small value added to the variance estimate to avoid division by zero. (default is 1e-5)
+ * \param [in] axis [static] The axis along which to normalize.
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [out] output [static] The output tensor data.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxLayerNormalizationLayer(
+ vx_graph graph,
+ vx_float32 eps,
+ vx_int32 axis,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_tensor output
+ );
+
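A hedged sketch of creating the layer normalization node above. The expected ordering of input_list (data plus any scale/bias tensors) is not specified in this excerpt, so a single input tensor is assumed purely for illustration.

static vx_node make_layer_norm(vx_graph graph, vx_tensor input, vx_tensor output)
{
    vx_tensor input_list[1];
    input_list[0] = input;
    /* eps = 1e-5 (the documented default), normalizing along axis 0. */
    return vxLayerNormalizationLayer(graph, 1e-5f, 0, input_list, 1, output);
}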
+/*! \brief [Graph] Creates a layer instance normalization Node.
+ * \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1.
+ * \param [in] graph The handle to the graph.
+ * \param [in] eps [static] Float 32. Small value added to the variance estimate to avoid division by zero. (default is 1e-5)
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [out] output [static] The output tensor data.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxInstanceNormalizationLayer(
+ vx_graph graph,
+ vx_float32 eps,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_tensor output
+ );
+
+/*! \brief [Graph] Creates a layer group normalization Node.
+ * \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1.
+ * \param [in] graph The handle to the graph.
+ * \param [in] eps [static] Float 32. Small value added to the variance estimate to avoid division by zero. (default is 1e-5)
+ * \param [in] group_num [static] Int 32. Number of groups for GN
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [out] output [static] The output tensor data.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxGroupNormalizationLayer(
+ vx_graph graph,
+ vx_float32 eps,
+ vx_int32 group_num,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_tensor output
+ );
+
+/*! \brief [Graph] Creates a layer logical ops Node.
+ * \details Returns the truth value of x AND, OR, or XOR y element-wise.
+ * \param [in] graph The handle to the graph.
+ * \param [in] ops_type [static] Int 32. Operation Type
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [out] output [static] The output tensor data.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxLogicalOpsLayer(
+ vx_graph graph,
+ vx_int32 ops_type,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_tensor output
+ );
+
+/*! \brief [Graph] Creates a layer logical not Node.
+ * \details Returns the truth value of NOT x element-wise.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input [static] The input tensor data.
+ * \param [out] output [static] The output tensor data.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxLogicalNotLayer(
+ vx_graph graph,
+ vx_tensor input,
+ vx_tensor output
+ );
+
+/*! \brief [Graph] Creates a layer relational Node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] ops_type [static] Int 32. Operation Type
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [out] output [static] The output tensor data.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxRelationalLayer(
+ vx_graph graph,
+ vx_int32 ops_type,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_tensor output
+ );
+
+/*! \brief [Graph] Computes the max of elements across dimensions of the input tensor.
+* \param [in] graph The handle to the graph.
+* \param [in] in input tensor data.
+* \param [in] axis [static] Determines the dimension across which the max is computed (dimension 0 means width, etc.). If not given, compute the max across all dimensions.
+* \param [in] keep_dim [static] Indicates whether to keep the dimension count.
+* \param [out] out output tensor data.
+* \ingroup group_tensor
+* \return vx_node.
+* \retval 0 Node could not be created.
+* \retval * Node handle.
+* \version 0.3
+*/
+VX_API_ENTRY vx_node VX_API_CALL vxTensorReduceMaxNode(
+ vx_graph graph,
+ vx_tensor inputs,
+ vx_tensor axis,
+ vx_bool keep_dims,
+ vx_tensor outputs);
+
+/*! \brief [Graph] Creates a layer minimum Node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [out] output [static] The output tensor data.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxMinimumLayer(
+ vx_graph graph,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_tensor output
+ );
+
+/*! \brief [Graph] Creates a layer maximum Node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [out] output [static] The output tensor data.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxMaximumLayer(
+ vx_graph graph,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_tensor output
+ );
+
+/*! \brief [Graph] Creates a layer select Node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [out] output [static] The output tensor data.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxTensorSelectLayer(
+ vx_graph graph,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_tensor output
+ );
+
+/*! \brief [Graph] Creates a layer gru cell activation z h Node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [in] recurrent_activation [static] recurrent activation type.
+ * \param [in] activation [static] activation type.
+ * \param [out] output_list [static] The output tensor data.
+ * \param [out] output_count [static] The output tensor number.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxGruCellActivationZHLayer(
+ vx_graph graph,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_int32 recurrent_activation,
+ vx_int32 activation,
+ vx_tensor* output_list,
+ vx_uint32 output_count
+ );
+
+/*! \brief [Graph] Creates a layer gru cell h times activation r Node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [in] recurrent_activation [static] recurrent activation type.
+ * \param [out] output_list [static] The output tensor data.
+ * \param [out] output_count [static] The output tensor number.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxGruCellHTimeActivationRLayer(
+ vx_graph graph,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_int32 recurrent_activation,
+ vx_tensor* output_list,
+ vx_uint32 output_count
+ );
+
+/*! \brief [Graph] Creates a layer gru cell reset after activation Node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [in] recurrent_activation [static] recurrent activation type.
+ * \param [in] activation [static] activation type.
+ * \param [out] output_list [static] The output tensor data.
+ * \param [out] output_count [static] The output tensor number.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxGruCellResetAfterActivationLayer(
+ vx_graph graph,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_int32 recurrent_activation,
+ vx_int32 activation,
+ vx_tensor* output_list,
+ vx_uint32 output_count
+ );
+
+/*! \brief [Graph] Creates a layer lstm activation Node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input_list [static] The input tensor data.
+ * \param [in] input_count [static] The input tensor number.
+ * \param [in] lstm_activation_param \ref vx_nn_lstm_activation_params_t .
+ * \param [out] output_list [static] The output tensor data.
+ * \param [out] output_count [static] The output tensor number.
+ * \return vx_node.
+ * \returns A node reference \ref vx_node. Any possible errors preventing a
+ * successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_cnn
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxLSTMActivationLayer(
+ vx_graph graph,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ const vx_nn_lstm_activation_params_t * lstm_activation_param,
+ vx_tensor* output_list,
+ vx_uint32 output_count
+ );
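A hedged sketch of wiring the LSTM activation node. The expected contents and ordering of input_list/output_list, and the numeric activation codes, are not documented in this excerpt and are left as assumptions; the helper name and the forget-bias value are illustrative.

static vx_node make_lstm_activation(vx_graph graph,
                                    vx_tensor* input_list, vx_uint32 input_count,
                                    vx_tensor* output_list, vx_uint32 output_count,
                                    vx_int32 recurrent_activation /* driver-defined activation code */)
{
    vx_nn_lstm_activation_params_t act_params = { 0 }; /* plain cell: no LN/CIFG/projection/peephole */
    act_params.recurrent_activation = recurrent_activation;
    act_params.forget_bias = 1.0f; /* common default; an assumption, not taken from this header */
    return vxLSTMActivationLayer(graph, input_list, input_count, &act_params,
                                 output_list, output_count);
}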
#ifdef __cplusplus
}
#endif
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h
index 6570e1d81..e824d55a7 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h
@@ -242,6 +242,48 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext7_t
vx_bool isSub;
} vx_nn_convolution_relu_pooling_params_ext7_t, * vx_nn_convolution_relu_pooling_params_ext7;
+typedef struct _vx_nn_fused_sp_params_t
+{
+ vx_enum multi_sp_kernel_type;
+ /*!*/
+ vx_scalar mul_scale;
+ /*!*/
+ union
+ {
+ struct
+ {
+ vx_scalar linear_a, linear_b;
+ } linear;
+ struct
+ {
+ vx_scalar tanh_a, tanh_b;
+ float a_v, b_v;
+ } tanh_linear;
+ struct
+ {
+ vx_scalar hsigmoid_a, hsigmoid_b;
+ } hsigmoid;
+ struct
+ {
+ vx_scalar clip_a, clip_b;
+ } clip;
+ struct
+ {
+ vx_scalar scalar_a, scalar_b, scalar_c, scalar_d;
+ } params;
+ } scalar_params;
+ /*!*/
+} vx_nn_fused_sp_params_t, * vx_nn_fused_sp_params;
+
+typedef struct _vx_nn_convolution_relu_pooling_params_sp_ext_t
+{
+ vx_nn_convolution_relu_pooling_params_ext4_t ext4; /*!< \brief convolution relu pooling params \ref vx_nn_convolution_relu_pooling_params_ext_t */
+ vx_object_array inputs_list;
+ vx_object_array outputs_list;
+ vx_nn_fused_sp_params_t sp_param;
+
+} vx_nn_convolution_relu_pooling_params_sp_ext_t, * vx_nn_convolution_relu_pooling_params_sp_ext;
+
/*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and Pooling Layer Node, this fucntion match kronos NN Extension 1.2 verion.
* \details This function implement Convolutional Network Convolution and Activation(Relu) and Pooling layer.
* For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined,
@@ -1129,6 +1171,48 @@ VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmReluPoolingLayer(vx_graph graph,
const vx_nn_gemm_relu_pooling_params merge_param,
vx_tensor output);
+/*! \brief Creates a fused stream processor node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] input_list input tensor list.
+ * \param [in] input_count input tensor number.
+ * \param [out] output_list output tensor list.
+ * \param [out] output_count output tensor number.
+ * \param [in] params the parameters for multi stream processor merging.
+ * \return \ref vx_node.
+ * \retval vx_node A node reference. Any possible errors preventing a successful creation
+ * should be checked using \ref vxGetStatus
+ * \ingroup group_vision_function_sp
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxFusedSpNode(
+ vx_graph graph,
+ vx_tensor* input_list,
+ vx_uint32 input_count,
+ vx_tensor* output_list,
+ vx_uint32 output_count,
+ const vx_nn_fused_sp_params_t * params
+ );
+
+/*! \brief Creates a convolution fused stream processor node.
+ * \param [in] graph The handle to the graph.
+ * \param [in] inputs input tensor.
+ * \param [in] weights_biases [static] Point to WeightBiasesParameter data, vx_weights_biases_parameter is an opaque reference.
+ * \param [in] convolution_relu_pooling_params [static] Pointer to parameters of type \ref vx_nn_convolution_relu_pooling_params_t
+ * \param [in] size_of_convolution_relu_pooling_params [static] Size in bytes of convolution_relu_pooling_params.
+ * \param [in] outputs output tensor.
+ * \return \ref vx_node.
+ * \retval vx_node A node reference. Any possible errors preventing a successful creation
+ * should be checked using \ref vxGetStatus
+ * \ingroup group_vision_function_sp
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxConvSpNode(
+ vx_graph graph,
+ vx_tensor inputs,
+ vx_weights_biases_parameter weights_biases,
+ const vx_nn_convolution_relu_pooling_params_t * convolution_relu_pooling_params,
+ vx_size size_of_convolution_relu_pooling_params,
+ vx_tensor outputs
+);
+
#ifdef __cplusplus
}
#endif
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h
index 36df37487..38d2223a4 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h
@@ -345,16 +345,6 @@ VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINST(
vx_context context
);
-/*! \brief Creates an internal reference to a spinst data.
- * \param [in] context The reference to the implementation context.
- * \return A spinst data reference.
- * \Any possible errors preventing a successful creation should be checked using \ref vxGetStatus.
- * \ingroup group_object_spinst
- */
-VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINSTInternal(
- vx_context context
- );
-
/*! \brief Releases a reference to a external spinst object.
* The object may not be garbage collected until its total reference count is zero.
* \param [in] spinst_obj The pointer to the spinst data to release.
@@ -368,19 +358,6 @@ VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINST(
vx_spinst *spinst_obj
);
-/*! \brief Releases a reference to a internal spinst object.
- * The object may not be garbage collected until its total reference count is zero.
- * \param [in] spinst_obj The pointer to the spinst data to release.
- * \post After returning from this function the reference is zeroed.
- * \return A \ref vx_status_e enumeration.
- * \retval VX_SUCCESS No errors; all other values indicate failure
- * \retval * An error occurred. See \ref vx_status_e.
- * \ingroup group_object_spinst
- */
-VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINSTInternal(
- vx_spinst *spinst_obj
- );
-
/*! \brief Add a instruction to spinst object.
* \param [in] spinst_obj The reference to the spinst object.
* \param [in] inst_unit_array The units of one instruction. Use a \ref vx_spinst_unit_param.
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
index 6f75ea9db..eefa39ce5 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
@@ -477,6 +477,8 @@ enum vx_type_e {
VX_TYPE_SPINST = 0x81B,/*!< \brief A \ref vx_spinst. */
VX_TYPE_INT4 = 0x81C,/*!< \brief A \ref signed 4bits tensor.. */
VX_TYPE_UINT4 = 0x81D,/*!< \brief A \ref unsigned 4bits tensor.. */
+ VX_TYPE_FLOAT8_E4M3 = 0x81E,/*!< \brief A \ref vx_float8_e4m3. */
+ VX_TYPE_FLOAT8_E5M2 = 0x81F,/*!< \brief A \ref vx_float8_e5m2. */
};
/*! \brief The enumeration of all status codes.
@@ -803,6 +805,8 @@ enum vx_convert_policy_e {
VX_CONVERT_POLICY_WRAP = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x0,
/*! \brief Results are saturated to the bit depth of the output operand. */
VX_CONVERT_POLICY_SATURATE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x1,
+    /*! \brief Results preserve infinity and NaN values. */
+ VX_CONVERT_POLICY_INF = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_CONVERT_POLICY) + 0x0,
};
/*! \brief Based on the VX_DF_IMAGE definition.
diff --git a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so
index 0e2036813..40b91d016 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so and b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libCLC.so b/prebuilt-sdk/x86_64_linux/lib/libCLC.so
index 9c8839038..a50839e36 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libCLC.so and b/prebuilt-sdk/x86_64_linux/lib/libCLC.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so
index 96a5ab43d..201f51c15 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so and b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libGAL.so b/prebuilt-sdk/x86_64_linux/lib/libGAL.so
index 06525dac1..fa303327d 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libGAL.so and b/prebuilt-sdk/x86_64_linux/lib/libGAL.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so
index 1566bab34..fee4a57db 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so and b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0
index 71f33843a..b8a0d961d 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so
index 9b7e0caf8..cfa02ae3a 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libVSC.so b/prebuilt-sdk/x86_64_linux/lib/libVSC.so
index 1bafe16b3..e482f3097 100755
Binary files a/prebuilt-sdk/x86_64_linux/lib/libVSC.so and b/prebuilt-sdk/x86_64_linux/lib/libVSC.so differ
diff --git a/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so b/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so
index 628f663a4..0deaff134 100644
Binary files a/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so and b/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so differ
diff --git a/src/tim/vx/internal/include/custom/custom_node_type.def b/src/tim/vx/internal/include/custom/custom_node_type.def
index 90d772799..c5ef3e04a 100644
--- a/src/tim/vx/internal/include/custom/custom_node_type.def
+++ b/src/tim/vx/internal/include/custom/custom_node_type.def
@@ -6,3 +6,6 @@ DEF_NODE_TYPE(custom_ainr_denoise_postprocess)
DEF_NODE_TYPE(custom_warp_affine)
DEF_NODE_TYPE(custom_warp_perspective)
DEF_NODE_TYPE(custom_sample)
+DEF_NODE_TYPE(custom_tiny_yolov4_postprocess)
+DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence)
+DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box)
diff --git a/src/tim/vx/internal/include/custom/custom_ops.def b/src/tim/vx/internal/include/custom/custom_ops.def
index 00504392c..2074b8f30 100644
--- a/src/tim/vx/internal/include/custom/custom_ops.def
+++ b/src/tim/vx/internal/include/custom/custom_ops.def
@@ -6,3 +6,6 @@ DEF_OP(CUSTOM_AINR_DENOISE_POSTPROCESS)
DEF_OP(CUSTOM_WARP_AFFINE)
DEF_OP(CUSTOM_WARP_PERSPECTIVE)
DEF_OP(CUSTOM_SAMPLE)
+DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS)
+DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE)
+DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX)
diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h
new file mode 100644
index 000000000..5234d56d6
--- /dev/null
+++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h
@@ -0,0 +1,47 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_H
+#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_param
+{
+ struct _custom_tiny_yolov4_postprocess_local_data_t* local;
+ // Add parameters here
+} vsi_nn_custom_tiny_yolov4_postprocess_param;
+_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_param, local) == 0, \
+ vsi_nn_custom_tiny_yolov4_postprocess_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h
new file mode 100644
index 000000000..854c3a9e1
--- /dev/null
+++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h
@@ -0,0 +1,49 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX_H
+#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_box_param
+{
+ struct _custom_tiny_yolov4_postprocess_box_local_data_t* local;
+ // Add parameters here
+ float bias_0;
+ float bias_1;
+} vsi_nn_custom_tiny_yolov4_postprocess_box_param;
+_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_box_param, local) == 0, \
+ vsi_nn_custom_tiny_yolov4_postprocess_box_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h
new file mode 100644
index 000000000..181595289
--- /dev/null
+++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h
@@ -0,0 +1,47 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_H
+#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_H
+
+#include "vsi_nn_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_confidence_param
+{
+ struct _custom_tiny_yolov4_postprocess_confidence_local_data_t* local;
+ // Add parameters here
+} vsi_nn_custom_tiny_yolov4_postprocess_confidence_param;
+_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_confidence_param, local) == 0, \
+ vsi_nn_custom_tiny_yolov4_postprocess_confidence_h );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h
index 815a064fc..adf769f7f 100644
--- a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h
+++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h
@@ -38,6 +38,7 @@ typedef struct _vsi_nn_custom_warp_affine_param
const float *matrix;
vsi_enum type;
int32_t size[2];
+ vsi_enum rgb_type;
} vsi_nn_custom_warp_affine_param;
_compiler_assert(offsetof(vsi_nn_custom_warp_affine_param, local) == 0, \
vsi_nn_custom_warp_affine_h );
diff --git a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h
index 8976be307..eb23a2055 100644
--- a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h
+++ b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h
@@ -31,5 +31,8 @@
#include "custom/ops/vsi_nn_op_custom_warp_affine.h"
#include "custom/ops/vsi_nn_op_custom_warp_perspective.h"
#include "custom/ops/vsi_nn_op_custom_sample.h"
+#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h"
+#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h"
+#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h"
#endif
diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def
index 82d843fc5..0753df06d 100755
--- a/src/tim/vx/internal/include/interface/ops.def
+++ b/src/tim/vx/internal/include/interface/ops.def
@@ -193,3 +193,4 @@ DEF_OP(REVERSESEQUENCE)
DEF_OP(INVERSE_SIGMOID)
DEF_OP(GRID_SAMPLE)
DEF_OP(LPNORM)
+DEF_OP(RESIZE_3D)
diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def
old mode 100755
new mode 100644
index de3332709..a47559a3a
--- a/src/tim/vx/internal/include/internal/internal_ops.def
+++ b/src/tim/vx/internal/include/internal/internal_ops.def
@@ -20,4 +20,3 @@ DEF_OP(SPACE2DEPTH_INTERNAL)
DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R)
DEF_OP(GRUCELL_ACTIVATION_Z_H)
DEF_OP(REDUCE_MEAN_INTERNAL)
-DEF_OP(BILINEAR_GRID_SAMPLE)
diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h
index c118e137f..5150b0e4a 100644
--- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h
+++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h
@@ -79,6 +79,8 @@ typedef enum
BOOL8,
I4,
U4,
+ FP8_E4M3,
+ FP8_E5M2,
} VSI_PUBLIC_TYPE vsi_nn_kernel_dtype_e;
typedef enum
@@ -89,6 +91,8 @@ typedef enum
VSI_NN_KERNEL_QUANT_ASYMM_PERCHANNEL,
VSI_NN_KERNEL_QUANT_SYMM,
VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL,
+ VSI_NN_KERNEL_QUANT_FLOAT8,
+ VSI_NN_KERNEL_QUANT_FLOAT8_PERCHANNEL,
VSI_NN_KERNEL_QUANT_TYPE_NUM
} vsi_nn_kernel_quant_type_e;
@@ -522,6 +526,10 @@ static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
return BF16;
case VSI_NN_TYPE_FLOAT32:
return F32;
+ case VSI_NN_TYPE_FLOAT8_E4M3:
+ return FP8_E4M3;
+ case VSI_NN_TYPE_FLOAT8_E5M2:
+ return FP8_E5M2;
default:
VSILOGE("error data type %d", dtype);
break;
@@ -579,6 +587,8 @@ static VSI_INLINE_API size_t vsi_nn_kernel_dtype_get_bytes
case I8:
case U8:
case BOOL8:
+ case FP8_E4M3:
+ case FP8_E5M2:
return sizeof(int8_t);
case I16:
case U16:
@@ -611,6 +621,8 @@ static VSI_INLINE_API vsi_size_t vsi_nn_kernel_dtype_get_bits
case I8:
case U8:
case BOOL8:
+ case FP8_E4M3:
+ case FP8_E5M2:
return 8;
case I16:
case U16:
@@ -879,7 +891,7 @@ static VSI_INLINE_API void vsi_nn_kernel_tensor_attr_get_stride
shape = attr->shape->data;
type_bits = vsi_nn_kernel_dtype_get_bits( attr->dtype );
- if ( type_bits < BITS_PER_BYTE )
+ if ( type_bits < BITS_PER_BYTE && type_bits != 0)
{
vsi_size_t i;
diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h
index cfecfd1fe..c834d040e 100644
--- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h
+++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h
@@ -91,4 +91,21 @@ vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape
vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, vsi_size_t max_size
);
+vsi_bool vsi_nn_kernel_optimize_matrixmul_broadcast_shape
+ (
+ const vsi_size_t * shape_x,
+ const vsi_size_t * shape_y,
+ const vsi_size_t * shape_output,
+ vsi_size_t rank_x,
+ vsi_size_t rank_y,
+ vsi_size_t rank_out,
+ vsi_size_t* out_shape_x,
+ vsi_size_t* out_shape_y,
+ vsi_size_t* out_shape_output,
+ uint32_t* new_rank,
+ uint32_t* cross_flg,
+ uint32_t* size_axis_inner_outer,
+ uint32_t* strides_axis_inner_outer
+ );
+
#endif
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h
index 3f614139a..749a432e7 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h
@@ -82,6 +82,12 @@ typedef struct _vsi_nn_pre_process_param
vsi_nn_pre_process_type_e type;
+ struct
+ {
+ float mean[3];
+ float scale[3];
+ } norm2;
+
vsi_nn_pre_process_lcl_data *local;
} vsi_nn_pre_process_param;
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h
index d01fba846..d2772b5c1 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h
@@ -65,6 +65,10 @@ typedef struct _vsi_nn_pre_process_bgra_param
vsi_bool reverse_channel;
+ float r_scale;
+ float g_scale;
+ float b_scale;
+
/* pre process rgb layer local data structure */
vsi_nn_pre_process_bgra_lcl_data local;
} vsi_nn_pre_process_bgra_param;
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h
index aa8fc820f..34c5a6de6 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h
@@ -70,6 +70,10 @@ typedef struct _vsi_nn_pre_process_nv12_param
vsi_nn_pre_process_nv12_lcl_data* local;
vsi_nn_nv_type nv_type;
+
+ float r_scale;
+ float g_scale;
+ float b_scale;
} vsi_nn_pre_process_nv12_param;
#ifdef __cplusplus
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h
index da52fa0d2..9e05a5966 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h
@@ -76,6 +76,9 @@ typedef struct _vsi_nn_pre_process_rgb_param
vsi_bool reverse_channel;
+ float r_scale;
+ float g_scale;
+ float b_scale;
/* pre process rgb layer local data structure */
vsi_nn_pre_process_rgb_lcl_data local;
} vsi_nn_pre_process_rgb_param;
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h
index f384e4fb3..171df70c3 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h
@@ -53,6 +53,15 @@ typedef struct _vsi_nn_pre_process_rgb888_planar_param
float g_mean;
float b_mean;
float scale;
+
+
+ vsi_bool reverse_channel;
+ vsi_bool enable_rgb88_planar_nhwc;
+
+ float r_scale;
+ float g_scale;
+ float b_scale;
+
} vsi_nn_pre_process_rgb888_planar_param;
_compiler_assert(offsetof(vsi_nn_pre_process_rgb888_planar_param, local) == 0, \
vsi_nn_pre_process_rgb888_planar_h );
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h
index 998de5ee2..2ceabcb75 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h
@@ -66,6 +66,11 @@ typedef struct _vsi_nn_pre_process_yuv420_param
float rgb_scale;
vsi_bool reverse_channel;
+
+ float r_scale;
+ float g_scale;
+ float b_scale;
+
/* local data must be the first. */
vsi_nn_pre_process_yuv420_lcl_data local;
} vsi_nn_pre_process_yuv420_param;
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h
index b516e6016..1ca45170c 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h
@@ -71,6 +71,10 @@ typedef struct _vsi_nn_pre_process_yuv422_param
float rgb_scale;
vsi_bool reverse_channel;
+
+ float r_scale;
+ float g_scale;
+ float b_scale;
} vsi_nn_pre_process_yuv422_param;
#ifdef __cplusplus
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h
index c4391773e..7b2658968 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h
@@ -66,6 +66,10 @@ typedef struct _vsi_nn_pre_process_yuv444_param
float rgb_scale;
vsi_bool reverse_channel;
+
+ float r_scale;
+ float g_scale;
+ float b_scale;
/* local data must be the first. */
vsi_nn_pre_process_yuv444_lcl_data* local;
} vsi_nn_pre_process_yuv444_param;
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_3d.h
similarity index 76%
rename from src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h
rename to src/tim/vx/internal/include/ops/vsi_nn_op_resize_3d.h
index d04c589a9..0771a71f0 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_3d.h
@@ -22,8 +22,8 @@
*
*****************************************************************************/
-#ifndef _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H
-#define _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H
+#ifndef _VSI_NN_OP_RESIZE_3D_H
+#define _VSI_NN_OP_RESIZE_3D_H
#include "vsi_nn_types.h"
@@ -31,17 +31,19 @@
extern "C" {
#endif
+typedef struct _vsi_nn_resize_3d_local_data {
+ vsi_bool use_internal_node;
+} vsi_nn_resize_3d_local_data;
-typedef struct _vsi_nn_bilinear_grid_sample_param
+typedef struct _vsi_nn_resize_3d_param
{
- struct _bilinear_grid_sample_local_data_t* local;
- vsi_bool align_corners;
- vsi_nn_pad_mode_e padding_mode;
- int32_t const_val;
-} vsi_nn_bilinear_grid_sample_param;
-
-_compiler_assert(offsetof(vsi_nn_bilinear_grid_sample_param, local) == 0, \
- vsi_nn_bilinear_grid_sample_h );
+ vsi_nn_resize_3d_local_data* lcl_data;
+ vsi_enum type;
+ float factor;
+ int32_t size[3];
+ vsi_bool align_corners;
+ vsi_bool half_pixel_centers;
+} vsi_nn_resize_3d_param;
#ifdef __cplusplus
}
diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h
index 7ab6ff2dd..bccc0b5e5 100644
--- a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h
@@ -33,6 +33,7 @@ extern "C" {
typedef struct _vsi_nn_topk_param
{
uint32_t k;
+ int32_t axis;
} vsi_nn_topk_param;
#ifdef __cplusplus
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h
index d7e598395..6446cd046 100644
--- a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h
@@ -52,7 +52,9 @@ enum {
D_BF16 = VSI_NN_TYPE_BFLOAT16,
D_BOOL8 = VSI_NN_TYPE_BOOL8,
D_I4 = VSI_NN_TYPE_INT4,
- D_U4 = VSI_NN_TYPE_UINT4
+ D_U4 = VSI_NN_TYPE_UINT4,
+ D_F8_E4M3 = VSI_NN_TYPE_FLOAT8_E4M3,
+ D_F8_E5M2 = VSI_NN_TYPE_FLOAT8_E5M2
};
/* short alias for qtype */
@@ -63,6 +65,8 @@ enum {
Q_ASYM = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC << Q_SHIFT,
Q_SYM_PC = VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC << Q_SHIFT,
Q_SYM = VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC << Q_SHIFT,
+ Q_FP8 = VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 << Q_SHIFT,
+ Q_FP8_PC = VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 << Q_SHIFT,
};
typedef struct {
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h
index ab63a3c70..367ff88fb 100644
--- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h
@@ -27,6 +27,7 @@
#include "vsi_nn_types.h"
#include "vsi_nn_math.h"
#include "vsi_nn_tensor.h"
+#include "vsi_nn_log.h"
#ifdef __cplusplus
extern "C" {
@@ -78,6 +79,8 @@ static VSI_INLINE_API vsi_bool type_is_signed
case VSI_NN_TYPE_FLOAT32:
case VSI_NN_TYPE_FLOAT64:
case VSI_NN_TYPE_BFLOAT16:
+ case VSI_NN_TYPE_FLOAT8_E4M3:
+ case VSI_NN_TYPE_FLOAT8_E5M2:
ret = TRUE;
break;
default:
@@ -93,9 +96,14 @@ static VSI_INLINE_API uint32_t type_get_bytes
{
switch( type )
{
+ case VSI_NN_TYPE_INT4:
+ case VSI_NN_TYPE_UINT4:
+ return 0;
case VSI_NN_TYPE_INT8:
case VSI_NN_TYPE_UINT8:
case VSI_NN_TYPE_BOOL8:
+ case VSI_NN_TYPE_FLOAT8_E4M3:
+ case VSI_NN_TYPE_FLOAT8_E5M2:
return 1;
case VSI_NN_TYPE_INT16:
case VSI_NN_TYPE_UINT16:
@@ -111,7 +119,8 @@ static VSI_INLINE_API uint32_t type_get_bytes
case VSI_NN_TYPE_FLOAT64:
return 8;
default:
- return 0;
+ VSILOGE("unsupported type: %d", type);
+ return 1;
}
} /* type_get_bytes() */
@@ -128,6 +137,8 @@ static VSI_INLINE_API uint32_t type_get_bits
case VSI_NN_TYPE_INT8:
case VSI_NN_TYPE_UINT8:
case VSI_NN_TYPE_BOOL8:
+ case VSI_NN_TYPE_FLOAT8_E4M3:
+ case VSI_NN_TYPE_FLOAT8_E5M2:
return 8;
case VSI_NN_TYPE_INT16:
case VSI_NN_TYPE_UINT16:
@@ -143,7 +154,8 @@ static VSI_INLINE_API uint32_t type_get_bits
case VSI_NN_TYPE_FLOAT64:
return 64;
default:
- return 0;
+ VSILOGE("unsupported type: %d", type);
+ return 1;
}
} /* type_get_bits() */
@@ -236,6 +248,7 @@ static VSI_INLINE_API float affine_to_fp32
)
{
float data;
+ VSI_UNREFERENCED(type);
data = ( (float)val - zero_point ) * scale;
return data;
} /* affine_to_fp32() */
@@ -279,6 +292,7 @@ static VSI_INLINE_API float dfp_to_fp32
)
{
float result;
+ VSI_UNREFERENCED(type);
if( fl > 0 )
{
result = (float)val * ( 1.0f / ( (float) ( (int64_t)1 << fl ) ) );
@@ -440,6 +454,139 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
return out;
} /* fp32_to_bfp16_rtne */
+#define FLOAT_BIAS_EXPONENT 127
+#define FLOAT_EXPONENT_SIZE 8
+#define FLOAT_MANTISSA_SIZE 23
+#define FLOAT8_E4M3_BIAS_EXPONENT 7
+#define FLOAT8_E4M3_EXPONENT_SIZE 4
+#define FLOAT8_E4M3_MANTISSA_SIZE 3
+#define FLOAT8_E5M2_BIAS_EXPONENT 15
+#define FLOAT8_E5M2_EXPONENT_SIZE 5
+#define FLOAT8_E5M2_MANTISSA_SIZE 2
+
+static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) {
+ float fp8_f32 = in / scale;
+ int32_t fp8_i32 = *((int32_t*)&fp8_f32);
+ //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 3) - 1));
+ int32_t eps = 1 << (23 - 3 - 1);
+ fp8_i32 += eps;
+ //fp8_i32 &= mask;
+ {
+ int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1;
+ int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff;
+ int expShiftValue = FLOAT8_E4M3_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT;
+ int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7;
+
+ exp = (exp + expShiftValue) & 0xF;
+
+ return (uint8_t)(sign << 7 | exp << 3 | mantissa);
+ }
+} /* fp32_to_fp8_e4m3() */
+
+static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) {
+ float fp8_f32 = in / scale;
+ int32_t fp8_i32 = *((int32_t*)&fp8_f32);
+ //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 2) - 1));
+ int32_t eps = 1 << (23 - 2 - 1);
+ fp8_i32 += eps;
+ //fp8_i32 &= mask;
+ {
+ int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1;
+ int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff;
+ int expShiftValue = FLOAT8_E5M2_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT;
+ int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x3;
+
+ exp = (exp + expShiftValue) & 0x1F;
+
+ return (uint8_t)(sign << 7 | exp << 2 | mantissa);
+ }
+} /* fp32_to_fp8_e5m2() */
+
+static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
+ float val_fp32;
+
+ uint32_t signOut = 0;
+ uint32_t exponentOut = 0;
+ uint32_t mantissaOut = 0;
+ uint32_t out_u = 0;
+
+ uint32_t signIn;
+ uint32_t exponentIn;
+ uint32_t mantissaIn;
+ int expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E4M3_BIAS_EXPONENT;
+
+ signIn = (in >> (FLOAT8_E4M3_EXPONENT_SIZE + FLOAT8_E4M3_MANTISSA_SIZE)) & 0x1;
+ exponentIn = (in >> FLOAT8_E4M3_MANTISSA_SIZE) & 0xF;
+ mantissaIn = in & 0x7;
+
+ signOut = signIn;
+
+ if (exponentIn == 0 && mantissaIn == 0)
+ {
+ goto final;
+ }
+
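+    /* E4M3 has no infinities: exponent 0xF with mantissa 0x7 is NaN and is mapped to a quiet FP32 NaN below. */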
+ if (exponentIn == 0xf && mantissaIn == 0x7)
+ {
+ exponentOut = 0xff;
+ mantissaOut = 0x400000;
+ goto final;
+ }
+
+ exponentOut = (exponentIn + expShiftValue) & 0xff;
+ mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff;
+
+
+final:
+ out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
+ val_fp32 = *((float*)&out_u);
+
+ return val_fp32 * scale;
+} /* fp8_e4m3_to_fp32() */
+
+static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) {
+ float val_fp32;
+
+ uint32_t signOut = 0;
+ uint32_t exponentOut = 0;
+ uint32_t mantissaOut = 0;
+ uint32_t out_u = 0;
+
+ uint32_t signIn;
+ uint32_t exponentIn;
+ uint32_t mantissaIn;
+ int expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E5M2_BIAS_EXPONENT;
+
+ signIn = (in >> 7) & 0x1;
+ exponentIn = (in >> 2) & 0x1F;
+ mantissaIn = in & 0x3;
+
+ signOut = signIn;
+
+ if (exponentIn == 0 && mantissaIn == 0)
+ {
+ goto final;
+ }
+
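+    /* The all-ones encoding (exponent 0x1F, mantissa 0x3) is mapped to a quiet FP32 NaN below. */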
+ if (exponentIn == 0x1f && mantissaIn == 0x3)
+ {
+ exponentOut = 0xff;
+ mantissaOut = 0x400000;
+ goto final;
+ }
+
+
+ exponentOut = (exponentIn + expShiftValue) & 0xff;
+ mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff;
+
+
+final:
+ out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
+ val_fp32 = *((float*)&out_u);
+
+ return val_fp32 * scale;
+} /* fp8_e5m2_to_fp32() */
+
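A minimal round-trip sketch using the E4M3 helpers above (the scale and the test value are arbitrary assumptions, not part of the patch's API surface):

static VSI_INLINE_API void fp8_e4m3_roundtrip_example( void )
{
    const float scale = 1.0f;                    /* assumed per-tensor scale */
    float src = 1.625f;                          /* exactly representable with a 3-bit mantissa */
    uint8_t q = fp32_to_fp8_e4m3( src, scale );  /* pack sign/exponent/mantissa into 8 bits */
    float dq = fp8_e4m3_to_fp32( q, scale );     /* unpack and re-apply the scale */
    (void)dq;                                    /* dq equals 1.625f for this input */
}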
static VSI_INLINE_API vsi_status dtype_to_float32
(
uint8_t *src,
@@ -458,6 +605,12 @@ static VSI_INLINE_API vsi_status dtype_to_float32
case VSI_NN_TYPE_BFLOAT16:
*dst = bfp16_to_fp32( *(int16_t *)src );
break;
+ case VSI_NN_TYPE_FLOAT8_E4M3:
+ *dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale);
+ break;
+ case VSI_NN_TYPE_FLOAT8_E5M2:
+ *dst = fp8_e5m2_to_fp32(*(int8_t *)src, src_dtype->scale);
+ break;
case VSI_NN_TYPE_INT4:
case VSI_NN_TYPE_UINT4:
case VSI_NN_TYPE_INT8:
@@ -511,6 +664,12 @@ static VSI_INLINE_API vsi_status float32_to_dtype
case VSI_NN_TYPE_BFLOAT16:
*(int16_t *)dst = fp32_to_bfp16_rtne( src );
break;
+ case VSI_NN_TYPE_FLOAT8_E4M3:
+ *(int8_t *)dst = fp32_to_fp8_e4m3(src, dst_dtype->scale);
+ break;
+ case VSI_NN_TYPE_FLOAT8_E5M2:
+ *(int8_t *)dst = fp32_to_fp8_e5m2(src, dst_dtype->scale);
+ break;
case VSI_NN_TYPE_INT4:
case VSI_NN_TYPE_UINT4:
case VSI_NN_TYPE_INT8:
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_link_list.h b/src/tim/vx/internal/include/utils/vsi_nn_link_list.h
index 7e6afb2ea..2c800a152 100644
--- a/src/tim/vx/internal/include/utils/vsi_nn_link_list.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_link_list.h
@@ -30,7 +30,7 @@
extern "C"{
#endif
-#define vsi_nn_LinkListInitRoot(n) do{n = NULL;} while (0);
+#define vsi_nn_LinkListInitRoot(n) {n = NULL;}
typedef struct _vsi_nn_link_list
{
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_math.h b/src/tim/vx/internal/include/utils/vsi_nn_math.h
index b8a6d2a9a..924ddf004 100644
--- a/src/tim/vx/internal/include/utils/vsi_nn_math.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_math.h
@@ -53,12 +53,13 @@ extern "C" {
#define DEFINE_ARRAY_TYPE( NAME, TYPE ) \
typedef struct { \
size_t size; \
- TYPE data[0]; \
+ TYPE *data; \
} vsi_##NAME##_array_t; \
static VSI_INLINE_API vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \
- vsi_##NAME##_array_t * array = (vsi_##NAME##_array_t *)malloc( \
- sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \
+ vsi_##NAME##_array_t * array = NULL; \
+ array = (vsi_##NAME##_array_t *)malloc( sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \
if (array == NULL) return NULL; \
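+        /* data is the last struct member, so (&data + 1) is where the extra sizeof(TYPE) * size bytes start. */ \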
+ array->data = (TYPE *)(((TYPE**)(&(array->data))) + 1); \
array->size = size; \
return array; \
} \
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h
index f939592b0..128e7d0c5 100644
--- a/src/tim/vx/internal/include/utils/vsi_nn_util.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h
@@ -50,14 +50,23 @@ extern "C" {
free( _PTR ); _PTR = NULL; }
#define vsi_safe_release_tensor(_t) if(_t){vsi_nn_ReleaseTensor(&(_t)); _t = NULL;}
-
-#define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe)
+#if (defined(_WIN32) || defined(__WIN32__) || defined(WIN32))
+ #if defined(_WIN64)
+ #define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe)
+ #else
+ #define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffe)
+ #endif
+#else
+ #define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe)
+#endif
#define FOREACH_ARGS(_args, _next, _arg_type) \
while(((_arg_type)((size_t)END_OF_VARIADIC_ARGUMENTS)) != (_next = va_arg(_args, _arg_type)))
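A minimal sketch of the sentinel pattern FOREACH_ARGS expects, using a hypothetical variadic helper and assuming <stdarg.h> is available:

/* Hypothetical helper: release every tensor pointer passed in, stopping at the sentinel. */
static void release_tensors( vsi_nn_tensor_t * first, ... )
{
    va_list args;
    vsi_nn_tensor_t * next = NULL;
    va_start( args, first );
    vsi_safe_release_tensor( first );
    FOREACH_ARGS( args, next, vsi_nn_tensor_t * )
    {
        vsi_safe_release_tensor( next );
    }
    va_end( args );
}
/* Call site: release_tensors( t0, t1, t2, (vsi_nn_tensor_t *)END_OF_VARIADIC_ARGUMENTS ); */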
#define BITS_PER_BYTE 8
+#define VSI_UNREFERENCED( param ) ( ( void ) ( param ) )
+
#define VSI_NN_STRINGIZE(X) VSI_NN_DO_STRINGIZE(X)
#define VSI_NN_DO_STRINGIZE(X) #X
diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h
index 75e5ab7e1..777cf5c04 100644
--- a/src/tim/vx/internal/include/vsi_nn_context.h
+++ b/src/tim/vx/internal/include/vsi_nn_context.h
@@ -78,6 +78,7 @@ typedef struct _vsi_nn_runtime_option_t
int32_t enable_asymi8_to_u8;
int32_t enable_dataconvert_optimize;
int32_t enable_stream_processor;
+ int32_t enable_rgb88_planar_nhwc;
} vsi_nn_runtime_option_t;
/**
diff --git a/src/tim/vx/internal/include/vsi_nn_error.h b/src/tim/vx/internal/include/vsi_nn_error.h
index 7b55aa507..bc9eca8b6 100644
--- a/src/tim/vx/internal/include/vsi_nn_error.h
+++ b/src/tim/vx/internal/include/vsi_nn_error.h
@@ -31,33 +31,42 @@
#define VSI_ASSERT( cond ) assert(cond)
#define VSI_CHECK_PTR( pointer, msg, retval ) \
- do { \
+ { \
if( pointer == NULL ) { \
VSILOGD("%s",msg); \
VSI_ASSERT(FALSE); \
} \
- } while(0)
+ }
-#define CHECK_STATUS_FAIL_GOTO( stat, lbl ) do {\
+#define CHECK_STATUS_FAIL_GOTO( stat, lbl ) {\
if( VSI_SUCCESS != stat ) {\
VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\
goto lbl;\
}\
-} while(0)
+}
-#define CHECK_STATUS( stat ) do {\
+#define CHECK_STATUS( stat ) {\
if( VSI_SUCCESS != stat ) {\
VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\
}\
-} while(0)
+}
#define CHECK_PTR_FAIL_GOTO( pointer, msg, lbl ) \
- do { \
+ { \
if( pointer == NULL ) { \
VSILOGD("CHECK POINTER %s", msg); \
goto lbl; \
} \
- } while(0)
+ }
+
+#define CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( pointer, node, msg, lbl ) \
+ { \
+ if( pointer == NULL ) { \
+ vsi_nn_internal_release_node(&node);\
+ VSILOGD("CHECK POINTER %s", msg); \
+ goto lbl; \
+ } \
+ }
#endif
diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h
index 01ec04c29..e93d1af19 100644
--- a/src/tim/vx/internal/include/vsi_nn_feature_config.h
+++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h
@@ -1,26 +1,3 @@
-/****************************************************************************
-*
-* Copyright (c) 2019 Vivante Corporation
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the Software),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-* DEALINGS IN THE SOFTWARE.
-*
-*****************************************************************************/
/*****Auto generated header file, Please DO NOT modify manually!*****/
#ifndef _VSI_NN_FEATURE_CONFIG_H
#define _VSI_NN_FEATURE_CONFIG_H
@@ -42,5 +19,6 @@
#if defined(VX_TENSORVIEW_ON_ANY_DIM) && VX_TENSORVIEW_ON_ANY_DIM
#define VSI_CONCAT_ENHANCE_SUPPORT
#endif
+#define VSI_CREATE_TENSOR_FROM_VIEW_SUPPORT
#endif
diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h
index 175687096..8504791f8 100644
--- a/src/tim/vx/internal/include/vsi_nn_graph.h
+++ b/src/tim/vx/internal/include/vsi_nn_graph.h
@@ -361,6 +361,27 @@ OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle
uint8_t * data
);
+/**
+ * Add a new tensor from view
+ * Create a new tensor from a view and add it to the graph.
+ *
+ * @param[in] graph Graph handle.
+ * @param[in] id Required, the id of the parent tensor on which to create view.
+ * @param[in] start The start coordinates for each dim, 0-based non-negative integers.
+ *                  NULL means copy from index 0 of each dim.
+ * @param[in] end The end coordinates for each dim, 0-based non-negative integers.
+ *                  NULL means copy to the end of each dim. For a given idx, end[idx]
+ *                  must be greater than start[idx].
+ * @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise.
+ */
+OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromView
+ (
+ vsi_nn_graph_t* graph,
+ vsi_nn_tensor_id_t id,
+ vsi_size_t* start,
+ vsi_size_t* end
+ );
+
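A minimal usage sketch for the view API, assuming an existing graph handle and a 32x32 2-D parent tensor id (all names and sizes are illustrative):

/* Illustrative only: take the top-left 16x16 view of the parent tensor. */
vsi_size_t start[2] = { 0, 0 };
vsi_size_t end[2]   = { 16, 16 };
vsi_nn_tensor_id_t view_id = vsi_nn_AddTensorFromView( graph, parent_id, start, end );
if ( VSI_NN_TENSOR_ID_NA == view_id )
{
    VSILOGE( "vsi_nn_AddTensorFromView failed." );
}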
/**
* Attach tensor to graph
* Attach an exist tensor to graph.
diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h
index 37032f473..5cadddb3e 100644
--- a/src/tim/vx/internal/include/vsi_nn_node_type.h
+++ b/src/tim/vx/internal/include/vsi_nn_node_type.h
@@ -206,8 +206,8 @@
#include "ops/vsi_nn_op_maxunpool.h"
#include "ops/vsi_nn_op_reversesequence.h"
#include "ops/vsi_nn_op_grid_sample.h"
-#include "ops/vsi_nn_op_bilinear_grid_sample.h"
#include "ops/vsi_nn_op_lpnorm.h"
+#include "ops/vsi_nn_op_resize_3d.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"
@@ -402,8 +402,8 @@ typedef union _vsi_nn_nn_param
vsi_nn_reversesequence_param reversesequence;
vsi_nn_inverse_sigmoid_param inverse_sigmoid;
vsi_nn_grid_sample_param gridsample;
- vsi_nn_bilinear_grid_sample_param bilinear_grid_sample;
vsi_nn_lpnorm_param lpnorm;
+ vsi_nn_resize_3d_param resize_3d;
void* client_param;
/* custom node data struct define */
diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h
index 227b17f3a..59292cd0d 100644
--- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h
+++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h
@@ -48,6 +48,7 @@ typedef enum
VSI_NN_PREPROCESS_IMAGE_RESIZE_BILINEAR,
VSI_NN_PREPROCESS_IMAGE_RESIZE_NEAREST,
VSI_NN_PREPROCESS_DTYPE_CONVERT,
+ VSI_NN_PREPROCESS_MEANS_AND_SCALES,
} vsi_nn_preprocess_type_e;
/**
@@ -150,8 +151,25 @@ typedef struct
float scale;
}vsi_nn_process_mean_and_scale_t;
+/**
+ * Process per-channel means and scales parameter structure
+ */
+typedef struct
+{
+ /** Mean value for each channel */
+ float* channel_mean;
+  /** Channel length */
+ int32_t channel_len;
+ /** Scale value */
+ float* scale;
+ /** Scale length */
+ int32_t scale_len;
+}vsi_nn_process_means_and_scales_t;
+
typedef vsi_nn_process_mean_and_scale_t vsi_nn_preprocess_mean_and_scale_t;
+typedef vsi_nn_process_means_and_scales_t vsi_nn_preprocess_means_and_scales_t;
typedef vsi_nn_process_mean_and_scale_t vsi_nn_postprocess_mean_and_scale_t;
+typedef vsi_nn_process_means_and_scales_t vsi_nn_postprocess_means_and_scales_t;
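A minimal sketch of filling the new per-channel structure for an RGB preprocess (the mean and scale values are arbitrary assumptions):

/* Illustrative only: per-channel mean and scale for a 3-channel input. */
float means[3]  = { 123.68f, 116.78f, 103.94f };
float scales[3] = { 0.017f, 0.017f, 0.017f };
vsi_nn_preprocess_means_and_scales_t param;
param.channel_mean = means;
param.channel_len  = 3;
param.scale        = scales;
param.scale_len    = 3;
/* param would then be passed as a VSI_NN_PREPROCESS_MEANS_AND_SCALES step; the attach call is not shown here. */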
/**
* Process permute parameter structure
diff --git a/src/tim/vx/internal/include/vsi_nn_rnn_helper.h b/src/tim/vx/internal/include/vsi_nn_rnn_helper.h
index 4bef7b942..14f359338 100644
--- a/src/tim/vx/internal/include/vsi_nn_rnn_helper.h
+++ b/src/tim/vx/internal/include/vsi_nn_rnn_helper.h
@@ -154,7 +154,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major
vsi_bool use_virtual_tensor
);
-void vsi_nn_rnn_split_input_tensor
+vsi_status vsi_nn_rnn_split_input_tensor
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input,
@@ -163,7 +163,7 @@ void vsi_nn_rnn_split_input_tensor
vsi_bool use_virtual_tensor
);
-void vsi_nn_rnn_data_check_aligned
+vsi_status vsi_nn_rnn_data_check_aligned
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** input,
diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h
index 5b7bdb940..d6ed09045 100644
--- a/src/tim/vx/internal/include/vsi_nn_tensor.h
+++ b/src/tim/vx/internal/include/vsi_nn_tensor.h
@@ -82,6 +82,10 @@ typedef enum
VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC = 0x4,
/** affine perchannel asymmetric */
VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC = 0x5,
+ /** float8 */
+ VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6,
+ /** perchannel float8 */
+ VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7,
/** undefined type */
VSI_NN_QNT_TYPE_NA = 0xff,
} vsi_nn_qnt_type_e;
diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h
index 4b997f319..14bb0d62b 100644
--- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h
+++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h
@@ -734,13 +734,15 @@ vsi_status vsi_nn_copy_tensor_veiw_patch
/**
* OVXLIB internal tensor util api
* A wrapper api for OpenVX vxCopyTensorPatch
- * Allows the application to copy whole tensor patch from/into an tensor object.
+ * Allows the application to copy a partial or whole tensor patch from/into a tensor object.
*
* @param[in] tensor OpenVX Tensor handle.
* @param[in] attr OVXLIB Tensor attr.
* @param[in] user_ptr The address of the memory location where to store the requested data.
* @param[in] usage This declares the effect of the copy with regard to the tensor object
* support VX_READ_ONLY or VX_WRITE_ONLY
+ * @param[in] start The start coordinates for each dim. NULL means copy from index 0 of each dim.
+ * @param[in] end The end coordinates for each dim. NULL means copy to the end of each dim.
 * @return VSI_SUCCESS on success, or error code otherwise.
*/
vsi_status vsi_nn_copy_tensor_patch
@@ -748,7 +750,9 @@ vsi_status vsi_nn_copy_tensor_patch
vx_tensor tensor,
vsi_nn_tensor_attr_t *attr,
void * user_ptr,
- vsi_enum usage
+ vsi_enum usage,
+ vsi_size_t* start,
+ vsi_size_t* end
);
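A minimal sketch of a partial read using the new start/end arguments, assuming a 2-D tensor whose vx handle and ovxlib attr are already available:

/* Illustrative only: read the 2x2 patch whose origin is (1,1) into user memory. */
vsi_size_t start[2] = { 1, 1 };
vsi_size_t end[2]   = { 3, 3 };
float patch[4] = { 0 };
vsi_status status = vsi_nn_copy_tensor_patch( tensor, &attr, patch, VX_READ_ONLY, start, end );
if ( VSI_SUCCESS != status )
{
    VSILOGE( "Copy tensor patch failed." );
}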
/**
diff --git a/src/tim/vx/internal/include/vsi_nn_test.h b/src/tim/vx/internal/include/vsi_nn_test.h
index 8f5df6e6a..59bafe198 100644
--- a/src/tim/vx/internal/include/vsi_nn_test.h
+++ b/src/tim/vx/internal/include/vsi_nn_test.h
@@ -31,26 +31,26 @@
extern "C"{
#endif
-#define TEST_CHECK_TENSOR_ID( id, lbl ) do {\
+#define TEST_CHECK_TENSOR_ID( id, lbl ) {\
if( VSI_NN_TENSOR_ID_NA == id ) {\
VSILOGE("CHECK TENSOR ID %d", __LINE__);\
goto lbl;\
}\
- } while(0)
+ }
-#define TEST_CHECK_PTR( ptr, lbl ) do {\
+#define TEST_CHECK_PTR( ptr, lbl ) {\
if( NULL == ptr ) {\
VSILOGE("CHECK PTR %d", __LINE__);\
goto lbl;\
}\
-} while(0)
+}
-#define TEST_CHECK_STATUS( stat, lbl ) do {\
+#define TEST_CHECK_STATUS( stat, lbl ) {\
if( VSI_SUCCESS != stat ) {\
VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\
goto lbl;\
}\
-} while(0)
+}
#if defined(__cplusplus)
}
diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h
index 6238e4f2d..380057b94 100644
--- a/src/tim/vx/internal/include/vsi_nn_types.h
+++ b/src/tim/vx/internal/include/vsi_nn_types.h
@@ -191,6 +191,16 @@ typedef enum
VSI_NN_TYPE_BFLOAT16 = VX_TYPE_BFLOAT16,
#else
VSI_NN_TYPE_BFLOAT16 = 0x81A,
+#endif
+#ifdef VSI_NN_TYPE_FLOAT8_E4M3_SUPPORT
+ VSI_NN_TYPE_FLOAT8_E4M3 = VX_TYPE_FLOAT8_E4M3,
+#else
+    VSI_NN_TYPE_FLOAT8_E4M3 = 0x81E,
+#endif
+#ifdef VSI_NN_TYPE_FLOAT8_E5M2_SUPPORT
+ VSI_NN_TYPE_FLOAT8_E5M2 = VX_TYPE_FLOAT8_E5M2,
+#else
+    VSI_NN_TYPE_FLOAT8_E5M2 = 0x81F,
#endif
VSI_NN_TYPE_VDATA = VX_TYPE_USER_STRUCT_START + 0x1,
@@ -268,6 +278,11 @@ typedef enum _vsi_nn_roi_align_type_e
VSI_NN_ROI_ALIGN
} vsi_nn_roi_align_type_e;
+typedef enum _vsi_nn_custom_warp_affine_type_e {
+ VSI_NN_WARP_AFFINE_TYPE_NONE = 0,
+ VSI_NN_WARP_AFFINE_TYPE_RGB
+} vsi_nn_custom_warp_affine_type_e;
+
/** Deprecated */
typedef uint32_t vsi_nn_size_t;
diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h
index 280f0cc4c..399e72e01 100644
--- a/src/tim/vx/internal/include/vsi_nn_version.h
+++ b/src/tim/vx/internal/include/vsi_nn_version.h
@@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
-#define VSI_NN_VERSION_PATCH 74
+#define VSI_NN_VERSION_PATCH 84
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
diff --git a/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess.c b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess.c
new file mode 100644
index 000000000..6d6ceb98c
--- /dev/null
+++ b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess.c
@@ -0,0 +1,578 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_error.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "vsi_nn_internal_node.h"
+#include "utils/vsi_nn_constraint_check.h"
+
+typedef struct _custom_tiny_yolov4_postprocess_local_data_t {
+ vx_int32 begin_dims[6][VSI_NN_MAX_DIM_NUM];
+ vx_int32 end_dims[6][VSI_NN_MAX_DIM_NUM];
+ vx_int32 stride_dims[VSI_NN_MAX_DIM_NUM];
+} custom_tiny_yolov4_postprocess_local_data_t;
+
+/*
+ Declare number of input and output.
+ */
+#define _INPUT_NUM (4)
+#define _OUTPUT_NUM (2)
+
+static vsi_nn_internal_tensor_t *_create_internal_tensor
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t * input
+ )
+{
+ vsi_nn_tensor_attr_t attr;
+ vsi_nn_internal_tensor_t * tensor = NULL;
+
+ memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
+ memcpy( &attr.dtype, &input->attr.dtype, sizeof( attr.dtype ) );
+ attr.dim_num = VSI_NN_DIM_AUTO;
+ attr.vtl = TRUE;
+ attr.is_const = FALSE;
+ tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+
+ return tensor;
+} /* _create_internal_tensor() */
+
+static vsi_nn_internal_tensor_t *_create_sigmoid_internal_tensor
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t * input
+ )
+{
+ vsi_nn_tensor_attr_t attr;
+ vsi_nn_internal_tensor_t * tensor = NULL;
+
+ memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
+ memcpy( &attr.dtype, &input->attr.dtype, sizeof( attr.dtype ) );
+ if (attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ||
+ attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC)
+ {
+ attr.dtype.scale = 0.00390625;
+ attr.dtype.zero_point = 0;
+ }
+ attr.dim_num = VSI_NN_DIM_AUTO;
+ attr.vtl = TRUE;
+ attr.is_const = FALSE;
+ tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+
+ return tensor;
+} /* _create_sigmoid_internal_tensor() */
+
+static vsi_nn_internal_tensor_t *_create_output_internal_tensor
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t * output
+ )
+{
+ vsi_nn_tensor_attr_t attr;
+ vsi_nn_internal_tensor_t * tensor = NULL;
+
+ memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
+ memcpy( &attr.dtype, &output->attr.dtype, sizeof( attr.dtype ) );
+ attr.dim_num = VSI_NN_DIM_AUTO;
+ attr.vtl = TRUE;
+ attr.is_const = FALSE;
+ tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+
+ return tensor;
+} /* _create_output_internal_tensor() */
+
+static vsi_nn_internal_tensor_t *_create_strided_slice_op
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t * input,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t index
+ )
+{
+ vsi_nn_custom_tiny_yolov4_postprocess_param * p = NULL;
+ vsi_nn_internal_tensor_t * tensor = NULL;
+ vsi_nn_internal_node_t* curr = NULL;
+ p = (vsi_nn_custom_tiny_yolov4_postprocess_param *)&(self->nn_param.custom_tiny_yolov4_postprocess);
+
+ tensor = _create_internal_tensor(self, input);
+ CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
+ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+ curr->node->nn_param.strided_slice.begin_dims = p->local->begin_dims[index];
+ curr->node->nn_param.strided_slice.begin_dims_num = input->attr.dim_num;
+ curr->node->nn_param.strided_slice.end_dims = p->local->end_dims[index];
+ curr->node->nn_param.strided_slice.end_dims_num = input->attr.dim_num;
+ curr->node->nn_param.strided_slice.stride_dims = p->local->stride_dims;
+ curr->node->nn_param.strided_slice.stride_dims_num = input->attr.dim_num;
+ curr->node->nn_param.strided_slice.begin_mask = begin_mask;
+ curr->node->nn_param.strided_slice.end_mask = end_mask;
+ curr->node->nn_param.strided_slice.shrink_axis_mask = 0;
+ curr->node->nn_param.strided_slice.new_axis_mask = 0;
+ curr->inputs[0] = input;
+ curr->outputs[0] = tensor->t;
+ vsi_nn_internal_setup_node( self, curr );
+
+final:
+ return tensor;
+} /* _create_strided_slice_op() */
+
+static vsi_nn_internal_tensor_t *_create_sigmoid_op
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t * input
+ )
+{
+ vsi_nn_internal_tensor_t * tensor = NULL;
+ vsi_nn_internal_node_t* curr = NULL;
+
+ tensor = _create_sigmoid_internal_tensor(self, input);
+ CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
+
+ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SIGMOID, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+ curr->inputs[0] = input;
+ curr->outputs[0] = tensor->t;
+ vsi_nn_internal_setup_node( self, curr );
+
+final:
+ return tensor;
+} /* _create_sigmoid_op() */
+
+static vsi_nn_internal_tensor_t *_create_confidence_op
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t * input,
+ vsi_nn_tensor_t * output
+ )
+{
+ vsi_nn_internal_tensor_t * tensor = NULL;
+ vsi_nn_internal_node_t* curr = NULL;
+
+ tensor = _create_output_internal_tensor(self, output);
+ CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
+
+ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+ curr->inputs[0] = input;
+ curr->outputs[0] = tensor->t;
+ vsi_nn_internal_setup_node( self, curr );
+
+final:
+ return tensor;
+} /* _create_confidence_op() */
+
+static vsi_nn_internal_tensor_t *_create_box_op
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t * input0,
+ vsi_nn_tensor_t * input1,
+ vsi_nn_tensor_t * output,
+ float bias0,
+ float bias1
+ )
+{
+ vsi_nn_internal_tensor_t * tensor = NULL;
+ vsi_nn_internal_node_t* curr = NULL;
+
+ tensor = _create_output_internal_tensor(self, output);
+ CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
+
+ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+ curr->inputs[0] = input0;
+ curr->inputs[1] = input1;
+ curr->outputs[0] = tensor->t;
+ curr->node->nn_param.custom_tiny_yolov4_postprocess_box.bias_0 = bias0;
+ curr->node->nn_param.custom_tiny_yolov4_postprocess_box.bias_1 = bias1;
+ vsi_nn_internal_setup_node( self, curr );
+
+final:
+ return tensor;
+} /* _create_box_op() */
+
+static vsi_nn_internal_tensor_t *_create_reshape_op
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t * input,
+ vsi_nn_tensor_t * output,
+ vsi_size_t width
+ )
+{
+ vsi_nn_internal_tensor_t * tensor = NULL;
+ vsi_nn_internal_node_t* curr = NULL;
+ vsi_size_t shape_1[] = { 1, (vsi_size_t)-1, 1 };
+
+ shape_1[0] = width;
+
+ tensor = _create_output_internal_tensor(self, output);
+ CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
+
+ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+ curr->inputs[0] = input;
+ curr->outputs[0] = tensor->t;
+ curr->node->nn_param.reshape2.size = shape_1;
+ curr->node->nn_param.reshape2.dim_num = 3;
+ vsi_nn_internal_setup_node( self, curr );
+
+final:
+ return tensor;
+} /* _create_reshape_op() */
+
+static vsi_bool _create_concat_op
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t * input0,
+ vsi_nn_tensor_t * input1,
+ vsi_nn_tensor_t * input2,
+ vsi_nn_tensor_t * input3,
+ vsi_nn_tensor_t * input4,
+ vsi_nn_tensor_t * input5,
+ vsi_nn_tensor_t * output
+ )
+{
+ vsi_nn_internal_node_t* curr = NULL;
+ vsi_bool ret = FALSE;
+
+ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 6, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+ curr->inputs[0] = input0;
+ curr->inputs[1] = input1;
+ curr->inputs[2] = input2;
+ curr->inputs[3] = input3;
+ curr->inputs[4] = input4;
+ curr->inputs[5] = input5;
+ curr->outputs[0] = output;
+ curr->node->nn_param.concat.axis = 1;
+ ret = vsi_nn_internal_setup_node( self, curr );
+
+final:
+ return ret;
+} /* _create_concat_op() */
+
+static vsi_status op_compute
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+ return vsi_nn_internal_compute_node( self );
+} /* op_compute() */
+
+static vsi_bool op_check
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ BEGIN_IO_TYPE_DECL(CUSTOM_TINY_YOLOV4_POSTPROCESS, 4, 2)
+ IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
+ END_IO_TYPE_DECL(CUSTOM_TINY_YOLOV4_POSTPROCESS)
+ if (!VALIDATE_OP_IO_TYPES(CUSTOM_TINY_YOLOV4_POSTPROCESS, self, inputs,
+ self->input.num, outputs, self->output.num))
+ {
+ char* desc = generate_op_io_types_desc(inputs,
+ self->input.num, outputs, self->output.num);
+ VSILOGE("Inputs/Outputs data type not support: %s", desc);
+ destroy_op_io_types_desc(desc);
+ return FALSE;
+ }
+
+ return TRUE;
+} /* op_check() */
+
+static vsi_status op_optimize
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs,
+ vsi_nn_opt_direction_e direction
+ )
+{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+ return vsi_nn_internal_optimize_node( self, direction );
+}
+
+static vsi_bool op_setup
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ vsi_bool ret = FALSE;
+ vsi_nn_internal_tensor_t * tensor0[12] = {NULL};
+ vsi_nn_internal_tensor_t * tensor1[12] = {NULL};
+ int32_t index_0 = 1;
+ int32_t index_1 = 0;
+ int32_t index_2 = 3;
+ int32_t index_3 = 2;
+
+ vsi_nn_internal_init_node_wksp( self );
+
+ /**confidence**/
+ /**input 0 chunk 0**/
+ /*
+ sub0:26x26x255 --> 26x26x81, begin: [0, 0, 4, 0] end: [0, 0, 85, 0] stride: [1, 1, 1, 1]
+ sub1[26, 26, 80] = sigmoid(sub0)[26, 26, 0:0] * sigmoid(sub0)[26, 26, 1:81]
+ sub2[80, 26, 26] = transpose(sub1)
+ sub3[80, 676] = reshape(sub2)
+ */
+ tensor0[0] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 0);
+ CHECK_PTR_FAIL_GOTO( tensor0[0], "Create internal tensor fail.", final );
+ tensor0[1] = _create_sigmoid_op(self, tensor0[0]->t);
+ CHECK_PTR_FAIL_GOTO( tensor0[1], "Create internal tensor fail.", final );
+ tensor0[2] = _create_confidence_op(self, tensor0[1]->t, outputs[0]);
+ CHECK_PTR_FAIL_GOTO( tensor0[2], "Create internal tensor fail.", final );
+ tensor0[3] = _create_reshape_op(self, tensor0[2]->t, outputs[0], 80);
+ CHECK_PTR_FAIL_GOTO( tensor0[3], "Create internal tensor fail.", final );
+ /**chunk 1**/
+ /*
+ 26x26x255 --> 26x26x81, begin: [0, 0, 89, 0] end: [0, 0, 170, 0] stride: [1, 1, 1, 1]
+ */
+ tensor0[4] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 1);
+ CHECK_PTR_FAIL_GOTO( tensor0[4], "Create internal tensor fail.", final );
+ tensor0[5] = _create_sigmoid_op(self, tensor0[4]->t);
+ CHECK_PTR_FAIL_GOTO( tensor0[5], "Create internal tensor fail.", final );
+ tensor0[6] = _create_confidence_op(self, tensor0[5]->t, outputs[0]);
+ CHECK_PTR_FAIL_GOTO( tensor0[6], "Create internal tensor fail.", final );
+ tensor0[7] = _create_reshape_op(self, tensor0[6]->t, outputs[0], 80);
+ CHECK_PTR_FAIL_GOTO( tensor0[7], "Create internal tensor fail.", final );
+ /**chunk 2**/
+ /*
+ 26x26x255 --> 26x26x81, begin: [0, 0, 174, 0] end: [0, 0, 255, 0] stride: [1, 1, 1, 1]
+ */
+ tensor0[8] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 2);
+ CHECK_PTR_FAIL_GOTO( tensor0[8], "Create internal tensor fail.", final );
+ tensor0[9] = _create_sigmoid_op(self, tensor0[8]->t);
+ CHECK_PTR_FAIL_GOTO( tensor0[9], "Create internal tensor fail.", final );
+ tensor0[10] = _create_confidence_op(self, tensor0[9]->t, outputs[0]);
+ CHECK_PTR_FAIL_GOTO( tensor0[10], "Create internal tensor fail.", final );
+ tensor0[11] = _create_reshape_op(self, tensor0[10]->t, outputs[0], 80);
+ CHECK_PTR_FAIL_GOTO( tensor0[11], "Create internal tensor fail.", final );
+
+ /**input 1 chunk 0**/
+ /*
+        sub0:13x13x255 --> 13x13x81, begin: [0, 0, 4, 0] end: [0, 0, 85, 0] stride: [1, 1, 1, 1]
+ sub1[13, 13, 80] = sigmoid(sub0)[13, 13, 0:0] * sigmoid(sub0)[13, 13, 1:81]
+ sub2[80, 13, 13] = transpose(sub1)
+ sub3[80, 169] = reshape(sub2)
+ */
+ tensor1[0] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 0);
+ CHECK_PTR_FAIL_GOTO( tensor1[0], "Create internal tensor fail.", final );
+ tensor1[1] = _create_sigmoid_op(self, tensor1[0]->t);
+ CHECK_PTR_FAIL_GOTO( tensor1[1], "Create internal tensor fail.", final );
+ tensor1[2] = _create_confidence_op(self, tensor1[1]->t, outputs[0]);
+ CHECK_PTR_FAIL_GOTO( tensor1[2], "Create internal tensor fail.", final );
+ tensor1[3] = _create_reshape_op(self, tensor1[2]->t, outputs[0], 80);
+ CHECK_PTR_FAIL_GOTO( tensor1[3], "Create internal tensor fail.", final );
+ /**chunk 1**/
+ /*
+ 13x13x255 --> 13x13x81, begin: [0, 0, 89, 0] end: [0, 0, 170, 0] stride: [1, 1, 1, 1]
+ */
+ tensor1[4] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 1);
+ CHECK_PTR_FAIL_GOTO( tensor1[4], "Create internal tensor fail.", final );
+ tensor1[5] = _create_sigmoid_op(self, tensor1[4]->t);
+ CHECK_PTR_FAIL_GOTO( tensor1[5], "Create internal tensor fail.", final );
+ tensor1[6] = _create_confidence_op(self, tensor1[5]->t, outputs[0]);
+ CHECK_PTR_FAIL_GOTO( tensor1[6], "Create internal tensor fail.", final );
+ tensor1[7] = _create_reshape_op(self, tensor1[6]->t, outputs[0], 80);
+ CHECK_PTR_FAIL_GOTO( tensor1[7], "Create internal tensor fail.", final );
+ /**chunk 2**/
+ /*
+ 13x13x255 --> 13x13x81, begin: [0, 0, 174, 0] end: [0, 0, 255, 0] stride: [1, 1, 1, 1]
+ */
+ tensor1[8] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 2);
+ CHECK_PTR_FAIL_GOTO( tensor1[8], "Create internal tensor fail.", final );
+ tensor1[9] = _create_sigmoid_op(self, tensor1[8]->t);
+ CHECK_PTR_FAIL_GOTO( tensor1[9], "Create internal tensor fail.", final );
+ tensor1[10] = _create_confidence_op(self, tensor1[9]->t, outputs[0]);
+ CHECK_PTR_FAIL_GOTO( tensor1[10], "Create internal tensor fail.", final );
+ tensor1[11] = _create_reshape_op(self, tensor1[10]->t, outputs[0], 80);
+ CHECK_PTR_FAIL_GOTO( tensor1[11], "Create internal tensor fail.", final );
+
+ ret = _create_concat_op(self, tensor0[3]->t, tensor0[7]->t, tensor0[11]->t,
+ tensor1[3]->t, tensor1[7]->t, tensor1[11]->t, outputs[0]);
+ if (ret == FALSE)
+ {
+ VSILOGE("Create concat operation fail");
+ goto final;
+ }
+
+ ret = FALSE;
+ /**box**/
+ /*
+ 26x26x255 --> 26x26x4, begin: [0, 0, 0, 0] end: [0, 0, 4, 0] stride: [1, 1, 1, 1]
+ */
+ tensor0[0] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 3);
+ CHECK_PTR_FAIL_GOTO( tensor0[0], "Create internal tensor fail.", final );
+ tensor0[1] = _create_box_op(self, tensor0[0]->t, inputs[index_2], outputs[1], 23, 27);
+ CHECK_PTR_FAIL_GOTO( tensor0[1], "Create internal tensor fail.", final );
+ tensor0[2] = _create_reshape_op(self, tensor0[1]->t, outputs[1], 4);
+ CHECK_PTR_FAIL_GOTO( tensor0[2], "Create internal tensor fail.", final );
+ /*
+ 26x26x255 --> 26x26x4, begin: [0, 0, 85, 0] end: [0, 0, 89, 0] stride: [1, 1, 1, 1]
+ */
+ tensor0[3] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 4);
+ CHECK_PTR_FAIL_GOTO( tensor0[3], "Create internal tensor fail.", final );
+ tensor0[4] = _create_box_op(self, tensor0[3]->t, inputs[index_2], outputs[1], 37, 58);
+ CHECK_PTR_FAIL_GOTO( tensor0[4], "Create internal tensor fail.", final );
+ tensor0[5] = _create_reshape_op(self, tensor0[4]->t, outputs[1], 4);
+ CHECK_PTR_FAIL_GOTO( tensor0[5], "Create internal tensor fail.", final );
+ /*
+        26x26x255 --> 26x26x4, begin: [0, 0, 170, 0] end: [0, 0, 174, 0] stride: [1, 1, 1, 1]
+ */
+ tensor0[6] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 5);
+ CHECK_PTR_FAIL_GOTO( tensor0[6], "Create internal tensor fail.", final );
+ tensor0[7] = _create_box_op(self, tensor0[6]->t, inputs[index_2], outputs[1], 81, 82);
+ CHECK_PTR_FAIL_GOTO( tensor0[7], "Create internal tensor fail.", final );
+ tensor0[8] = _create_reshape_op(self, tensor0[7]->t, outputs[1], 4);
+ CHECK_PTR_FAIL_GOTO( tensor0[8], "Create internal tensor fail.", final );
+
+ /*
+ 13x13x255 --> 13x13x4, begin: [0, 0, 0, 0] end: [0, 0, 4, 0] stride: [1, 1, 1, 1]
+ */
+ tensor1[0] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 3);
+ CHECK_PTR_FAIL_GOTO( tensor1[0], "Create internal tensor fail.", final );
+ tensor1[1] = _create_box_op(self, tensor1[0]->t, inputs[index_3], outputs[1], 81, 82);
+ CHECK_PTR_FAIL_GOTO( tensor1[1], "Create internal tensor fail.", final );
+ tensor1[2] = _create_reshape_op(self, tensor1[1]->t, outputs[1], 4);
+ CHECK_PTR_FAIL_GOTO( tensor1[2], "Create internal tensor fail.", final );
+ /*
+ 13x13x255 --> 13x13x4, begin: [0, 0, 85, 0] end: [0, 0, 89, 0] stride: [1, 1, 1, 1]
+ */
+ tensor1[3] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 4);
+ CHECK_PTR_FAIL_GOTO( tensor1[3], "Create internal tensor fail.", final );
+ tensor1[4] = _create_box_op(self, tensor1[3]->t, inputs[index_3], outputs[1], 135, 169);
+ CHECK_PTR_FAIL_GOTO( tensor1[4], "Create internal tensor fail.", final );
+ tensor1[5] = _create_reshape_op(self, tensor1[4]->t, outputs[1], 4);
+ CHECK_PTR_FAIL_GOTO( tensor1[5], "Create internal tensor fail.", final );
+ /*
+ 13x13x255 --> 13x13x4, begin: [0, 0, 170, 0] end: [0, 0, 174, 0] stride: [1, 1, 1, 1]
+ */
+ tensor1[6] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 5);
+ CHECK_PTR_FAIL_GOTO( tensor1[6], "Create internal tensor fail.", final );
+ tensor1[7] = _create_box_op(self, tensor1[6]->t, inputs[index_3], outputs[1], 344, 319);
+ CHECK_PTR_FAIL_GOTO( tensor1[7], "Create internal tensor fail.", final );
+ tensor1[8] = _create_reshape_op(self, tensor1[7]->t, outputs[1], 4);
+ CHECK_PTR_FAIL_GOTO( tensor1[8], "Create internal tensor fail.", final );
+
+ ret = _create_concat_op(self, tensor0[2]->t, tensor0[5]->t, tensor0[8]->t,
+ tensor1[2]->t, tensor1[5]->t, tensor1[8]->t, outputs[1]);
+ if (ret == FALSE)
+ {
+ VSILOGE("Create concat operation fail");
+ goto final;
+ }
+
+final:
+ return ret;
+} /* op_setup() */
+
+static vsi_status op_init
+ (
+ vsi_nn_node_t* self
+ )
+{
+    vsi_status status = VSI_FAILURE;
+    int32_t i = 0;
+ vsi_nn_custom_tiny_yolov4_postprocess_param *p = &self->nn_param.custom_tiny_yolov4_postprocess;
+ p->local = \
+ (custom_tiny_yolov4_postprocess_local_data_t*)malloc(sizeof(custom_tiny_yolov4_postprocess_local_data_t));
+ CHECK_PTR_FAIL_GOTO(p->local, "create buffer fail", final);
+ memset(p->local, 0, sizeof(custom_tiny_yolov4_postprocess_local_data_t));
+ for ( i = 0; i < VSI_NN_MAX_DIM_NUM; i++ )
+ {
+ p->local->stride_dims[i] = 1;
+ }
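+    /* Channel layout of the 255-channel inputs: confidence slices at [4,85), [89,170), [174,255); box slices at [0,4), [85,89), [170,174). */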
+ p->local->begin_dims[0][2] = 4;
+ p->local->end_dims[0][2] = 85;
+
+ p->local->begin_dims[1][2] = 89;
+ p->local->end_dims[1][2] = 170;
+
+ p->local->begin_dims[2][2] = 174;
+ p->local->end_dims[2][2] = 255;
+
+ p->local->begin_dims[3][2] = 0;
+ p->local->end_dims[3][2] = 4;
+
+ p->local->begin_dims[4][2] = 85;
+ p->local->end_dims[4][2] = 89;
+
+ p->local->begin_dims[5][2] = 170;
+ p->local->end_dims[5][2] = 174;
+    status = VSI_SUCCESS;
+final:
+    return status;
+} /* op_init() */
+
+static vsi_status op_deinit
+ (
+ vsi_nn_node_t* self
+ )
+{
+ vsi_status status = VSI_SUCCESS;
+
+ status = vsi_nn_op_common_deinit(self);
+
+ vsi_nn_safe_free(self->nn_param.custom_tiny_yolov4_postprocess.local);
+ vsi_nn_internal_deinit_node_wksp( self );
+
+ return status;
+} /* op_deinit() */
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+ (
+ /* op_name */ CUSTOM_TINY_YOLOV4_POSTPROCESS,
+ /* init */ op_init,
+ /* compute */ op_compute,
+ /* deinit */ op_deinit,
+ /* check */ op_check,
+ /* setup */ op_setup,
+ /* optimize */ op_optimize,
+ /* input_num */ _INPUT_NUM,
+ /* output_num */ _OUTPUT_NUM
+ );
+
+__END_DECLS
+
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_box.c
similarity index 59%
rename from src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c
rename to src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_box.c
index c664a3c16..a05ca3f42 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c
+++ b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_box.c
@@ -35,9 +35,9 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-typedef struct _bilinear_grid_sample_local_data_t {
+typedef struct _custom_tiny_yolov4_postprocess_box_local_data_t {
int32_t placeholder;
-} bilinear_grid_sample_local_data_t;
+} custom_tiny_yolov4_postprocess_box_local_data_t;
/*
Declare number of input and output.
@@ -53,27 +53,25 @@ static vsi_status op_compute
)
{
vsi_status status = VSI_FAILURE;
-
- vsi_nn_kernel_param_t* param = NULL;
- int32_t align_corners = self->nn_param.bilinear_grid_sample.align_corners;
- vsi_nn_kernel_node_t n;
+ vsi_nn_kernel_param_t * param = NULL;
+ float bias_0 = self->nn_param.custom_tiny_yolov4_postprocess_box.bias_0;
+ float bias_1 = self->nn_param.custom_tiny_yolov4_postprocess_box.bias_1;
param = vsi_nn_kernel_param_create();
- vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners);
- n = vsi_nn_kernel_selector(
- self->graph, "bilinear_grid_sample", inputs, 2, outputs, 1, param);
- if (n == NULL) {
-        vsi_nn_kernel_param_release(&param);
- status = VSI_FAILURE;
- return status;
- }
- self->n = (vx_node)n;
-    vsi_nn_kernel_param_release(&param);
- if (self->n) {
+ vsi_nn_kernel_param_add_float32( param, "bias_0", bias_0 );
+ vsi_nn_kernel_param_add_float32( param, "bias_1", bias_1 );
+
+ self->n = vsi_nn_kernel_selector( self->graph, "tiny_yolov4_postprocess_box",
+ inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
+
+ if ( self->n )
+ {
status = VSI_SUCCESS;
}
+    vsi_nn_kernel_param_release( &param );
+
return status;
} /* op_compute() */
@@ -85,6 +83,9 @@ static vsi_bool op_check
)
{
/*TODO: Check tensor shapes. */
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
@@ -95,61 +96,36 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- if (NULL == self) {
- return FALSE;
- }
-
- if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) {
- outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
- outputs[0]->attr.size[0] = inputs[1]->attr.size[1];
- outputs[0]->attr.size[1] = inputs[1]->attr.size[2];
- outputs[0]->attr.size[2] = inputs[0]->attr.size[2];
- if (4 == inputs[0]->attr.dim_num) {
- outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
+ uint32_t rank = inputs[0]->attr.dim_num;
+ vsi_bool ret = TRUE;
+
+ VSI_UNREFERENCED(self);
+
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ {
+ outputs[0]->attr.dim_num = rank;
+ outputs[0]->attr.size[0] = inputs[0]->attr.size[2];
+ outputs[0]->attr.size[1] = inputs[0]->attr.size[0];
+ outputs[0]->attr.size[2] = inputs[0]->attr.size[1];
+ if (rank > 3)
+ {
+ memcpy( &outputs[0]->attr.size[3], &inputs[0]->attr.size[3], (rank - 3) * sizeof(vsi_size_t) );
}
}
- return TRUE;
+ return ret;
} /* op_setup() */
-static vsi_status op_init
- (
- vsi_nn_node_t* self
- )
-{
- /* TODO
- //self->nn_param.bilinear_grid_sample.local = \
- // (bilinear_grid_sample_local_data_t*)malloc(sizeof(bilinear_grid_sample_local_data_t));
- */
-
- return VSI_SUCCESS;
-} /* op_init() */
-
-static vsi_status op_deinit
- (
- vsi_nn_node_t* self
- )
-{
- vsi_status status = VSI_SUCCESS;
-
- status = vsi_nn_op_common_deinit(self);
-
- /* TODO
- //vsi_nn_safe_free(self->nn_param.bilinear_grid_sample.local);
- */
-
- return status;
-} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
- /* op_name */ BILINEAR_GRID_SAMPLE,
- /* init */ op_init,
+ /* op_name */ CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX,
+ /* init */ NULL,
/* compute */ op_compute,
- /* deinit */ op_deinit,
+ /* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
diff --git a/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_confidence.c b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_confidence.c
new file mode 100644
index 000000000..a9cf8b4a6
--- /dev/null
+++ b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_confidence.c
@@ -0,0 +1,127 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+typedef struct _tiny_yolov4_postprocess_confidence_local_data_t {
+ int32_t placeholder;
+} tiny_yolov4_postprocess_confidence_local_data_t;
+
+/*
+ Declare number of input and output.
+ */
+#define _INPUT_NUM (1)
+#define _OUTPUT_NUM (1)
+
+static vsi_status op_compute
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ vsi_status status = VSI_FAILURE;
+
+ self->n = vsi_nn_kernel_selector( self->graph, "tiny_yolov4_postprocess_confidence",
+ inputs, 1, outputs, 1, NULL );
+
+ if ( self->n )
+ {
+ status = VSI_SUCCESS;
+ }
+
+ return status;
+} /* op_compute() */
+
+static vsi_bool op_check
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ /*TODO: Check tensor shapes. */
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+ return TRUE;
+} /* op_check() */
+
+static vsi_bool op_setup
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ uint32_t rank = inputs[0]->attr.dim_num;
+ vsi_bool ret = TRUE;
+
+ VSI_UNREFERENCED(self);
+
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ {
+ outputs[0]->attr.dim_num = rank;
+ outputs[0]->attr.size[0] = inputs[0]->attr.size[2] - 1;
+ outputs[0]->attr.size[1] = inputs[0]->attr.size[0];
+ outputs[0]->attr.size[2] = inputs[0]->attr.size[1];
+ if (rank > 3)
+ {
+ memcpy( &outputs[0]->attr.size[3], &inputs[0]->attr.size[3], (rank - 3) * sizeof(vsi_size_t) );
+ }
+ }
+
+ return ret;
+} /* op_setup() */
+
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+ (
+ /* op_name */ CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE,
+ /* init */ NULL,
+ /* compute */ op_compute,
+ /* deinit */ vsi_nn_op_common_deinit,
+ /* check */ op_check,
+ /* setup */ op_setup,
+ /* optimize */ NULL,
+ /* input_num */ _INPUT_NUM,
+ /* output_num */ _OUTPUT_NUM
+ );
+
+__END_DECLS
+
diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c
index a1e50a481..8fc6d6ce0 100644
--- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c
+++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c
@@ -54,20 +54,26 @@ DEF_KERNEL_EXECUTOR(_softmax_compute)
size_t param_size
)
{
- vsi_status status = VX_SUCCESS;
+ vsi_status status = VSI_FAILURE;
float *buffer[_CPU_IO_NUM] = {NULL};
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *attr[_CPU_IO_NUM] = {NULL};
uint32_t i = 0, out_elements = 0;
int32_t axis;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+
tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // input0
tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // input1
tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // output
attr[0] = vsi_nn_kernel_tensor_attr_create(tensors[0]);
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create(tensors[1]);
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
attr[2] = vsi_nn_kernel_tensor_attr_create(tensors[2]);
+ CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
@@ -133,6 +139,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
}
@@ -153,6 +161,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
axis = vsi_nn_kernel_param_get_int32(params, "axis");
status = _query_kernel(inputs, outputs, kernel);
if(status != VSI_SUCCESS)
diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c
index ed1e14932..3fb62eb74 100644
--- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c
+++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c
@@ -54,7 +54,7 @@ DEF_KERNEL_EXECUTOR(_softmax_exec)
size_t param_size
)
{
- vsi_status status = VX_SUCCESS;
+ vsi_status status = VSI_FAILURE;
float* buffer[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL };
@@ -64,11 +64,16 @@ DEF_KERNEL_EXECUTOR(_softmax_exec)
float fMax = 0.0;
float fProbSum = 0.0f;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &sf_axis);
CHECK_STATUS_FAIL_GOTO(status, final );
@@ -141,6 +146,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
}
@@ -161,6 +168,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
axis = vsi_nn_kernel_param_get_int32(params, "axis");
status = _query_kernel( inputs, outputs, kernel );
diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c
index f2cb0315c..b9e77c299 100644
--- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c
+++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c
@@ -62,6 +62,7 @@ static vx_param_description_t _custom_warp_affine_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def )
@@ -97,7 +98,7 @@ static vsi_bool _read_pixel
if (out_of_bounds)
{
- *pixel = 205.0f;
+ *pixel = 0.0f;
return TRUE;
}
@@ -125,6 +126,7 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL };
int32_t type = 0;
+ int32_t rgb_type = 0;
float matrix[6] = {0};
vsi_size_t i = 0;
vsi_size_t b = 0;
@@ -135,11 +137,16 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_size_t height = 0;
vsi_size_t outer_size = 1;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
@@ -153,6 +160,7 @@ DEF_KERNEL_EXECUTOR(_compute)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE],
&type);
+ status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rgb_type);
CHECK_STATUS_FAIL_GOTO(status, final );
for (i = 0; i < 6; i++)
{
@@ -172,34 +180,95 @@ DEF_KERNEL_EXECUTOR(_compute)
{
float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1];
float *dst_base = buffer[1] + b * width * height;
- for (y = 0; y < height; y++)
+
+ if ( rgb_type == VSI_NN_WARP_AFFINE_TYPE_RGB )
{
- for (x = 0; x < width; x++)
+ width = width / 3;
+ for (y = 0; y < height; y++)
{
- float xf = 0;
- float yf = 0;
- float dst = 0;
-
- _transform_affine(x, y, matrix, &xf, &yf);
- if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR)
+ for (x = 0; x < width; x++)
{
- _read_pixel(src_base, attr[0], xf, yf, &dst);
- dst_base[y * width + x] = dst;
+ float xf = 0;
+ float yf = 0;
+ float dst = 0;
+
+ _transform_affine(x, y, matrix, &xf, &yf);
+
+ if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR)
+ {
+ _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf), &dst);
+ dst_base[y * 3 * width + 3 * x] = dst;
+ _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf), &dst);
+ dst_base[y * 3 * width + 3 * x + 1] = dst;
+ _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf), &dst);
+ dst_base[y * 3 * width + 3 * x + 2] = dst;
+ }
+ else
+ {
+ float tl = 0, tr = 0, bl = 0, br = 0;
+ float ar = xf - floorf(xf);
+ float ab = yf - floorf(yf);
+ float al = 1.0f - ar;
+ float at = 1.0f - ab;
+
+ _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf), &tl);
+ _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1), floorf(yf), &tr);
+ _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf) + 1, &bl);
+ _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1), floorf(yf) + 1, &br);
+
+ dst_base[y * 3 * width + 3 * x] =
+ tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
+
+ _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf), &tl);
+ _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 1, floorf(yf), &tr);
+ _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf) + 1, &bl);
+ _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 1, floorf(yf) + 1, &br);
+
+ dst_base[y * 3 * width + 3 * x + 1] =
+ tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
+
+ _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf), &tl);
+ _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 2, floorf(yf), &tr);
+ _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf) + 1, &bl);
+ _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 2, floorf(yf) + 1, &br);
+
+ dst_base[y * 3 * width + 3 * x + 2] =
+ tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
+ }
}
- else
+ }
+ }
+ else
+ {
+ for (y = 0; y < height; y++)
+ {
+ for (x = 0; x < width; x++)
{
- float tl = 0, tr = 0, bl = 0, br = 0;
- float ar = xf - floorf(xf);
- float ab = yf - floorf(yf);
- float al = 1.0f - ar;
- float at = 1.0f - ab;
-
- _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl);
- _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr);
- _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl);
- _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br);
-
- dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
+ float xf = 0;
+ float yf = 0;
+ float dst = 0;
+
+ _transform_affine(x, y, matrix, &xf, &yf);
+ if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR)
+ {
+ _read_pixel(src_base, attr[0], xf, yf, &dst);
+ dst_base[y * width + x] = dst;
+ }
+ else
+ {
+ float tl = 0, tr = 0, bl = 0, br = 0;
+ float ar = xf - floorf(xf);
+ float ab = yf - floorf(yf);
+ float al = 1.0f - ar;
+ float at = 1.0f - ab;
+
+ _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl);
+ _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr);
+ _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl);
+ _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br);
+
+ dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
+ }
}
}
}
@@ -233,6 +302,8 @@ static vsi_status _query_kernel
)
{
vsi_status status = VSI_FAILURE;
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _custom_warp_affine_kernel_param_def;
@@ -260,6 +331,7 @@ static vsi_nn_kernel_node_t _setup
size_t i = 0;
size_t buffer_size = 0;
int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
+ int32_t rgb_type = vsi_nn_kernel_param_get_int32( params, "rgb_type");
float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
@@ -278,6 +350,8 @@ static vsi_nn_kernel_node_t _setup
node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
graph, F32, &buffer[i] );
}
+ node_params[9] = vsi_nn_kernel_scalar_create(
+ graph, I32, &rgb_type );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM );
@@ -286,6 +360,7 @@ static vsi_nn_kernel_node_t _setup
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
}
+ vsi_nn_kernel_scalar_release( &node_params[9] );
}
}
return node;
diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c
index 397f02291..98ae55858 100644
--- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c
+++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c
@@ -95,7 +95,7 @@ static vsi_bool _read_pixel
)
{
vsi_size_t width = attr->shape->data[0];
- vsi_size_t height = attr->shape->data[1];
+ vsi_size_t height = attr->shape->size > 1 ? attr->shape->data[1] : 1;
vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height);
vsi_size_t bx = 0, by = 0;
@@ -139,11 +139,16 @@ DEF_KERNEL_EXECUTOR(_compute)
vsi_size_t height = 0;
vsi_size_t outer_size = 1;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
@@ -237,6 +242,8 @@ static vsi_status _query_kernel
)
{
vsi_status status = VSI_FAILURE;
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _custom_warp_perspective_kernel_param_def;
diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c
index 0ec7145e4..6dc60cea4 100644
--- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c
+++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c
@@ -73,6 +73,8 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
{0, 0, 0}, // local_size: local group size in thread
{0, 0, 0}}; // global_size: image size in thread
+ VSI_UNREFERENCED(param_size);
+
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
if (!attr)
{
@@ -144,6 +146,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
@@ -170,6 +174,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
axis = vsi_nn_kernel_param_get_int32(params, "axis");
status = _query_kernel( inputs, outputs, kernel );
diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_box_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_box_evis.c
new file mode 100644
index 000000000..c56c80937
--- /dev/null
+++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_box_evis.c
@@ -0,0 +1,357 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+typedef enum
+{
+ INTERNAL_KERNEL_TINY_YOLOV4_POSTPROCESS_BOX,
+} _internal_kernel_e;
+
+#define _SOURCE "tiny_yolov4_postprocess_box"
+#define _KERNEL_NAME CVIVANTE_NAMESPACE("evis.tiny_yolov4_postprocess_box_U8_U8toU8")
+
+// Add kernel hashtable here
+#define TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
+ (( IN0_DTYPE ) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE << 16 ))
+#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
+ { TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), \
+ _KERNEL_NAME, _SOURCE }
+
+typedef struct
+{
+ uint32_t key;
+ char * function_name;
+ const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _tiny_yolov4_postprocess_box_kernel_map[] =
+{
+ // Register kernel here
+ PACK_KERNEL_MAP( U8, U8, U8 ),
+};
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _tiny_yolov4_postprocess_box_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ // Add kernel parameters here
+};
+#define _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_param_def )
+#define SCALAR_BIAS_0_VALUE (3)
+#define SCALAR_BIAS_1_VALUE (4)
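+// Indices 3 and 4 refer to the two VX_TYPE_SCALAR entries in
+// _tiny_yolov4_postprocess_box_kernel_param_def above (bias_0 and bias_1).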
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_tiny_yolov4_postprocess_box_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t gpu_param = {
+ 3,
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0}
+ };
+ vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
+ float CONST2 = 16.0f;
+
+ VSI_UNREFERENCED(param_size);
+
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
+ attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+ attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
+ CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
+ // Add initializer
+ gpu_param.dim = 2;
+ gpu_param.global_scale[0] = 4;
+ gpu_param.global_scale[1] = 1;
+ gpu_param.global_size[0] = gpu_align_p2(
+ (attr[0]->shape->data[0] + gpu_param.global_scale[0] - 1)
+ / gpu_param.global_scale[0], 8);
+ gpu_param.global_size[1] = 1;
+
+ if (attr[0]->shape->data[0] == 13 * 13)
+ {
+ CONST2 = 32.0f;
+ }
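+ // CONST2 appears to be the grid stride fed to the box-decode shader: 16 for
+ // the 26x26 feature map and 32 for the 13x13 map (consistent with a 416x416
+ // tiny-yolov4 input); the shape check above switches on the grid size.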
+
+ if (attr[0]->dtype == U8 && attr[1]->dtype == U8 && attr[2]->dtype == U8)
+ {
+ float input0_scale = attr[0]->scale;
+ float input0_tail = 0 - (float)attr[0]->zero_point * input0_scale;
+ float input1_scale = attr[1]->scale;
+ float input1_tail = 0 - (float)attr[1]->zero_point * input1_scale;
+ float output_scale = 1.0f / attr[2]->scale;
+ float output_zp = (float)attr[2]->zero_point;
+ gpu_dp_inst_t uniExtract8Data_2x8 = {{
+ 0x33333333, // TCfg
+ 0x11110000, // ASelt
+ 0x03020100, 0x03020100, // ABin
+ 0x00000000, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002400, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniDatatoFloat32_0_4x4 = {{
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000,
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniDatatoFloat32_1_4x4 = {{
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00050004, 0x00070006, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000,
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniDataTranspose_0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00000000, // ASelt
+ 0x0c080400, 0x0d090501, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniDataTranspose_1_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00000000, // ASelt
+ 0x0e0a0602, 0x0f0b0703, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16};
+
+ status = vsi_nn_kernel_gpu_add_param( node, "uniDatatoFloat32_0_4x4", &uniDatatoFloat32_0_4x4);
+ status |= vsi_nn_kernel_gpu_add_param( node, "uniDatatoFloat32_1_4x4", &uniDatatoFloat32_1_4x4);
+ status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8);
+ status |= vsi_nn_kernel_gpu_add_param( node, "uniDataTranspose_0_2x8", &uniDataTranspose_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param( node, "uniDataTranspose_1_2x8", &uniDataTranspose_1_2x8);
+ status |= vsi_nn_kernel_gpu_add_param( node, "input0_scale", &input0_scale);
+ status |= vsi_nn_kernel_gpu_add_param( node, "input0_tail", &input0_tail);
+ status |= vsi_nn_kernel_gpu_add_param( node, "input1_scale", &input1_scale);
+ status |= vsi_nn_kernel_gpu_add_param( node, "input1_tail", &input1_tail);
+ status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
+ status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp);
+ status |= vsi_nn_kernel_gpu_add_param( node, "CONST2", &CONST2);
+ CHECK_STATUS_FAIL_GOTO(status, final );
+ }
+
+ status = vsi_nn_kernel_gpu_config( node, &gpu_param );
+
+final:
+ if (attr[0])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[0] );
+ }
+ if (attr[1])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[1] );
+ }
+ if (attr[2])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[2] );
+ }
+
+ return status;
+} /* _tiny_yolov4_postprocess_box_initializer() */
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_kernel_t * kernel,
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs
+ /* Add extra params */
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_dtype_e in0_dtype;
+ vsi_nn_kernel_dtype_e in1_dtype;
+ vsi_nn_kernel_dtype_e out_dtype;
+ const _kernel_map_type * kernel_map = _tiny_yolov4_postprocess_box_kernel_map;
+ size_t kernel_map_size = _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_map );
+ vx_param_description_t * param_def = _tiny_yolov4_postprocess_box_kernel_param_def;
+ vx_kernel_initialize_f initializer = _tiny_yolov4_postprocess_box_initializer;
+
+ uint32_t key;
+ uint32_t i;
+
+ in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+ in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
+ out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+ key = TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype );
+
+ for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ {
+ if ( kernel_map[i].key == key )
+ {
+ break;
+ }
+ }
+ if ( i < (uint32_t)kernel_map_size )
+ {
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+ kernel->info.parameters = param_def;
+ kernel->info.numParams = _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_param_def );
+ kernel->info.initialize = initializer;
+ // Register code source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+ "vsi_nn_kernel_header",
+ kernel_map[i].source_name );
+ // Register binary source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ kernel_map[i].source_name );
+ status = VSI_SUCCESS;
+ }
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM];
+ vsi_nn_kernel_node_t node = NULL;
+ vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 };
+ vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
+ float bias_0 = vsi_nn_kernel_param_get_float32( params, "bias_0" );
+ float bias_1 = vsi_nn_kernel_param_get_float32( params, "bias_1" );
+
+ VSI_UNREFERENCED(params);
+
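+ /* Flatten each input to [dim0*dim1, dim2, 1] and the output to
+ [dim0, dim1*dim2, 1], presumably so the EVIS kernel can address the
+ tensors as 2D images. */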
+ memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ shape[0][0] = shape[0][0] * shape[0][1];
+ shape[0][1] = shape[0][2];
+ shape[0][2] = 1;
+
+ memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ shape[1][0] = shape[1][0] * shape[1][1];
+ shape[1][1] = shape[1][2];
+ shape[1][2] = 1;
+
+ memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ shape[2][0] = shape[2][0];
+ shape[2][1] = shape[2][2] * shape[2][1];
+ shape[2][2] = 1;
+
+ reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
+ inputs[0], shape[0], inputs[0]->attr.dim_num );
+ reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
+ inputs[1], shape[1], inputs[1]->attr.dim_num );
+ reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
+ outputs[0], shape[2], outputs[0]->attr.dim_num );
+
+ if ( !vsi_nn_kernel_gpu_check_shape(
+ reshape_tensors[0]->attr.size, reshape_tensors[0]->attr.dim_num ) )
+ {
+ return NULL;
+ }
+
+ status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
+ if ( VSI_SUCCESS == status)
+ {
+ node = vsi_nn_kernel_create_node( graph, kernel );
+ if ( node )
+ {
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io( node_params, _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM,
+ reshape_tensors, input_num, &reshape_tensors[2], output_num );
+ /* Pass parameters to node. */
+ node_params[SCALAR_BIAS_0_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &bias_0 );
+ node_params[SCALAR_BIAS_1_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &bias_1 );
+ status = vsi_nn_kernel_node_pass_param( node, node_params, _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_BIAS_0_VALUE] );
+ vsi_nn_kernel_scalar_release( &node_params[SCALAR_BIAS_1_VALUE] );
+ }
+ }
+
+ vsi_safe_release_tensor( reshape_tensors[0] );
+ vsi_safe_release_tensor( reshape_tensors[1] );
+ vsi_safe_release_tensor( reshape_tensors[2] );
+
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_EVIS( tiny_yolov4_postprocess_box, _setup )
+
diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c
new file mode 100644
index 000000000..b36ec6b14
--- /dev/null
+++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c
@@ -0,0 +1,320 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+typedef enum
+{
+ INTERNAL_KERNEL_TINY_YOLOV4_POSTPROCESS_CONFIDENCE,
+} _internal_kernel_e;
+
+#define _SOURCE "tiny_yolov4_postprocess_confidence"
+#define _KERNEL_NAME CVIVANTE_NAMESPACE("evis.tiny_yolov4_postprocess_conf_U8toU8")
+
+// Add kernel hashtable here
+#define _CONFIDENCE_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
+ (( IN_DTYPE << 8 ) | ( OUT_DTYPE ))
+#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { _CONFIDENCE_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
+ _KERNEL_NAME, _SOURCE }
+
+typedef struct
+{
+ uint32_t key;
+ char * function_name;
+ const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _tiny_yolov4_postprocess_confidence_kernel_map[] =
+{
+ // Register kernel here
+ PACK_KERNEL_MAP( U8, U8 ),
+};
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _tiny_yolov4_postprocess_confidence_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ // Add kernel parameters here
+};
+#define _TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM \
+ _cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_param_def )
+
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_tiny_yolov4_postprocess_confidence_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t gpu_param = {
+ 3,
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0}
+ };
+ vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
+
+ VSI_UNREFERENCED(param_size);
+
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
+ attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+
+ gpu_param.dim = 2;
+ gpu_param.global_scale[0] = 4;
+ gpu_param.global_scale[1] = 4;
+ gpu_param.global_size[0] = gpu_align_p2(
+ (attr[0]->shape->data[0] + gpu_param.global_scale[0] - 1)
+ / gpu_param.global_scale[0], 4);
+ gpu_param.global_size[1] = (
+ (attr[1]->shape->data[0] + gpu_param.global_scale[1] - 1)
+ / gpu_param.global_scale[1]);
+
+ if (attr[0]->dtype == U8 && attr[1]->dtype == U8)
+ {
+ float output_scale = attr[0]->scale * attr[0]->scale / attr[1]->scale;
+ int output_zp = attr[1]->zero_point;
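+ // The kernel multiplies two U8 values read from the same input, so the
+ // combined requantization factor is in_scale^2 / out_scale (folded into
+ // output_scale above and applied via the fixed-point multiplier below).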
+ uint16_t M0 = 0;
+ int32_t postShift = 0;
+ int32_t i = 0;
+
+ gpu_dp_inst_t uniU8TimesU8_0_4x4 = {{
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x01010101, // BSelt
+ 0x00010000, 0x00030002, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniU16TimesMultiplier_PostShift_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00000000, // ASelt
+ 0x03020100, 0x07060504, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniU8PlusU8_trans_0_2x8 = {{
+ 0xffffffff, // TCfg
+ 0x44444444, // ASelt
+ 0x0c080400, 0x0d090501, // ABin
+ 0x00000000, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00007400, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniU8PlusU8_trans_1_2x8 = {{
+ 0xffffffff, // TCfg
+ 0x44444444, // ASelt
+ 0x0e0a0602, 0x0f0b0703, // ABin
+ 0x00000000, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00007400, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+
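+ // gpu_quantize_multiplier_16bit splits output_scale into a 16-bit multiplier
+ // M0 and a right shift, so that output_scale ~= M0 * 2^-postShift; postShift
+ // is patched into the AccumType/PostShift word (data[7]) of the DP
+ // instruction and M0 into constant words 8..15 below.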
+ gpu_quantize_multiplier_16bit((double)output_scale, &M0, &postShift);
+
+ uniU16TimesMultiplier_PostShift_2x8.data[7] |= (postShift & 0x1F);
+ for ( i = 8; i < 16; i++ )
+ {
+ uniU16TimesMultiplier_PostShift_2x8.data[i] = M0;
+ }
+
+ status = vsi_nn_kernel_gpu_add_param( node, "uniU8TimesU8_0_4x4", &uniU8TimesU8_0_4x4);
+ status |= vsi_nn_kernel_gpu_add_param( node, "uniU16TimesMultiplier_PostShift_2x8",
+ &uniU16TimesMultiplier_PostShift_2x8);
+ status |= vsi_nn_kernel_gpu_add_param( node, "uniU8PlusU8_trans_0_2x8", &uniU8PlusU8_trans_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param( node, "uniU8PlusU8_trans_1_2x8", &uniU8PlusU8_trans_1_2x8);
+ status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp);
+ CHECK_STATUS_FAIL_GOTO(status, final );
+ }
+
+ status = vsi_nn_kernel_gpu_config( node, &gpu_param );
+
+final:
+ if (attr[0])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[0] );
+ }
+ if (attr[1])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[1] );
+ }
+
+ return status;
+} /* _tiny_yolov4_postprocess_confidence_initializer() */
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_kernel_t * kernel,
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs
+ /* Add extra params */
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_dtype_e in_dtype;
+ vsi_nn_kernel_dtype_e out_dtype;
+ const _kernel_map_type * kernel_map = _tiny_yolov4_postprocess_confidence_kernel_map;
+ size_t kernel_map_size = _cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_map );
+ vx_param_description_t * param_def = _tiny_yolov4_postprocess_confidence_kernel_param_def;
+ vx_kernel_initialize_f initializer = _tiny_yolov4_postprocess_confidence_initializer;
+
+ uint32_t key;
+ uint32_t i;
+
+ in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+ out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+ key = _CONFIDENCE_HASH_KEY( in_dtype, out_dtype );
+
+ for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ {
+ if ( kernel_map[i].key == key )
+ {
+ break;
+ }
+ }
+ if ( i < (uint32_t)kernel_map_size )
+ {
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+ kernel->info.parameters = param_def;
+ kernel->info.numParams = _cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_param_def );
+ kernel->info.initialize = initializer;
+ // Register code source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
+ kernel_map[i].source_name );
+ // Register binary source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ kernel_map[i].source_name );
+ status = VSI_SUCCESS;
+ }
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM];
+ vsi_nn_kernel_node_t node = NULL;
+ vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { 0 };
+ vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
+
+ VSI_UNREFERENCED(params);
+
+ memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ shape[0][0] = shape[0][0] * shape[0][1];
+ shape[0][1] = shape[0][2];
+ shape[0][2] = 1;
+
+ memcpy(shape[1], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ shape[1][0] = shape[1][0];
+ shape[1][1] = shape[1][2] * shape[1][1];
+ shape[1][2] = 1;
+
+ reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
+ inputs[0], shape[0], inputs[0]->attr.dim_num );
+ reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
+ outputs[0], shape[1], outputs[0]->attr.dim_num );
+
+ if ( !vsi_nn_kernel_gpu_check_shape(
+ reshape_tensors[0]->attr.size, reshape_tensors[0]->attr.dim_num ) )
+ {
+ return NULL;
+ }
+
+ status = _query_kernel( kernel, inputs, outputs );
+ if ( VSI_SUCCESS == status)
+ {
+ node = vsi_nn_kernel_create_node( graph, kernel );
+ if ( node )
+ {
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io( node_params, _TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM,
+ reshape_tensors, input_num, &reshape_tensors[1], output_num );
+ /* Pass parameters to node. */
+ status = vsi_nn_kernel_node_pass_param( node, node_params,
+ _TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM );
+ }
+ }
+
+ vsi_safe_release_tensor(reshape_tensors[0]);
+ vsi_safe_release_tensor(reshape_tensors[1]);
+
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_EVIS( tiny_yolov4_postprocess_confidence, _setup )
+
diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c
index 169825158..3272fd634 100644
--- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c
+++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c
@@ -50,18 +50,27 @@ typedef enum _custom_warp_affine_type_e
}custom_warp_affine_type_e;
#define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine"
+#define _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE "custom_warp_affine_rgb"
// Add kernel hashtable here
-#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \
- (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20))
+#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D, RGB_TYPE ) \
+ (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20) | (RGB_TYPE << 24))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
- { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \
+ { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \
_CUSTOM_WARP_AFFINE_KERNEL_SOURCE }
#define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
- { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \
+ { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \
_CUSTOM_WARP_AFFINE_KERNEL_SOURCE }
+#define PACK_RGB_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
+ { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1 ), \
+ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb"), \
+ _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE }
+#define PACK_RGB_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
+ { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1 ), \
+ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_2D"), \
+ _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE }
typedef struct
{
@@ -78,6 +87,12 @@ static const _kernel_map_type _custom_warp_affine_kernel_map[] =
PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_2D_KERNEL_MAP( U8, U8, bilinear ),
+
+ PACK_RGB_KERNEL_MAP( U8, U8, nearest_neighbor ),
+ PACK_RGB_KERNEL_MAP( U8, U8, bilinear ),
+
+ PACK_RGB_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
+ PACK_RGB_2D_KERNEL_MAP( U8, U8, bilinear ),
};
/*
@@ -124,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer)
float matrix4[4] = {0};
int32_t i = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -178,7 +195,81 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer)
return status;
} /* _custom_warp_affine_initializer() */
+DEF_KERNEL_INITIALIZER(_custom_warp_affine_rgb_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t gpu_param = {
+ 3,
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0}
+ };
+
+ vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
+ vsi_size_array_t * out_shape = NULL;
+ float m[6] = {0};
+ float matrix0[4] = {0};
+ float matrix1[4] = {0};
+ int32_t i = 0;
+
+ VSI_UNREFERENCED(param_size);
+
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
+ attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+
+ for (i = 0; i < 6; i++)
+ {
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
+ &m[i]);
+ CHECK_STATUS_FAIL_GOTO(status, final );
+ }
+
+ matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[2]; matrix0[3] = m[3];
+ matrix1[0] = m[4]; matrix1[1] = m[5];
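+ // The six affine coefficients are split across two 4-float kernel parameters:
+ // matrix0 carries the 2x2 linear part (m[0]..m[3]), matrix1 the x/y
+ // translation (m[4], m[5]); the last two lanes of matrix1 stay zero.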
+ out_shape = attr[1]->shape;
+
+ gpu_param.global_scale[0] = 2;
+ gpu_param.global_scale[1] = 1;
+ gpu_param.global_scale[2] = 1;
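+ // The x dimension is divided by an extra factor of 3 below because the RGB
+ // kernel walks interleaved R,G,B elements; each work-item then covers
+ // global_scale[0] pixels (3 * global_scale[0] elements) per row.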
+ gpu_param.global_size[0] = (
+ (out_shape->data[0] + gpu_param.global_scale[0] - 1)
+ / (3 * gpu_param.global_scale[0]));
+ gpu_param.global_size[1] = (
+ (out_shape->data[1] + gpu_param.global_scale[1] - 1)
+ / gpu_param.global_scale[1]);
+ gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
+
+ status = vsi_nn_kernel_gpu_add_param( node,
+ "matrix0", &matrix0 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "matrix1", &matrix1 );
+ CHECK_STATUS_FAIL_GOTO(status, final );
+
+ status = vsi_nn_kernel_gpu_config( node, &gpu_param );
+
+final:
+ if (attr[0])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[0] );
+ attr[0] = NULL;
+ }
+ if (attr[1])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[1] );
+ attr[1] = NULL;
+ }
+
+ return status;
+} /* _custom_warp_affine_rgb_initializer() */
/*
* Query kernel
@@ -188,7 +279,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
- int32_t type
+ int32_t type,
+ int32_t rgb_type
)
{
vsi_status status = VSI_FAILURE;
@@ -205,8 +297,11 @@ static vsi_status _query_kernel
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
- key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img );
-
+ key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img, rgb_type );
+ if (rgb_type == 1)
+ {
+ initializer = _custom_warp_affine_rgb_initializer;
+ }
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
@@ -251,6 +346,7 @@ static vsi_nn_kernel_node_t _setup
size_t i = 0;
size_t buffer_size = 0;
int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
+ int32_t rgb_type = vsi_nn_kernel_param_get_int32( params, "rgb_type");
float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
@@ -258,7 +354,7 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
- status = _query_kernel( kernel, inputs, outputs, type );
+ status = _query_kernel( kernel, inputs, outputs, type, rgb_type );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@@ -282,7 +378,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
}
// Set default border mode.
- border.constant_value.U32 = 0xcdcdcdcd;
+ border.constant_value.U32 = 0x00000000;
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c
index 69367599b..ab6d8437e 100644
--- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c
+++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c
@@ -127,6 +127,8 @@ DEF_KERNEL_INITIALIZER(_custom_warp_perspective_initializer)
float matrix4[4] = {0};
int32_t i = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c b/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c
index 2e7415e62..606b7c80f 100644
--- a/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c
+++ b/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c
@@ -48,6 +48,9 @@ static vsi_status op_compute
{
vsi_status status = VSI_SUCCESS;
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
#if defined(VX_DENOISE_POSTPROCESS_SUPPORT) && VX_DENOISE_POSTPROCESS_SUPPORT
self->n = vxDenoisePostProcesslayer(
self->graph->g,
@@ -83,6 +86,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
@@ -93,6 +99,9 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_setup() */
@@ -101,6 +110,7 @@ static vsi_status op_init
vsi_nn_node_t* self
)
{
+ VSI_UNREFERENCED(self);
return VSI_SUCCESS;
} /* op_init() */
diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_sample.c b/src/tim/vx/internal/src/custom/ops/op_custom_sample.c
index 145953922..ef28a2e64 100644
--- a/src/tim/vx/internal/src/custom/ops/op_custom_sample.c
+++ b/src/tim/vx/internal/src/custom/ops/op_custom_sample.c
@@ -63,6 +63,9 @@ static vsi_bool op_check
)
{
/*TODO: Check params. */
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
@@ -73,6 +76,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(node);
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num)
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c
index 3a37247a9..6da5e6136 100644
--- a/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c
+++ b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c
@@ -62,6 +62,9 @@ static vsi_bool op_check
)
{
/*TODO: Check params. */
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
@@ -72,6 +75,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(node);
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c
index e076b7c7c..5ee37c58e 100644
--- a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c
+++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c
@@ -59,6 +59,7 @@ static vsi_status op_compute
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 );
vsi_nn_kernel_param_add_int32( param, "type", p->type);
+ vsi_nn_kernel_param_add_int32( param, "rgb_type", p->rgb_type);
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"custom_warp_affine",
@@ -78,6 +79,9 @@ static vsi_bool op_check
)
{
/*TODO: Check tensor shapes. */
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c
index 7afbd8352..91f788c94 100644
--- a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c
+++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c
@@ -78,6 +78,9 @@ static vsi_bool op_check
)
{
/*TODO: Check tensor shapes. */
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
diff --git a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c
index 6a84a5e0b..b9a840ff3 100644
--- a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c
@@ -100,7 +100,7 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
size_t param_size
)
{
- vsi_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
2,
@@ -113,6 +113,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
vsi_nn_kernel_tensor_attr_t *input0_attr = NULL;
vsi_size_array_t *input_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0);
CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
input_shape = input0_attr->shape;
diff --git a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c
index 5741690d3..bc7d36efc 100644
--- a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c
@@ -143,6 +143,8 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -183,7 +185,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int32_t i;
+ size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -240,6 +242,9 @@ static vsi_nn_kernel_node_t _setup
int32_t axis = 0;
vsi_size_t axis_size = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
axis = vsi_nn_kernel_param_get_int32(params, "axis");
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
diff --git a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c
index b710fa11e..6fb6cd872 100644
--- a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c
@@ -143,6 +143,8 @@ DEF_KERNEL_INITIALIZER(_argmin_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -183,7 +185,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int32_t i;
+ size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -240,6 +242,9 @@ static vsi_nn_kernel_node_t _setup
int32_t axis = 0;
size_t axis_size = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
axis = vsi_nn_kernel_param_get_int32(params, "axis");
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
diff --git a/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c b/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c
index c0ed53eee..24b266439 100644
--- a/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c
@@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_avg_pool3d_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
vxReadScalarValue(depth_out, &depth_out_value);
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c
index c62f0b4c0..689603021 100644
--- a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c
@@ -135,6 +135,8 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * in_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@@ -170,7 +172,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -238,6 +240,9 @@ static vsi_nn_kernel_node_t _setup
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f;
float eps = vsi_nn_kernel_param_get_float32(params, "eps");
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const)
|| ( inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16
&& inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 )
diff --git a/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c
index bda96ffcb..84811fd82 100644
--- a/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c
@@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
vsi_nn_kernel_tensor_attr_t* output_attr = NULL;
vsi_size_array_t* out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr =
vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final);
@@ -140,9 +142,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
gpu_param.dim = 2;
gpu_param.global_size[0] =
- gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) /
- gpu_param.global_scale[0],
- 4);
+ (out_shape->data[0] + gpu_param.global_scale[0] - 1) /
+ gpu_param.global_scale[0];
gpu_param.global_size[1] =
((out_shape->data[1] + gpu_param.global_scale[1] - 1) /
gpu_param.global_scale[1]);
diff --git a/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c b/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c
index e20cb1be4..d3c4968a8 100644
--- a/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c
@@ -134,6 +134,8 @@ DEF_KERNEL_INITIALIZER(_bucketize_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/cast_cl.c b/src/tim/vx/internal/src/kernel/cl/cast_cl.c
index 33291a799..e379000ea 100644
--- a/src/tim/vx/internal/src/kernel/cl/cast_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/cast_cl.c
@@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_cast_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
@@ -251,6 +253,8 @@ static vsi_nn_kernel_node_t _setup
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
+ VSI_UNREFERENCED(params);
+
if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c
index 4b518b2be..ec74f361b 100644
--- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c
@@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c
index 8fec39b3c..4b1369f96 100644
--- a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c
@@ -229,6 +229,8 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -285,7 +287,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -347,6 +349,9 @@ static vsi_nn_kernel_node_t _setup
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
@@ -363,11 +368,11 @@ static vsi_nn_kernel_node_t _setup
outputs[0], shapes[2], new_rank );
#define _swap_tensor(a, b, tmp) \
- do { \
+ { \
tmp = a; \
a = b; \
b = tmp; \
- } while(0)
+ }
if (shapes[1][3] > shapes[0][3] && new_rank == 4)
{
diff --git a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c
index 0aac099e6..8dca93180 100644
--- a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c
@@ -135,6 +135,8 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
int32_t c = 1;
uint32_t dim = 1;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@@ -203,7 +205,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -288,11 +290,28 @@ static vsi_nn_kernel_node_t _setup
int32_t width = 0;
int32_t height = 0;
int32_t channel = 1;
- int32_t i = 0;
+ uint32_t i = 0;
+
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
- vsi_nn_kernel_optimize_softmax_shape(
- inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
- shapes[0], &rs_dim, &axis_new);
+ if (axis < 0)
+ {
+ axis_new = 0;
+ shapes[0][0] = 1;
+ shapes[0][1] = 1;
+ for (i = 0; i < inputs[0]->attr.dim_num; i++)
+ {
+ shapes[0][0] *= inputs[0]->attr.size[i];
+ }
+ rs_dim = 2;
+ }
+ else
+ {
+ vsi_nn_kernel_optimize_softmax_shape(
+ inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
+ shapes[0], &rs_dim, &axis_new);
+ }
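+ // axis < 0 is handled as "accumulate over the whole tensor": the input is
+ // flattened to [total_elements, 1] and the kernel runs with axis 0 on that
+ // 2D view.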
if (rs_dim > 3)
{
return NULL;
diff --git a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c
index e1bb5f9c4..94e79fe56 100644
--- a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c
@@ -103,6 +103,8 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
int32_t output_height = 0;
int32_t output_chn = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@@ -145,7 +147,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -195,6 +197,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c
index f34393ecf..596aab56e 100644
--- a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c
@@ -126,6 +126,9 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * in_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+ VSI_UNREFERENCED(node);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
in_shape = input_attr->shape;
diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c
index d54182d11..c278d0603 100644
--- a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c
@@ -181,6 +181,14 @@ static vsi_nn_kernel_node_t _setup
{
vsi_nn_kernel_node_t node = NULL;
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(kernel);
+
return node;
} /* _setup() */
diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c
index 5d29c6796..c44010a9c 100644
--- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c
@@ -211,6 +211,9 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -253,7 +256,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -327,6 +330,9 @@ static vsi_nn_kernel_node_t _setup
float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" );
float beta = vsi_nn_kernel_param_get_float32( params, "beta" );
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if (unary_type == UNARY_SELU)
{
alpha = alpha * beta;
diff --git a/src/tim/vx/internal/src/kernel/cl/erf_cl.c b/src/tim/vx/internal/src/kernel/cl/erf_cl.c
index d6ef8d85b..e7aa1d3d2 100644
--- a/src/tim/vx/internal/src/kernel/cl/erf_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/erf_cl.c
@@ -135,6 +135,9 @@ DEF_KERNEL_INITIALIZER(_erf_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -243,6 +246,10 @@ static vsi_nn_kernel_node_t _setup
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c
index af31ed15d..7341f3282 100644
--- a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c
@@ -122,11 +122,14 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
{0, 0, 0},
{0, 0, 0}
};
- vx_status status = VX_FAILURE;
- vx_tensor output = (vx_tensor)param[2];
+ vsi_status status = VSI_FAILURE;
+ vx_tensor output = (vx_tensor)param[2];
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+ VSI_UNREFERENCED(node);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -258,6 +261,8 @@ static vsi_nn_kernel_node_t _setup
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
+ VSI_UNREFERENCED(params);
+
outputScale = 1.0f / outputScale;
input0Tail = -(input0Tail * input0Scale);
input1Tail = -(input1Tail * input1Scale);
diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c
index bafe86c15..a3fa2d61d 100644
--- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c
@@ -205,6 +205,9 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
size_t input_dims1 = 0;
size_t i = 0;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -264,7 +267,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -334,6 +337,9 @@ static vsi_nn_kernel_node_t _setup
int32_t is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0;
int32_t i = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array);
status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array);
status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array);
diff --git a/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c
index a8d56a2bc..82838648c 100644
--- a/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c
@@ -51,18 +51,30 @@ typedef enum
#define STR(a) #a
// Add kernel hashtable here
-#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D ) \
- (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ))
+#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D, BEYOND_MAXWIDTH ) \
+ (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ) | \
+ (BEYOND_MAXWIDTH << 28))
#define PACK_KERNEL_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
- { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 0), \
CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
_GATHER_ELEMENTS_KERNEL_SOURCE}
#define PACK_KERNEL_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
- { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 0), \
CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
_GATHER_ELEMENTS_KERNEL_SOURCE}
+#define PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 1), \
+ CVIVANTE_NAMESPACE("cl.gather_elements_beyond_maxwidth_axis"STR(AXIS)"_"STR(IN0_DTYPE)\
+ "_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
+ _GATHER_ELEMENTS_KERNEL_SOURCE}
+
+#define PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 1), \
+ CVIVANTE_NAMESPACE("cl.gather_elements_beyond_maxwidth_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)\
+ "to"STR(OUT_DTYPE)"_2D"), _GATHER_ELEMENTS_KERNEL_SOURCE}
+
typedef struct
{
uint32_t key;
@@ -89,6 +101,44 @@ static const _kernel_map_type _gather_elements_kernel_map[] =
PACK_KERNEL_2D_MAP( 1, F32, I32, F32 ),
PACK_KERNEL_2D_MAP( 1, I32, I32, I32 ),
PACK_KERNEL_2D_MAP( 1, U32, I32, U32 ),
+
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, F32, I32, F32),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I32, I32, I32 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, U8, I32, U8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, F32, I32, F32),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I32, I32, I32 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, U8, I32, U8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, F32, I32, F32),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I32, I32, I32 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, U8, I32, U8 ),
+
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, F32, I32, F32 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I32, I32, I32 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, U8, I32, U8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, F32, I32, F32 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I32, I32, I32 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, U8, I32, U8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, F32, I32, F32 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I32, I32, I32 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, U8, I32, U8 ),
};
@@ -126,12 +176,38 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer)
{0, 0, 0},
{0, 0, 0}
};
+ vsi_nn_kernel_tensor_attr_t * input_attr0 = NULL;
+ vsi_nn_kernel_tensor_attr_t * input_attr1 = NULL;
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ uint32_t width0 = 0;
+ uint32_t height0 = 0;
+ uint32_t width1 = 0;
+ uint32_t height1 = 0;
+ uint32_t width_out = 0;
+ uint32_t height_out = 0;
+ uint32_t depth0 = 0;
+ uint32_t depth1 = 0;
+
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+ input_attr0 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+ CHECK_PTR_FAIL_GOTO( input_attr0, "Create tensor attr buffer fail.", final );
+ input_attr1 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ CHECK_PTR_FAIL_GOTO( input_attr1, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
+ width0 = (uint32_t)input_attr0->shape->data[0];
+ height0 = (uint32_t)input_attr0->shape->data[1];
+ depth0 = input_attr0->shape->size > 2 ? (uint32_t)input_attr0->shape->data[2] : 1;
+ width1 = (uint32_t)input_attr1->shape->data[0];
+ height1 = (uint32_t)input_attr1->shape->data[1];
+ depth1 = input_attr1->shape->size > 2 ? (uint32_t)input_attr1->shape->data[2] : 1;
+ width_out = (uint32_t)output_attr->shape->data[0];
+ height_out = (uint32_t)output_attr->shape->data[1];
+
out_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
@@ -146,7 +222,25 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer)
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
+
+ if (width0 >= GPU_TENSOR_MAX_WIDTH ||
+ width1 >= GPU_TENSOR_MAX_WIDTH ||
+ height0 >= GPU_TENSOR_MAX_WIDTH ||
+ height1 >= GPU_TENSOR_MAX_WIDTH ||
+ depth0 >= GPU_TENSOR_MAX_WIDTH ||
+ depth1 >= GPU_TENSOR_MAX_WIDTH)
+ {
+ gpu_param.global_scale[0] = 1;
+ gpu_param.global_size[0] = out_shape->data[0];
+ }
+
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
+ status |= vsi_nn_kernel_gpu_add_param( node, "width0", &width0 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "height0", &height0 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "width1", &width1 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "height1", &height1 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "width_out", &width_out );
+ status |= vsi_nn_kernel_gpu_add_param( node, "height_out", &height_out );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
@@ -178,32 +272,52 @@ static vsi_status _query_kernel
int32_t img_2d = (outputs[0]->attr.dim_num < 3 || outputs[0]->attr.size[2] == 1) ? 1 : 0;
uint32_t key = 0;
uint32_t i;
+ int32_t beyond_maxwidth = 0;
+ vsi_size_t depth0 = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
+ vsi_size_t depth1 = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+ if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH ||
+ inputs[0]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH ||
+ inputs[1]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH ||
+ inputs[1]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH ||
+ depth0 >= GPU_TENSOR_MAX_WIDTH ||
+ depth1 >= GPU_TENSOR_MAX_WIDTH)
+ {
+ beyond_maxwidth = 1;
+ }
+
#define _PACK_SELECT_KEY( in0_type, out_type ) \
( ( in0_type ) | ( out_type << 8 ))
- switch (_PACK_SELECT_KEY(in0_dtype, out_dtype))
+ if (beyond_maxwidth == 0)
+ {
+ switch (_PACK_SELECT_KEY(in0_dtype, out_dtype))
+ {
+ case _PACK_SELECT_KEY(F32, F32):
+ case _PACK_SELECT_KEY(F16, F16):
+ key = GATHER_ELEMENTS_HASH_KEY( axis, F32, in1_dtype, F32, img_2d, 0 );
+ break;
+ case _PACK_SELECT_KEY(U32, U32):
+ case _PACK_SELECT_KEY(U16, U16):
+ case _PACK_SELECT_KEY(U8, U8):
+ key = GATHER_ELEMENTS_HASH_KEY( axis, U32, in1_dtype, U32, img_2d, 0 );
+ break;
+ case _PACK_SELECT_KEY(I32, I32):
+ case _PACK_SELECT_KEY(I16, I16):
+ case _PACK_SELECT_KEY(I8, I8):
+ key = GATHER_ELEMENTS_HASH_KEY( axis, I32, in1_dtype, I32, img_2d, 0 );
+ break;
+ default:
+ break;
+ }
+ }
+ else
{
- case _PACK_SELECT_KEY(F32, F32):
- case _PACK_SELECT_KEY(F16, F16):
- key = GATHER_ELEMENTS_HASH_KEY( axis, F32, in1_dtype, F32, img_2d );
- break;
- case _PACK_SELECT_KEY(U32, U32):
- case _PACK_SELECT_KEY(U16, U16):
- case _PACK_SELECT_KEY(U8, U8):
- key = GATHER_ELEMENTS_HASH_KEY( axis, U32, in1_dtype, U32, img_2d );
- break;
- case _PACK_SELECT_KEY(I32, I32):
- case _PACK_SELECT_KEY(I16, I16):
- case _PACK_SELECT_KEY(I8, I8):
- key = GATHER_ELEMENTS_HASH_KEY( axis, I32, in1_dtype, I32, img_2d );
- break;
- default:
- break;
+ key = GATHER_ELEMENTS_HASH_KEY( axis, in0_dtype, in1_dtype, out_dtype, img_2d, 1 );
}
#undef _PACK_SELECT_KEY
@@ -221,7 +335,8 @@ static vsi_status _query_kernel
kernel->info.numParams = _cnt_of_array( _gather_elements_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
- vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+ "eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
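
The widened GATHER_ELEMENTS_HASH_KEY places the new beyond-max-width flag at bit 28, above the 2D-image flag at bit 26, and the beyond-max-width kernel variants are additionally handed explicit width/height/depth parameters so they can address tensors whose extents reach or exceed GPU_TENSOR_MAX_WIDTH. A hypothetical decode helper (names are illustrative, not part of the source) makes the bit layout explicit:

    /* Illustrative decode of the widened gather_elements key:
     * bits [1:0] axis, [9:2] in0 dtype, [17:10] in1 dtype,
     * [25:18] out dtype, bit 26 2D-image flag, bit 28 beyond-max-width flag. */
    typedef struct
    {
        uint32_t axis, in0, in1, out, img_2d, beyond_maxwidth;
    } ge_key_fields_t;

    static ge_key_fields_t ge_key_decode( uint32_t key )
    {
        ge_key_fields_t f;
        f.axis            =   key         & 0x3;
        f.in0             = ( key >> 2  ) & 0xff;
        f.in1             = ( key >> 10 ) & 0xff;
        f.out             = ( key >> 18 ) & 0xff;
        f.img_2d          = ( key >> 26 ) & 0x1;
        f.beyond_maxwidth = ( key >> 28 ) & 0x1;
        return f;
    }
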
diff --git a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c
index a41e7ace3..bfcb0df06 100644
--- a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c
@@ -119,7 +119,7 @@ static vsi_status cal_gather_nd_tensor_reshape_size
uint32_t block_size,
uint32_t coordDim,
int32_t* newDim,
- int32_t batch_dims
+ uint32_t batch_dims
)
{
vsi_status status = VSI_FAILURE;
@@ -146,17 +146,23 @@ static vsi_status cal_gather_nd_tensor_reshape_size
if (batch_dims)
{
+ int32_t rank = 1;
for (i = 0; i < offset; i++)
{
sizes[0] *= input_size[i];
}
- for (i = 0; i < coordDim; i++)
+ for (i = 0; i < coordDim - 1; i++)
{
- sizes[i + 1] = input_size[i + offset];
+ sizes[rank++] = input_size[i + offset];
}
- newDim[0] = coordDim == 1 ? 2 : 3;
+ for (i = 0; i < batch_dims; i++)
+ {
+ sizes[rank] *= input_size[dims_num - i - 1];
+ }
+
+ newDim[0] = rank + 1;
}
else
{
@@ -186,13 +192,27 @@ static vsi_status cal_gather_nd_tensor_reshape_size
}
else // indices&output reshape
{
- if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
+ if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH && batch_dims == 0)
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
status = VSI_SUCCESS;
newDim[0] = 2;
}
+ else if (batch_dims > 0)
+ {
+ vsi_size_t batch_cnt = 1;
+ for (i = 0; i < batch_dims; ++i)
+ {
+ batch_cnt *= input_size[dims_num - i - 1];
+ }
+
+ sizes[0] = block_size;
+ sizes[1] = (elementCnt / block_size) / batch_cnt;
+ sizes[2] = batch_cnt;
+ status = VSI_SUCCESS;
+ newDim[0] = 3;
+ }
}
#undef VSI_NN_MAX_IMAGE_WIDTH
@@ -220,7 +240,11 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t block_size = 0;
- vsi_ssize_t indices_num = 1;
+ vsi_size_t indices_num = 1;
+ vsi_size_t batch_num = 1;
+
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@@ -229,6 +253,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
indices_num = attr[0]->shape->data[1];
+ batch_num = (attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1);
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
@@ -237,7 +262,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = indices_num;
- gpu_param.global_size[2] = 1;
+ gpu_param.global_size[2] = batch_num;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
@@ -265,7 +290,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_nn_kernel_coord_type_e coord_type = _error;
uint32_t key = 0;
- int i = 0;
+ int32_t batch_flg = batch_dims > 0 ? 1 : 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -301,7 +327,7 @@ static vsi_status _query_kernel
coord_type = _3D;
}
- key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type, batch_dims );
+ key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type, batch_flg );
for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ )
{
@@ -348,6 +374,9 @@ static vsi_nn_kernel_node_t _setup
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
status = cal_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims);
status |= cal_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims);
status |= cal_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims);
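
The batch_dims > 0 branches added above fold all trailing batch dimensions into a third axis so the initializer can iterate batches along global_size[2]. A worked example under assumed shapes, for illustration only:

    /* Illustrative: indices tensor shape {2, 10, 3, 4}, coord_dim = 2
     * (so block_size = 2 for the indices reshape), batch_dims = 2.
     *   elementCnt = 2 * 10 * 3 * 4 = 240
     *   batch_cnt  = input_size[3] * input_size[2] = 4 * 3 = 12
     *   sizes      = { 2, (240 / 2) / 12, 12 } = { 2, 10, 12 }, newDim = 3
     * The initializer then reads indices_num = 10 and batch_num = 12 from the
     * reshaped indices and launches global_size[2] = batch_num instead of 1. */
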
diff --git a/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c b/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c
index 1e51bd7b7..07eb2651f 100644
--- a/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c
@@ -108,6 +108,9 @@ DEF_KERNEL_INITIALIZER(_globallppool_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_shape = output_attr->shape;
diff --git a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c
index 95a4bff5a..5e727fadb 100644
--- a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c
@@ -220,6 +220,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer)
vsi_ssize_t width = 0;
vsi_ssize_t chn = 0;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -275,6 +278,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_ssize_t chn = 0;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@@ -325,6 +331,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
vsi_ssize_t chn = 0;
int32_t is2D = 0;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
@@ -489,6 +498,9 @@ static vsi_nn_kernel_node_t _setup
float rSpaceOrg = 1.0f / (width * height);
float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c
index 410fe5638..b6e0bf733 100644
--- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c
@@ -91,6 +91,9 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer)
)
{
vsi_status status = VSI_FAILURE;
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param);
+ VSI_UNREFERENCED(param_size);
// vsi_nn_kernel_tensor_attr * attr[2] = { NULL };
// attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
// attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -172,6 +175,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
+ VSI_UNREFERENCED(params);
+
/*
// Check if gpu can support the size
if( !vsi_nn_kernel_gpu_check_shape(
diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c
index 1a849fe60..828a88a22 100644
--- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c
@@ -91,6 +91,10 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer)
)
{
vsi_status status = VSI_FAILURE;
+
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(param);
+ VSI_UNREFERENCED(param_size);
// vsi_nn_kernel_tensor_attr * attr[2] = { NULL };
// attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
// attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -172,6 +176,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_SMA_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
+ VSI_UNREFERENCED(params);
+
/*
// Check if gpu can support the size
if( !vsi_nn_kernel_gpu_check_shape(
diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c
index e2b6964a8..193f388d3 100644
--- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c
@@ -118,6 +118,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
vsi_nn_kernel_tensor_t input = NULL;
vsi_nn_kernel_tensor_attr_t* input_attr = NULL;
+ VSI_UNREFERENCED(param_size);
+
input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_HSTATE];
input_attr = vsi_nn_kernel_tensor_attr_create( input );
diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c
index 3912b95cb..0896c6a1c 100644
--- a/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c
@@ -110,6 +110,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer)
vsi_nn_kernel_tensor_t output = NULL;
vsi_nn_kernel_tensor_attr_t* output_attr;
+ VSI_UNREFERENCED(param_size);
+
output = (vsi_nn_kernel_tensor_t)param[3];
output_attr = vsi_nn_kernel_tensor_attr_create( output );
diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c
index a18b1121e..a99f8b908 100644
--- a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c
@@ -120,6 +120,8 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer)
vsi_nn_kernel_tensor_t input = NULL;
vsi_nn_kernel_tensor_attr_t* input_attr = NULL;
+ VSI_UNREFERENCED(param_size);
+
input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_H_STATE];
input_attr = vsi_nn_kernel_tensor_attr_create( input );
diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c
index 892377b53..942585037 100644
--- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c
@@ -188,6 +188,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer)
vsi_ssize_t height = 0;
vsi_ssize_t chn = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -255,6 +257,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
vsi_ssize_t height = 0;
vsi_ssize_t chn = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
@@ -405,6 +409,9 @@ static vsi_nn_kernel_node_t _setup
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
float inv_multiplier = (float)1.0 / (float)(width * height);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c b/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c
index 2626bfeaa..44186d138 100644
--- a/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c
@@ -164,6 +164,8 @@ DEF_KERNEL_INITIALIZER(_l1norm_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c
index 7b2f50aa5..83e598bb0 100644
--- a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c
@@ -115,6 +115,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis);
diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c
index 20f3ab01c..a13ec2e19 100644
--- a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c
@@ -123,6 +123,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
vsi_ssize_t height = 0;
vsi_ssize_t chn = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
@@ -175,7 +177,9 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
+
+ VSI_UNREFERENCED(reshape2D);
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -243,6 +247,9 @@ static vsi_nn_kernel_node_t _setup
float zp2ScaleE2 = 0.0f;
float sumZpScaleE2 = 0.0f;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
scale_inOut = input_scale * output_scale;
e2InScale = input_scale * input_scale;
sumZpScale = width * input_zp * input_scale;
diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c
index 311de9729..3fc716cad 100644
--- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c
@@ -148,6 +148,8 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer)
vsi_size_array_t * out_shape = NULL;
int32_t axis = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -194,7 +196,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -244,6 +246,9 @@ static vsi_nn_kernel_node_t _setup
float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f;
float scaleValue = (vx_float32)(log10(exp(1.0f)) / log10(2.0f));
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
axis = vsi_nn_kernel_param_get_int32(params, "axis");
beta = vsi_nn_kernel_param_get_float32(params, "beta");
diff --git a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c
index bcf4d7a7f..27b97ebb6 100644
--- a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c
@@ -106,11 +106,13 @@ DEF_KERNEL_INITIALIZER(_logical_not_initializer)
{0, 0, 0},
{0, 0, 0}
};
- vx_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[1];
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -218,6 +220,8 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t new_rank = 0;
vsi_bool ret = FALSE;
+ VSI_UNREFERENCED(params);
+
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
diff --git a/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c
index 7121aa93b..4d0c23ab7 100644
--- a/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c
@@ -111,11 +111,13 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer)
{0, 0, 0},
{0, 0, 0}
};
- vx_status status = VX_FAILURE;
- vx_tensor output = (vx_tensor)param[2];
+ vsi_status status = VSI_FAILURE;
+ vx_tensor output = (vx_tensor)param[2];
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -250,11 +252,11 @@ static vsi_nn_kernel_node_t _setup
outputs[0], shapes[2], new_rank );
#define _swap_tensor(a, b, tmp) \
- do { \
+ { \
tmp = a; \
a = b; \
b = tmp; \
- } while(0)
+ }
if (shapes[1][3] > shapes[0][3] && new_rank == 4)
{
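
Replacing the do { ... } while(0) wrapper in _swap_tensor with a bare block (presumably to avoid a constant-condition warning) gives up the single-statement guarantee of that idiom, so the macro must only be expanded inside an already-braced body, as it is in the if block here. Illustrative contrast with hypothetical swap macros:

    /* Hypothetical macros showing what the do/while(0) idiom buys. */
    #define SWAP_BLOCK(a, b, tmp)  { tmp = (a); (a) = (b); (b) = tmp; }
    #define SWAP_STMT(a, b, tmp)   do { tmp = (a); (a) = (b); (b) = tmp; } while (0)

    static void swap_demo( int cond, int *x, int *y )
    {
        int t;
        if (cond)
            SWAP_STMT(*x, *y, t);   /* legal as a single statement before an else */
        else
            *x = 0;
        /* SWAP_BLOCK in the same position would not compile: the ';' after the
         * block terminates the if, leaving the else dangling. */
    }
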
diff --git a/src/tim/vx/internal/src/kernel/cl/lppool_cl.c b/src/tim/vx/internal/src/kernel/cl/lppool_cl.c
index 514bec0c7..a46c728d7 100644
--- a/src/tim/vx/internal/src/kernel/cl/lppool_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/lppool_cl.c
@@ -121,6 +121,8 @@ DEF_KERNEL_INITIALIZER(_lppool_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c
index a7bdb2c89..dec27e3f9 100644
--- a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c
@@ -68,7 +68,8 @@ typedef enum _LSTMUNIT_nn_activation_e
#define LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \
_input_type, _output_type, _cell_type, _rec_act) \
((_is_ln << 31) | (_is_cifg << 30) | (_is_proj << 29) | (_is_hybrid << 28) | (_is_peephole << 27) \
-| (_input_type << 23) | (_output_type << 19) | (_cell_type << 15) | (_rec_act << 10))
+| (((uint32_t)_input_type) << 23) | (((uint32_t)_output_type) << 19) | (((uint32_t)_cell_type) << 15) \
+| (_rec_act << 10))
#define LSTMUNIT_ACTIVATION_SOURCE_NAME(_ln_cifg_proj_hybrid_, _input_type) \
"lstmunit_activation_"#_ln_cifg_proj_hybrid_"_"#_input_type
@@ -941,6 +942,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_CL_initializer)
vsi_nn_kernel_tensor_t output = NULL;
vsi_nn_kernel_tensor_attr_t* output_attr;
+ VSI_UNREFERENCED(param_size);
+
output = (vsi_nn_kernel_tensor_t)param[CL_OUTPUT];
output_attr = vsi_nn_kernel_tensor_attr_create( output );
@@ -983,6 +986,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_CB_initializer)
vsi_nn_kernel_tensor_t output = NULL;
vsi_nn_kernel_tensor_attr_t* output_attr;
+ VSI_UNREFERENCED(param_size);
+
output = (vsi_nn_kernel_tensor_t)param[CB_OUTPUT];
output_attr = vsi_nn_kernel_tensor_attr_create( output );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
@@ -1027,6 +1032,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_CS_initializer)
vsi_nn_kernel_tensor_t output = NULL;
vsi_nn_kernel_tensor_attr_t* output_attr;
+ VSI_UNREFERENCED(param_size);
+
output = (vsi_nn_kernel_tensor_t)param[CS_OUTPUT];
output_attr = vsi_nn_kernel_tensor_attr_create( output );
@@ -1073,6 +1080,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_L_initializer)
vsi_nn_kernel_tensor_t output = NULL;
vsi_nn_kernel_tensor_attr_t* output_attr;
+ VSI_UNREFERENCED(param_size);
+
output = (vsi_nn_kernel_tensor_t)param[L_OUTPUT];
output_attr = vsi_nn_kernel_tensor_attr_create( output );
@@ -1118,6 +1127,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_B_initializer)
vsi_nn_kernel_tensor_t output = NULL;
vsi_nn_kernel_tensor_attr_t* output_attr;
+ VSI_UNREFERENCED(param_size);
+
output = (vsi_nn_kernel_tensor_t)param[B_OUTPUT];
output_attr = vsi_nn_kernel_tensor_attr_create( output );
@@ -1164,6 +1175,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_S_initializer)
vsi_nn_kernel_tensor_t output = NULL;
vsi_nn_kernel_tensor_attr_t* output_attr;
+ VSI_UNREFERENCED(param_size);
+
output = (vsi_nn_kernel_tensor_t)param[S_OUTPUT];
output_attr = vsi_nn_kernel_tensor_attr_create( output );
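
The uint32_t casts added to LSTMUNIT_ACTIVATION_HASH_KEY keep the dtype fields of the packed key in unsigned arithmetic; building bit-packed keys from plain int operands risks shifting into or past the sign bit, which is undefined behaviour, and draws conversion warnings when the result is OR-ed with high-bit flags. Minimal illustration (hypothetical values, not from the source):

    #include <stdint.h>

    /* Composing a key field in unsigned arithmetic is always well defined. */
    static uint32_t make_field( uint32_t value, unsigned shift )
    {
        return value << shift;
    }
    /* The same shift on a signed int is undefined once the result would exceed
     * INT_MAX, e.g. 1 << 31, which is why key macros prefer explicit casts. */
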
diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c
index 5ff2a9308..de336c9ba 100644
--- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c
@@ -43,6 +43,7 @@ __BEGIN_DECLS
*/
#define KERNEL_SOURCE_1 "matrixmul"
#define KERNEL_SOURCE_2 "matrixmul_transA"
+#define KERNEL_SOURCE_3 "matrixmul_cross"
typedef enum
{
@@ -50,8 +51,8 @@ __BEGIN_DECLS
_3D
} vsi_nn_kernel_image_dim_type_e;
-#define HASH_MATRIXMUL_KEY(_input0_type, _input1_type, _output_type, _image_dim, _trans_a) \
- ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_dim << 4) | (_trans_a))
+#define HASH_MATRIXMUL_KEY(_type0, _type1, _type2, _image_dim, _trans_a, _cross) \
+ ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_image_dim << 4) | (_trans_a << 2) | (_cross))
#define HASH_MATRIXMUL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
@@ -62,21 +63,29 @@ __BEGIN_DECLS
#define HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \
CVIVANTE_NAMESPACE("cl.gemm_transb_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM)
+#define HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
+ CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_merge")
+
#define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
- { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0), \
+ { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0), \
HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
SOURCE },
#define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
- { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1), \
+ { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 0), \
HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
SOURCE },
#define TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
- { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2), \
+ { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2, 0), \
HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \
SOURCE },
+#define TENSOR_MATRIXMUL_MERGE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \
+ { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 2), \
+ HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
+ SOURCE },
+
static const struct {
uint32_t key;
char* function_name;
@@ -109,6 +118,9 @@ static const struct {
TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_2)
TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1)
TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1)
+ TENSOR_MATRIXMUL_MERGE_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_3)
+ TENSOR_MATRIXMUL_MERGE_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_3)
+ TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3)
};
/*
@@ -132,7 +144,27 @@ static vx_param_description_t _matrixmul_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
+static vx_param_description_t _matrixmul_merge_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+
#define _MATRIXMUL_PARAM_NUM _cnt_of_array(_matrixmul_kernel_param_def)
+#define _MATRIXMUL_MERGE_PARAM_NUM _cnt_of_array(_matrixmul_merge_kernel_param_def)
/*
* Kernel initializer
@@ -153,17 +185,40 @@ DEF_KERNEL_INITIALIZER(_matrixmul_initializer)
{0, 0, 0}
};
- vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
- vsi_ssize_t width = 0;
- vsi_ssize_t height = 0;
- vsi_ssize_t chn = 0;
+ vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
+ vsi_size_t width = 0;
+ vsi_size_t height = 0;
+ vsi_size_t chn = 0;
- attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
+ VSI_UNREFERENCED(param_size);
+
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
- width = attr[0]->shape->data[0];
- height = attr[0]->shape->data[1];
- chn = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1;
+ attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+
+ attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
+ CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
+
+ width = attr[2]->shape->data[0];
+ height = attr[2]->shape->data[1];
+ chn = attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1;
+
+ if (((attr[0]->shape->size == 4 && attr[1]->shape->size == 3) ||
+ (attr[0]->shape->size == 3 && attr[1]->shape->size == 4))
+ && attr[0]->shape->data[2] > 1 && attr[1]->shape->data[2] > 1
+ && chn == attr[0]->shape->data[2] * attr[1]->shape->data[2])
+ {
+ if (attr[0]->shape->size == 4)
+ {
+ chn = attr[1]->shape->data[2];
+ }
+ else
+ {
+ chn = attr[0]->shape->data[2];
+ }
+ }
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
@@ -184,6 +239,16 @@ DEF_KERNEL_INITIALIZER(_matrixmul_initializer)
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
+ if (attr[1])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[1] );
+ attr[1] = NULL;
+ }
+ if (attr[2])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[2] );
+ attr[2] = NULL;
+ }
return status;
} /* _matrixmul_initializer() */
@@ -193,7 +258,8 @@ static vsi_status _query_kernel
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_size_t depth,
- int32_t transa
+ int32_t transa,
+ int32_t cross
)
{
vsi_status status = VSI_FAILURE;
@@ -202,7 +268,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_nn_kernel_image_dim_type_e dim_type = _2D;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -251,7 +317,7 @@ static vsi_status _query_kernel
output_dtype = U8;
}
- key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa );
+ key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa, cross );
for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ )
{
@@ -264,8 +330,16 @@ static vsi_status _query_kernel
if ( i < _cnt_of_array(matrixmul_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrixmul_map[i].function_name );
- kernel->info.parameters = _matrixmul_kernel_param_def;
- kernel->info.numParams = _cnt_of_array( _matrixmul_kernel_param_def );
+ if (cross == 0)
+ {
+ kernel->info.parameters = _matrixmul_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _matrixmul_kernel_param_def );
+ }
+ else if (cross == 2)
+ {
+ kernel->info.parameters = _matrixmul_merge_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def );
+ }
kernel->info.initialize = _matrixmul_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
@@ -290,14 +364,17 @@ static vsi_nn_kernel_node_t _setup
)
{
vsi_status status = VSI_FAILURE;
- vsi_nn_kernel_node_param_t node_params[_MATRIXMUL_PARAM_NUM] = {NULL};
+ vsi_nn_kernel_node_param_t node_params[_MATRIXMUL_MERGE_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" );
int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" );
+ int32_t cross_flg = vsi_nn_kernel_param_get_int32( params, "cross_flg" );
int32_t transFlg = 0;
vsi_size_t M = inputs[0]->attr.size[1];
vsi_size_t K = inputs[0]->attr.size[0];
vsi_size_t N = inputs[1]->attr.size[0];
+ vsi_size_t a_depth = 0;
+ vsi_size_t b_depth = 0;
vsi_size_t depth = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1;
uint32_t ac2zero = 0;
uint32_t bc2zero = 0;
@@ -307,6 +384,10 @@ static vsi_nn_kernel_node_t _setup
float zp_b = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
float scale_out = vsi_nn_get_tensor_scale(outputs[0]);
float zp_out = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
+ int32_t outer = 0;
+
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
scale_out = 1 / scale_out;
@@ -329,28 +410,43 @@ static vsi_nn_kernel_node_t _setup
transFlg = 1;
}
- if ((inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) ||
- (inputs[0]->attr.size[2] > inputs[1]->attr.size[2]
- && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2))
+ a_depth = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
+ b_depth = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1;
+
+ if (b_depth == 1)
{
bc2zero = 1;
}
- else if ((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) ||
- (inputs[1]->attr.size[2] > inputs[0]->attr.size[2]
- && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2))
+ if (a_depth == 1)
+ {
+ ac2zero = 1;
+ }
+
+ if (inputs[0]->attr.dim_num == 4 && inputs[1]->attr.dim_num == 3
+ && a_depth > 1 && b_depth > 1 && cross_flg == 2)
{
ac2zero = 1;
+ bc2zero = 0;
+ outer = (int32_t)a_depth;
+ }
+ else if (inputs[1]->attr.dim_num == 4 && inputs[0]->attr.dim_num == 3
+ && a_depth > 1 && b_depth > 1 && cross_flg == 2)
+ {
+ ac2zero = 0;
+ bc2zero = 1;
+ outer = (int32_t)b_depth;
}
- status = _query_kernel( kernel, inputs, outputs, depth, transFlg );
+ status = _query_kernel( kernel, inputs, outputs, depth, transFlg, cross_flg );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 3;
+ size_t param_num = cross_flg == 2 ? _MATRIXMUL_MERGE_PARAM_NUM : _MATRIXMUL_PARAM_NUM;
/* Pass parameters to node. */
- vsi_nn_kernel_node_pack_io( node_params, _MATRIXMUL_PARAM_NUM,
+ vsi_nn_kernel_node_pack_io( node_params, param_num,
inputs, 2, outputs, 1 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &M );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &K );
@@ -363,8 +459,12 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_b );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_out );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_out );
+ if (cross_flg == 2)
+ {
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &outer );
+ }
/* Pass parameters to node. */
- status = vsi_nn_kernel_node_pass_param( node, node_params, _MATRIXMUL_PARAM_NUM );
+ status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
@@ -377,6 +477,10 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
+ if (cross_flg == 2)
+ {
+ vsi_nn_kernel_scalar_release( &node_params[14] );
+ }
}
}
return node;
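
The new cross (cross_flg == 2) path dispatches the merged gemm kernel when one operand carries a 4D batch, the other a 3D batch, and the output depth is their product. A worked example under assumed shapes, for illustration only:

    /* Illustrative: A = {K, M, 2, 1} (4D, a_depth = 2), B = {N, K, 3} (3D, b_depth = 3),
     * C = {N, M, 6}, so chn = 6 = a_depth * b_depth and cross_flg = 2 arrives via params.
     *   - the initializer narrows chn to b_depth = 3 for the GPU z range;
     *   - _setup sets ac2zero = 1, bc2zero = 0 and passes outer = a_depth = 2
     *     as the extra scalar so the merged kernel loops the remaining factor. */
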
diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c
index c81289ed6..3446fef8b 100644
--- a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c
@@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -190,7 +192,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -252,6 +254,10 @@ static vsi_nn_kernel_node_t _setup
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
diff --git a/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c
index 2311810e9..b8ecf2ae9 100644
--- a/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c
@@ -115,11 +115,13 @@ DEF_KERNEL_INITIALIZER(_maxpoolwithargmax_initializer)
{0, 0, 0}
};
- vx_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[1];
vsi_nn_kernel_tensor_attr_t * attr_out = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -159,7 +161,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output0_dtype = U8;
vsi_nn_kernel_dtype_e output1_dtype = I32;
uint32_t key = 0;
- int32_t i = 0;
+ size_t i = 0;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output0_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
diff --git a/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c b/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c
index 408164bfb..f4086a8e1 100644
--- a/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c
@@ -120,6 +120,8 @@ DEF_KERNEL_INITIALIZER(_maxunpool_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c
index 92a19a3e5..5d85656cb 100644
--- a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c
@@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -190,7 +192,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -252,6 +254,11 @@ static vsi_nn_kernel_node_t _setup
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+
+
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
diff --git a/src/tim/vx/internal/src/kernel/cl/mod_cl.c b/src/tim/vx/internal/src/kernel/cl/mod_cl.c
index 1398823d9..b6c50164a 100644
--- a/src/tim/vx/internal/src/kernel/cl/mod_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/mod_cl.c
@@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_mod_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c
index e5bae713e..4afda3666 100644
--- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c
@@ -224,6 +224,8 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
int32_t axis = 0;
int32_t axis_num = 1;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@@ -306,7 +308,9 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
+
+ VSI_UNREFERENCED(params);
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -381,6 +385,9 @@ static vsi_nn_kernel_node_t _setup
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float dim_ratio = (float)1.0 / (float)(width * height);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
axis_num = (int32_t)axis_num_temp;
if (axis_num == 1 && axis[0] == 0)
diff --git a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c
new file mode 100644
index 000000000..cc6d53800
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c
@@ -0,0 +1,401 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+typedef enum
+{
+ INTERNAL_KERNEL_NEAREST_GRID_SAMPLE,
+} _internal_kernel_e;
+
+#define _NEAREST_GRID_SAMPLE_KERNEL_SOURCE() "nearest_grid_sample"
+
+#define STR(a) #a
+
+// Add kernel hashtable here
+#define NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
+ ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE))
+
+#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
+ { \
+ NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
+ CVIVANTE_NAMESPACE("cl.nearest_grid_sample_" STR( \
+ IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
+ _NEAREST_GRID_SAMPLE_KERNEL_SOURCE() \
+ }
+
+typedef struct
+{
+ uint32_t key;
+ char * function_name;
+ const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _nearest_grid_sample_kernel_map[] =
+{
+ // Register kernel here
+ PACK_KERNEL_MAP(F32, F32, F32),
+ PACK_KERNEL_MAP(U8, U8, U8),
+};
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _nearest_grid_sample_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+
+#define _NEAREST_GRID_SAMPLE_PARAM_NUM 8
+#define _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM \
+ _cnt_of_array(_nearest_grid_sample_kernel_param_def)
+
+#define SCALAR_HALF_INPUT0_W (3)
+#define SCALAR_HALF_INPUT0_H (4)
+#define SCALAR_ADD_VALUE_W (5)
+#define SCALAR_ADD_VALUE_H (6)
+#define SCALAR_DEPTH (7)
+#define SCALAR_INPUT0_SCALE (8)
+#define SCALAR_INPUT0_TAIL (9)
+#define SCALAR_INPUT1_SCALE (10)
+#define SCALAR_INPUT1_TAIL (11)
+#define SCALAR_OUTPUT_SCALE (12)
+#define SCALAR_OUTPUT_TAIL (13)
+
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_nearest_grid_sample_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+ vsi_nn_kernel_tensor_attr_t* output_attr = NULL;
+ vsi_size_array_t* out_shape = NULL;
+
+ VSI_UNREFERENCED(param_size);
+
+ output_attr =
+ vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
+ CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final);
+
+ out_shape = output_attr->shape;
+
+ gpu_param.global_scale[0] = 1;
+ gpu_param.global_scale[1] = 1;
+ gpu_param.global_scale[2] = 1;
+
+ gpu_param.dim = 2;
+ gpu_param.global_size[0] =
+ (out_shape->data[0] + gpu_param.global_scale[0] - 1) /
+ gpu_param.global_scale[0];
+ gpu_param.global_size[1] =
+ ((out_shape->data[1] + gpu_param.global_scale[1] - 1) /
+ gpu_param.global_scale[1]);
+ gpu_param.global_size[2] = 1;
+ status = vsi_nn_kernel_gpu_config(node, &gpu_param);
+
+final:
+#define SAFE_FREE_TENSOR_ATTR(_PTR) \
+ if (_PTR) { \
+ vsi_nn_kernel_tensor_attr_release(&_PTR); \
+ _PTR = NULL; \
+ }
+ SAFE_FREE_TENSOR_ATTR(output_attr);
+ return status;
+} /* _nearest_grid_sample_initializer() */
+
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_kernel_t * kernel,
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs,
+ vsi_bool* is_use_u8_kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_dtype_e in0_dtype, in1_dtype;
+ vsi_nn_kernel_dtype_e out_dtype;
+ const _kernel_map_type * kernel_map = _nearest_grid_sample_kernel_map;
+ size_t kernel_map_size = _cnt_of_array( _nearest_grid_sample_kernel_map );
+ vx_param_description_t * param_def = _nearest_grid_sample_kernel_param_def;
+ size_t param_def_size =
+ _cnt_of_array(_nearest_grid_sample_kernel_param_def);
+ vx_kernel_initialize_f initializer = _nearest_grid_sample_initializer;
+
+ uint32_t key;
+ uint32_t i;
+
+ in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
+ in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type);
+ out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+ if (F16 == in0_dtype) {
+ in0_dtype = F32;
+ }
+ if (F16 == in1_dtype) {
+ in1_dtype = F32;
+ }
+ if (F16 == out_dtype) {
+ out_dtype = F32;
+ }
+ if ((U8 == in0_dtype) || (U8 == out_dtype)) {
+ param_def_size = _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM;
+ *is_use_u8_kernel = TRUE;
+ } else {
+ param_def_size = _NEAREST_GRID_SAMPLE_PARAM_NUM;
+ *is_use_u8_kernel = FALSE;
+ }
+
+ key = NEAREST_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype);
+
+ for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ {
+ if ( kernel_map[i].key == key )
+ {
+ break;
+ }
+ }
+ if ( i < (uint32_t)kernel_map_size )
+ {
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+ kernel->info.parameters = param_def;
+ kernel->info.numParams = (uint32_t)param_def_size;
+ kernel->info.initialize = initializer;
+ // Register code source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
+ kernel_map[i].source_name );
+ // Register binary source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ kernel_map[i].source_name );
+ status = VSI_SUCCESS;
+ }
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t node_params[_NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM];
+ vsi_nn_kernel_node_t node = NULL;
+ vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
+ uint32_t final_in1_rank = 0;
+ vsi_nn_tensor_t* rs_tensors = NULL;
+ vsi_nn_tensor_t* final_tensors[3] = {NULL};
+ vsi_size_t in0_width = inputs[0]->attr.size[0];
+ vsi_size_t in0_height = inputs[0]->attr.size[1];
+ float input0_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
+ float input0_scale = vsi_nn_get_tensor_scale(inputs[0]);
+ float input0_tail = -(input0_zp * input0_scale);
+ float input1_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
+ float input1_scale = vsi_nn_get_tensor_scale(inputs[1]);
+ float input1_tail = -(input1_zp * input1_scale);
+ float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
+ float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
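+    /* Quantization affines, assuming the usual asymmetric scheme
+       real = scale * (q - zp): the *_tail values fold -zp * scale into a
+       single multiply-add, and output_scale is the reciprocal output scale so
+       the kernel can requantize as q_out = real * output_scale + output_zp. */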
+ vsi_bool is_use_u8_kernel = FALSE;
+ int32_t align_corners =
+ vsi_nn_kernel_param_get_int32(params, "align_corners");
+ uint32_t pad_val = 0;
+ int32_t depth = 0;
+ vsi_nn_kernel_dtype_e in0_dtype;
+ float half_input0_w, half_input0_h, add_float_value_w, add_float_value_h;
+
+ // Check if gpu can support the size
+ if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
+ inputs[0]->attr.dim_num)) {
+ return NULL;
+ }
+
+ if (!vsi_nn_kernel_gpu_check_shape(inputs[1]->attr.size,
+ inputs[1]->attr.dim_num)) {
+ return NULL;
+ }
+
+ final_tensors[0] = inputs[0];
+ if (inputs[1]->attr.dim_num >= 3) {
+ final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0];
+ final_shape[1] = inputs[1]->attr.size[2];
+ final_shape[2] = 1;
+ final_shape[3] =
+ inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1;
+ final_in1_rank =
+ inputs[1]->attr.dim_num == 3 ? 2 : inputs[1]->attr.dim_num;
+ if (!vsi_nn_kernel_gpu_check_shape(final_shape, final_in1_rank)) {
+ return NULL;
+ }
+
+ rs_tensors = vsi_nn_reshape_tensor(
+ graph, inputs[1], final_shape, final_in1_rank);
+ final_tensors[1] = rs_tensors;
+ } else {
+ final_tensors[1] = inputs[1];
+ }
+ final_tensors[2] = outputs[0];
+
+ if (align_corners) {
+ half_input0_w = ((float)in0_width - 1.0f) * 0.5f;
+ half_input0_h = ((float)in0_height - 1.0f) * 0.5f;
+ add_float_value_w = half_input0_w;
+ add_float_value_h = half_input0_h;
+ } else {
+ half_input0_w = (float)in0_width * 0.5f;
+ half_input0_h = (float)in0_height * 0.5f;
+ add_float_value_w = half_input0_w - 0.5f;
+ add_float_value_h = half_input0_h - 0.5f;
+ }
+
+ add_float_value_w = add_float_value_w + 0.5f;
+ add_float_value_h = add_float_value_h + 0.5f;
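+    /* Grid values arrive normalized to [-1, 1] and are mapped to pixel space
+       as x_pix = x_grid * half_input0_w + add_float_value_w (assuming the
+       standard grid_sample convention): (x+1)*(W-1)/2 with align_corners,
+       ((x+1)*W-1)/2 otherwise; the extra 0.5 lets the kernel pick the nearest
+       sample with a plain floor. */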
+
+ depth = (int32_t)inputs[0]->attr.size[2];
+ in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
+ if (U8 == in0_dtype) {
+ pad_val = inputs[0]->attr.dtype.zero_point;
+ }
+
+ status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel);
+ if ( VSI_SUCCESS == status)
+ {
+ node = vsi_nn_kernel_create_node(graph, kernel);
+ if (node) {
+ size_t node_params_num = _NEAREST_GRID_SAMPLE_PARAM_NUM;
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io(node_params,
+ _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM,
+ final_tensors,
+ input_num,
+ &final_tensors[2],
+ output_num);
+ node_params[SCALAR_HALF_INPUT0_W] =
+ vsi_nn_kernel_scalar_create(graph, F32, &half_input0_w);
+ node_params[SCALAR_HALF_INPUT0_H] =
+ vsi_nn_kernel_scalar_create(graph, F32, &half_input0_h);
+ node_params[SCALAR_ADD_VALUE_W] =
+ vsi_nn_kernel_scalar_create(graph, F32, &add_float_value_w);
+ node_params[SCALAR_ADD_VALUE_H] =
+ vsi_nn_kernel_scalar_create(graph, F32, &add_float_value_h);
+ node_params[SCALAR_DEPTH] =
+ vsi_nn_kernel_scalar_create(graph, I32, &depth);
+ if (is_use_u8_kernel) {
+ node_params[SCALAR_INPUT0_SCALE] =
+ vsi_nn_kernel_scalar_create(graph, F32, &input0_scale);
+ node_params[SCALAR_INPUT0_TAIL] =
+ vsi_nn_kernel_scalar_create(graph, F32, &input0_tail);
+ node_params[SCALAR_INPUT1_SCALE] =
+ vsi_nn_kernel_scalar_create(graph, F32, &input1_scale);
+ node_params[SCALAR_INPUT1_TAIL] =
+ vsi_nn_kernel_scalar_create(graph, F32, &input1_tail);
+ node_params[SCALAR_OUTPUT_SCALE] =
+ vsi_nn_kernel_scalar_create(graph, F32, &output_scale);
+ node_params[SCALAR_OUTPUT_TAIL] =
+ vsi_nn_kernel_scalar_create(graph, F32, &output_zp);
+ node_params_num = _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM;
+ }
+ /* Pass parameters to node. */
+ status = vsi_nn_kernel_node_pass_param(
+ node, node_params, node_params_num);
+ VSI_ASSERT(status == VSI_SUCCESS);
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_HALF_INPUT0_W]);
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_HALF_INPUT0_H]);
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_W]);
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_H]);
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_DEPTH]);
+ if (is_use_u8_kernel) {
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_SCALE]);
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_TAIL]);
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_SCALE]);
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_TAIL]);
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_SCALE]);
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_TAIL]);
+ }
+ {
+ // Set default border mode.
+ vx_border_t border;
+ border.mode = VX_BORDER_CONSTANT;
+ border.constant_value.U32 = pad_val;
+ status = vxSetNodeAttribute(
+ (vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
+ CHECK_STATUS(status);
+ }
+ }
+ }
+
+ vsi_safe_release_tensor(rs_tensors);
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_CL( nearest_grid_sample, _setup )
+
diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c
index 4369beaf6..a66b89b3e 100644
--- a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c
@@ -121,6 +121,8 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * in_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -234,6 +236,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
int32_t i = 0;
+ size_t j = 0;
vsi_size_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr);
vsi_size_t prefix_dim_size = 1;
vsi_size_t suffix_dim_size = 0;
@@ -320,11 +323,11 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_ReleaseTensor( &rs_tensors[1] );
}
- for (i = SCALAR_INPUT_DEPTH; i < _ONE_HOT_PARAM_NUM; i++)
+ for (j = SCALAR_INPUT_DEPTH; j < _ONE_HOT_PARAM_NUM; j++)
{
- if (node_params[i])
+ if (node_params[j])
{
- vsi_nn_kernel_scalar_release( &node_params[i] );
+ vsi_nn_kernel_scalar_release( &node_params[j] );
}
}
diff --git a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c
index 558a1e0d1..18468ae5c 100644
--- a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c
@@ -111,12 +111,14 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer)
{0, 0, 0}
};
- vx_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[1];
vsi_nn_kernel_tensor_attr_t * attr_out = NULL;
vsi_size_array_t * out_shape = NULL;
vsi_bool image_2d = FALSE;
+ VSI_UNREFERENCED(param_size);
+
attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/pow_cl.c b/src/tim/vx/internal/src/kernel/cl/pow_cl.c
index 1d1020d7a..6a38b4e85 100644
--- a/src/tim/vx/internal/src/kernel/cl/pow_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/pow_cl.c
@@ -126,6 +126,8 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -180,7 +182,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -248,6 +250,10 @@ static vsi_nn_kernel_node_t _setup
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+
outputScale = 1.0f / outputScale;
inputTail = -(inputTail * inputScale);
diff --git a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c
index 609c90e18..87c8593a3 100644
--- a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c
@@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -190,7 +192,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -248,6 +250,9 @@ static vsi_nn_kernel_node_t _setup
float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
int32_t is_per_channel_alpha = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha");
if (is_per_channel_alpha)
diff --git a/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c
index 696303b21..7e4504008 100644
--- a/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -153,6 +152,8 @@ DEF_KERNEL_INITIALIZER(_multinomial_initializer)
vsi_nn_kernel_tensor_attr_t * attr = NULL;
vsi_size_array_t * in_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
@@ -196,6 +197,8 @@ DEF_KERNEL_INITIALIZER(_cdf_initializer)
vsi_size_array_t * in_shape = NULL;
vsi_size_t batch = 0;
+ VSI_UNREFERENCED(param_size);
+
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
@@ -235,6 +238,9 @@ DEF_KERNEL_INITIALIZER(_seed_initializer)
{0, 0, 0}
};
+ VSI_UNREFERENCED(param);
+ VSI_UNREFERENCED(param_size);
+
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] = 1;
@@ -351,6 +357,10 @@ static vsi_nn_kernel_node_t _setup
float rand_max = (float)(pow(2.0,32));
float re_rand_max = 1 / rand_max;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+
// Check if gpu can support the size
if( !vsi_nn_kernel_gpu_check_shape(
outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
@@ -370,17 +380,20 @@ static vsi_nn_kernel_node_t _setup
attr.is_const = FALSE;
attr.vtl = TRUE;
tensors[SEED_INDEX] = vsi_nn_CreateTensor( graph, &attr );
+ CHECK_PTR_FAIL_GOTO(tensors[SEED_INDEX], "Create tensor failed", final);
attr.size[0] = inputs[0]->attr.size[0];
attr.size[1] = inputs[0]->attr.size[1];
attr.dim_num = 2;
tensors[CDF_INDEX] = vsi_nn_CreateTensor( graph, &attr );
+ CHECK_PTR_FAIL_GOTO(tensors[CDF_INDEX], "Create tensor failed", final);
memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) );
attr.size[1] = 1;
attr.dim_num = 2;
tensors[SEEDS_INDEX] = vsi_nn_reshape_tensor( graph,
inputs[1], attr.size, attr.dim_num );
+ CHECK_PTR_FAIL_GOTO(tensors[SEEDS_INDEX], "Create tensor failed", final);
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
diff --git a/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c
index 9b92246fd..aa2a45c89 100644
--- a/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c
@@ -105,6 +105,8 @@ DEF_KERNEL_INITIALIZER(_reduceall_internal_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c
index b347758c1..b5ff4e262 100644
--- a/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c
@@ -105,6 +105,8 @@ DEF_KERNEL_INITIALIZER(_reduceany_internal_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c
index 05a867406..5ee818064 100644
--- a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c
@@ -120,6 +120,8 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c
index 50a502565..ba31ed9fe 100644
--- a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c
@@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c
index 8d1b7c0dd..b04a246a5 100644
--- a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c
@@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t * output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c
index 8cfd331fa..1ea137bdc 100644
--- a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c
@@ -126,6 +126,8 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/repeat_cl.c b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c
index c2f28dda7..d40ae1f26 100644
--- a/src/tim/vx/internal/src/kernel/cl/repeat_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c
@@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_repeat_initializer)
int32_t is1d = 0;
int32_t axis = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &axis);
@@ -190,7 +192,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype = U8;
int32_t is1d = inputs[0]->attr.dim_num == 1 ? 1 : 0;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -308,6 +310,9 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t height = inputs[0]->attr.dim_num > 1 ? inputs[0]->attr.size[1] : 1;
vsi_size_t channel = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c
index fda7acdc9..d9b18e718 100644
--- a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c
@@ -116,6 +116,8 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c
index eef5bec37..8868565f9 100644
--- a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c
@@ -117,6 +117,8 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/resize_3d_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_3d_bilinear_cl.c
new file mode 100644
index 000000000..77afbc1ca
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/cl/resize_3d_bilinear_cl.c
@@ -0,0 +1,329 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "libnnext/vx_lib_nnext.h"
+
+__BEGIN_DECLS
+
+#define _RESIZE_3D_BILINEAR_KERNEL_SOURCE() "resize_3d_bilinear"
+
+#define STR(a) #a
+// Add kernel hashtable here
+#define RESIZE_3D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
+ (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) )
+
+#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { RESIZE_3D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
+ CVIVANTE_NAMESPACE("cl.resize_3d_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
+ _RESIZE_3D_BILINEAR_KERNEL_SOURCE() }
+
+typedef struct
+{
+ uint32_t key;
+ char * function_name;
+ const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _resize_3d_bilinear_kernel_map[] =
+{
+ PACK_KERNEL_MAP( F32, F32),
+ PACK_KERNEL_MAP( F32, U8),
+ PACK_KERNEL_MAP( U8, F32),
+ PACK_KERNEL_MAP( U8, U8),
+ PACK_KERNEL_MAP( I8, I8),
+    PACK_KERNEL_MAP( BF16, BF16),
+};
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _resize_3d_bilinear_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+
+
+#define RESIZE_3D_BILINEAR_NUM _cnt_of_array( _resize_3d_bilinear_kernel_param_def )
+
+
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_resize_3d_bilinear_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t gpu_param = {
+ 3,
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0}
+ };
+ vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
+ vsi_size_array_t * out_shape = NULL;
+
+ VSI_UNREFERENCED(param_size);
+
+ output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
+
+ out_shape = output_attr->shape;
+
+ gpu_param.global_scale[0] = 1;
+ gpu_param.global_scale[1] = 1;
+ gpu_param.global_scale[2] = 1;
+
+ gpu_param.dim = 3;
+ gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1)
+ / gpu_param.global_scale[0];
+ gpu_param.global_size[1] = (
+ (out_shape->data[1] + gpu_param.global_scale[1] - 1)
+ / gpu_param.global_scale[1]);
+ gpu_param.global_size[2] = out_shape->data[2];
+ status = vsi_nn_kernel_gpu_config( node, &gpu_param );
+
+final:
+#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
+ SAFE_FREE_TENSOR_ATTR(output_attr);
+ return status;
+} /* _resize_3d_bilinear_initializer() */
+
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_kernel_t * kernel,
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_dtype_e in_dtype;
+ vsi_nn_kernel_dtype_e out_dtype;
+ const _kernel_map_type * kernel_map = _resize_3d_bilinear_kernel_map;
+ size_t kernel_map_size = _cnt_of_array( _resize_3d_bilinear_kernel_map );
+ vx_param_description_t * param_def = _resize_3d_bilinear_kernel_param_def;
+ size_t param_def_size = _cnt_of_array( _resize_3d_bilinear_kernel_param_def );
+ vx_kernel_initialize_f initializer = _resize_3d_bilinear_initializer;
+
+ uint32_t key;
+ uint32_t i;
+
+ in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+ out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+ if (F16 == in_dtype)
+ {
+ in_dtype = F32;
+ }
+ if (F16 == out_dtype)
+ {
+ out_dtype = F32;
+ }
+
+ if (I16 == in_dtype)
+ {
+ in_dtype = I8;
+ }
+ if (I16 == out_dtype)
+ {
+ out_dtype = I8;
+ }
+
+ key = RESIZE_3D_BILINEAR_HASH_KEY( in_dtype, out_dtype );
+
+ for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ {
+ if( kernel_map[i].key == key )
+ {
+ break;
+ }
+ }
+    if( i < (uint32_t)kernel_map_size )
+ {
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+ kernel->info.parameters = param_def;
+ kernel->info.numParams = (uint32_t)param_def_size;
+ kernel->info.initialize = initializer;
+ // Register code source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
+ kernel_map[i].source_name );
+ // Register binary source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ kernel_map[i].source_name );
+ status = VSI_SUCCESS;
+ }
+
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t node_params[RESIZE_3D_BILINEAR_NUM] = {NULL};
+ vsi_nn_kernel_node_t node = NULL;
+ int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
+ int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
+ vsi_size_t in_width = inputs[0]->attr.size[0];
+ vsi_size_t in_height = inputs[0]->attr.size[1];
+ vsi_size_t in_depth = inputs[0]->attr.size[2];
+ vsi_size_t out_width = outputs[0]->attr.size[0];
+ vsi_size_t out_height = outputs[0]->attr.size[1];
+ vsi_size_t out_depth = outputs[0]->attr.size[2];
+ float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
+ float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
+ float input_tail = -(input_zp * input_scale);
+ float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
+ float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
+ float half_pixel_value = 0.0f;
+ float scale_factor_x = 0.0f;
+ float scale_factor_y = 0.0f;
+ float scale_factor_z = 0.0f;
+
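+    /* Scale factors follow the common resize convention (assumed from the
+       TF/ONNX definition): (in - 1) / (out - 1) with align_corners, plain
+       in / out otherwise; half_pixel_value below supplies the 0.5 offset for
+       half-pixel-centers sampling. */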
+ if (align_corners && out_width > 1)
+ {
+ scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
+ }
+ else
+ {
+ scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
+ }
+
+ if (align_corners && out_height > 1)
+ {
+ scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1);
+ }
+ else
+ {
+ scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height;
+ }
+
+ if (align_corners && out_depth > 1)
+ {
+ scale_factor_z = ((vx_float32)(in_depth - 1) * 1.0f) / (vx_float32)(out_depth - 1);
+ }
+ else
+ {
+ scale_factor_z = ((vx_float32)in_depth * 1.0f) / (vx_float32)out_depth;
+ }
+
+ if (half_pixel_centers)
+ {
+ half_pixel_value = 0.5f;
+ }
+ else
+ {
+ half_pixel_value = 0.0f;
+ }
+
+
+ status = _query_kernel( kernel, inputs, outputs );
+ if( VSI_SUCCESS == status)
+ {
+ node = vsi_nn_kernel_create_node( graph, kernel );
+ if( node )
+ {
+ size_t node_params_num = RESIZE_3D_BILINEAR_NUM;
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io( node_params, RESIZE_3D_BILINEAR_NUM,
+ inputs, input_num, outputs, output_num );
+ node_params[2] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x );
+ node_params[3] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_y );
+ node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_z );
+ node_params[5] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value );
+ node_params[6] = vsi_nn_kernel_scalar_create( graph, U32, &in_width );
+ node_params[7] = vsi_nn_kernel_scalar_create( graph, U32, &in_height );
+ node_params[8] = vsi_nn_kernel_scalar_create( graph, U32, &in_depth );
+ node_params[9] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
+ node_params[10] = vsi_nn_kernel_scalar_create( graph, F32, &input_tail );
+ node_params[11] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
+ node_params[12] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
+
+ /* Pass parameters to node. */
+ status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
+ VSI_ASSERT( status == VSI_SUCCESS );
+ vsi_nn_kernel_scalar_release( &node_params[2] );
+ vsi_nn_kernel_scalar_release( &node_params[3] );
+ vsi_nn_kernel_scalar_release( &node_params[4] );
+ vsi_nn_kernel_scalar_release( &node_params[5] );
+ vsi_nn_kernel_scalar_release( &node_params[6] );
+ vsi_nn_kernel_scalar_release( &node_params[7] );
+ vsi_nn_kernel_scalar_release( &node_params[8] );
+ vsi_nn_kernel_scalar_release( &node_params[9] );
+ vsi_nn_kernel_scalar_release( &node_params[10] );
+ vsi_nn_kernel_scalar_release( &node_params[11] );
+ vsi_nn_kernel_scalar_release( &node_params[12] );
+ }
+ }
+
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_CL( resize_3d_bilinear, _setup )
diff --git a/src/tim/vx/internal/src/kernel/cl/resize_3d_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_3d_nearest_cl.c
new file mode 100644
index 000000000..b0e6138c7
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/cl/resize_3d_nearest_cl.c
@@ -0,0 +1,332 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "libnnext/vx_lib_nnext.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+typedef enum
+{
+ INTERNAL_KERNEL_RESIZE_3D_NEAREST,
+} _internal_kernel_e;
+
+#define _RESIZE_3D_NEAREST_KERNEL_SOURCE "resize_3d_nearest"
+
+#define STR(a) #a
+// Add kernel hashtable here
+#define RESIZE_3D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
+ (( IN_DTYPE << 8 ) | ( OUT_DTYPE ))
+
+#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { RESIZE_3D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
+ CVIVANTE_NAMESPACE("cl.resize_3d_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
+ _RESIZE_3D_NEAREST_KERNEL_SOURCE }
+
+typedef struct
+{
+ uint32_t key;
+ char * function_name;
+ const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _resize_3d_nearest_kernel_map[] =
+{
+ PACK_KERNEL_MAP( F32, F32),
+ PACK_KERNEL_MAP( F32, U8),
+ PACK_KERNEL_MAP( U8, F32),
+ PACK_KERNEL_MAP( U8, U8),
+ PACK_KERNEL_MAP( I8, I8),
+    PACK_KERNEL_MAP( BF16, BF16),
+};
+
+
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _resize_3d_nearest_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+
+#define _RESIZE_3D_NEAREST_PARAM_NUM _cnt_of_array( _resize_3d_nearest_kernel_param_def )
+
+
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_resize_3d_nearest_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t gpu_param = {
+ 3,
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0}
+ };
+ vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
+ vsi_size_array_t * out_shape = NULL;
+
+ VSI_UNREFERENCED(param_size);
+
+ output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
+
+ out_shape = output_attr->shape;
+
+ gpu_param.global_scale[0] = 1;
+ gpu_param.global_scale[1] = 1;
+ gpu_param.global_scale[2] = 1;
+
+ gpu_param.dim = 3;
+ gpu_param.global_size[0] = gpu_align_p2(
+ (out_shape->data[0] + gpu_param.global_scale[0] - 1)
+ / gpu_param.global_scale[0], 4);
+ gpu_param.global_size[1] = (
+ (out_shape->data[1] + gpu_param.global_scale[1] - 1)
+ / gpu_param.global_scale[1]);
+ gpu_param.global_size[2] = out_shape->data[2];
+ status = vsi_nn_kernel_gpu_config( node, &gpu_param );
+
+final:
+#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
+ SAFE_FREE_TENSOR_ATTR(output_attr);
+ return status;
+} /* _resize_3d_nearest_initializer() */
+
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_kernel_t * kernel,
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_dtype_e in_dtype;
+ vsi_nn_kernel_dtype_e out_dtype;
+ const _kernel_map_type * kernel_map = _resize_3d_nearest_kernel_map;
+ size_t kernel_map_size = _cnt_of_array( _resize_3d_nearest_kernel_map );
+ vx_param_description_t * param_def = _resize_3d_nearest_kernel_param_def;
+ size_t param_def_size = _cnt_of_array( _resize_3d_nearest_kernel_param_def );
+ vx_kernel_initialize_f initializer = _resize_3d_nearest_initializer;
+
+ uint32_t key;
+ uint32_t i;
+
+ in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+ out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+ if (F16 == in_dtype)
+ {
+ in_dtype = F32;
+ }
+ if (F16 == out_dtype)
+ {
+ out_dtype = F32;
+ }
+
+ if (I16 == in_dtype)
+ {
+ in_dtype = I8;
+ }
+ if (I16 == out_dtype)
+ {
+ out_dtype = I8;
+ }
+
+ key = RESIZE_3D_NEAREST_HASH_KEY( in_dtype, out_dtype );
+
+ for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ {
+ if( kernel_map[i].key == key )
+ {
+ break;
+ }
+ }
+ if( i < (uint32_t)kernel_map_size )
+ {
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+ kernel->info.parameters = param_def;
+ kernel->info.numParams = (uint32_t)param_def_size;
+ kernel->info.initialize = initializer;
+ // Register code source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
+ kernel_map[i].source_name );
+ // Register binary source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ kernel_map[i].source_name );
+ status = VSI_SUCCESS;
+ }
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t node_params[_RESIZE_3D_NEAREST_PARAM_NUM] = {NULL};
+ vsi_nn_kernel_node_t node = NULL;
+ int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
+ int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
+ vsi_size_t in_width = inputs[0]->attr.size[0];
+ vsi_size_t in_height = inputs[0]->attr.size[1];
+ vsi_size_t in_depth = inputs[0]->attr.size[2];
+ vsi_size_t out_width = outputs[0]->attr.size[0];
+ vsi_size_t out_height = outputs[0]->attr.size[1];
+ vsi_size_t out_depth = outputs[0]->attr.size[2];
+ float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
+ float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
+ float output_scale = input_scale / vsi_nn_get_tensor_scale(outputs[0]);
+ float output_tail = (float)vsi_nn_get_tensor_zero_point(outputs[0]) - input_zp * output_scale;
+ float half_pixel_value = 0.0f;
+ float round_value = 0.0f;
+ float scale_factor_x = 0.0f;
+ float scale_factor_y = 0.0f;
+ float scale_factor_z = 0.0f;
+
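+    /* Same scale-factor convention as the bilinear variant. round_value makes
+       the kernel round to the nearest source index with align_corners and
+       truncate otherwise, while output_scale/output_tail fold the input->output
+       requantization into a single multiply-add. */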
+ if (align_corners && out_width > 1)
+ {
+ scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
+ }
+ else
+ {
+ scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
+ }
+
+ if (align_corners && out_height > 1)
+ {
+ scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1);
+ }
+ else
+ {
+ scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height;
+ }
+
+ if (align_corners && out_depth > 1)
+ {
+ scale_factor_z = ((vx_float32)(in_depth - 1) * 1.0f) / (vx_float32)(out_depth - 1);
+ }
+ else
+ {
+ scale_factor_z = ((vx_float32)in_depth * 1.0f) / (vx_float32)out_depth;
+ }
+
+ if (align_corners)
+ {
+ round_value = 0.5f;
+ }
+ else
+ {
+ round_value = 0.0f;
+ }
+
+ if (half_pixel_centers)
+ {
+ half_pixel_value = 0.5f;
+ }
+ else
+ {
+ half_pixel_value = 0.0f;
+ }
+
+ status = _query_kernel( kernel, inputs, outputs );
+ if( VSI_SUCCESS == status)
+ {
+ node = vsi_nn_kernel_create_node( graph, kernel );
+ if( node )
+ {
+ size_t node_params_num = _RESIZE_3D_NEAREST_PARAM_NUM;
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io( node_params, _RESIZE_3D_NEAREST_PARAM_NUM,
+ inputs, input_num, outputs, output_num );
+ node_params[2] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x );
+ node_params[3] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_y );
+ node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_z );
+ node_params[5] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value );
+ node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &round_value );
+ node_params[7] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
+            node_params[8] = vsi_nn_kernel_scalar_create( graph, F32, &output_tail );
+
+ /* Pass parameters to node. */
+ status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
+ vsi_nn_kernel_scalar_release( &node_params[2] );
+ vsi_nn_kernel_scalar_release( &node_params[3] );
+ vsi_nn_kernel_scalar_release( &node_params[4] );
+ vsi_nn_kernel_scalar_release( &node_params[5] );
+ vsi_nn_kernel_scalar_release( &node_params[6] );
+ vsi_nn_kernel_scalar_release( &node_params[7] );
+ vsi_nn_kernel_scalar_release( &node_params[8] );
+ }
+ }
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_CL( resize_3d_nearest, _setup )
diff --git a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c
index a9c0285fb..60fbda3eb 100644
--- a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c
@@ -115,6 +115,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c
index d61abcf30..1ca6ba9f1 100644
--- a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c
@@ -121,6 +121,8 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c b/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c
index cb9cdcd19..10b3855d2 100644
--- a/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c
@@ -124,6 +124,8 @@ DEF_KERNEL_INITIALIZER(_reversesequence_initializer)
vsi_nn_kernel_tensor_attr_t *input_attr = NULL;
vsi_size_array_t *input_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input );
CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -161,16 +163,16 @@ static vsi_status _query_kernel
int32_t batch_axis
)
{
- vsi_status status = VSI_FAILURE;
- vsi_nn_kernel_dtype_e in_dtype;
- vsi_nn_kernel_dtype_e out_dtype;
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_dtype_e in_dtype = 0;
+ vsi_nn_kernel_dtype_e out_dtype = 0;
const _kernel_map_type * kernel_map = _reversesequence_kernel_map;
size_t kernel_map_size = _cnt_of_array( _reversesequence_kernel_map );
vx_param_description_t * param_def = _reversesequence_kernel_param_def;
vx_kernel_initialize_f initializer = _reversesequence_initializer;
vsi_nn_kernel_batch_axis_type_e axis_type = _axis1;
- uint32_t key;
- uint32_t i;
+ uint32_t key = 0;
+ size_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -223,7 +225,7 @@ static vsi_status _query_kernel
break;
}
- for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ for ( i = 0; i < kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
@@ -272,6 +274,13 @@ static vsi_nn_kernel_node_t _setup
float inoutScale = inputScale / outputScale;
float inoutTail = outputTail - inputTail * inoutScale;
+ vsi_nn_kernel_tensor_t reshape_tensor = NULL;
+ vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = {1};
+ uint32_t new_rank = 2;
+
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
@@ -280,6 +289,11 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
+ shapes[0] = inputs[1]->attr.size[0];
+ shapes[1] = 1;
+
+ reshape_tensor = vsi_nn_kernel_tensor_reshape(inputs[1]->t, shapes, new_rank);
+
status = _query_kernel( kernel, inputs, outputs, batch_axis );
if ( VSI_SUCCESS == status)
{
@@ -287,9 +301,10 @@ static vsi_nn_kernel_node_t _setup
if ( node )
{
/* Set inputs and outputs */
- uint32_t index = 3;
- vsi_nn_kernel_node_pack_io( node_params, _REVERSESEQUENCE_PARAM_NUM,
- inputs, input_num, outputs, output_num );
+ uint32_t index = 0;
+ node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
+ node_params[index++] = reshape_tensor;
+ node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t;
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inoutScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inoutTail );
/* Pass parameters to node. */
@@ -298,6 +313,11 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[4] );
}
}
+
+ if (reshape_tensor)
+ {
+ vsi_nn_kernel_tensor_release( &reshape_tensor );
+ }
return node;
} /* _setup() */
diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c
index e897d0f78..9cf2818a6 100644
--- a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c
@@ -134,6 +134,8 @@ DEF_KERNEL_INITIALIZER(_roi_align_initializer)
vsi_size_array_t * rois_shape = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
rois_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( rois_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c
index 2be6a78da..fec2f3b69 100644
--- a/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c
@@ -155,6 +155,8 @@ DEF_KERNEL_INITIALIZER(_scatter_elements_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c
index d409c4c45..e56d37dde 100644
--- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c
@@ -183,6 +183,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer)
vsi_ssize_t block_size = 0;
vsi_ssize_t height = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@@ -222,7 +224,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_nn_kernel_coord_type_e coord_type = _1D;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -290,6 +292,9 @@ static vsi_nn_kernel_node_t _setup
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
vsi_size_t width = 0, area = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if (coord_dim > 3)
{
return NULL;
diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c
index d5f2867bd..94c4fa330 100644
--- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c
@@ -188,6 +188,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer)
vsi_ssize_t block_size = 0;
vsi_ssize_t height = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@@ -227,7 +229,9 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input2_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
+
+ VSI_UNREFERENCED(coord_dim);
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
@@ -284,6 +288,9 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t *input_size = inputs[2]->attr.size;
uint32_t dims_num = inputs[2]->attr.dim_num;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if (coord_dim > 4 && input_size[dims_num - 1] > 1)
{
return NULL;
diff --git a/src/tim/vx/internal/src/kernel/cl/select_cl.c b/src/tim/vx/internal/src/kernel/cl/select_cl.c
index 53b1fcdd9..ab449010a 100644
--- a/src/tim/vx/internal/src/kernel/cl/select_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/select_cl.c
@@ -35,6 +35,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
+#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
@@ -62,6 +63,10 @@ typedef enum _internal_img_dim_e
CVIVANTE_NAMESPACE("cl.select_"STR(COND_DTYPE)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
_SELECT_KERNEL_SOURCE}
+#define _INPUT_NUM (3)
+#define _OUTPUT_NUM (1)
+#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
+
typedef struct
{
uint32_t key;
@@ -111,7 +116,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
size_t param_size
)
{
- vsi_status status = VX_SUCCESS;
+ vsi_status status = VSI_FAILURE;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
3,
@@ -125,6 +130,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -247,19 +254,73 @@ static vsi_nn_kernel_node_t _setup
float input1Scale = vsi_nn_get_tensor_scale(inputs[2]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[2]);
+ vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL };
+ vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
+ vsi_size_t* shapes_ptr[_IO_NUM];
+ vsi_size_t* shapes_in[_INPUT_NUM];
+ vsi_size_t rank_in[_INPUT_NUM];
+ uint32_t new_rank = 0;
+ uint32_t i = 0;
+ vsi_bool ret = FALSE;
+
+ VSI_UNREFERENCED(params);
+
input0Scale = input0Scale / outputScale;
input1Scale = input1Scale / outputScale;
input0Tail = outputZP - input0Tail * input0Scale;
input1Tail = outputZP - input1Tail * input1Scale;
- if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
- outputs[0]->attr.dim_num ) )
+
+ for (i = 0; i < _IO_NUM; i++)
+ {
+ shapes_ptr[i] = shapes[i];
+ }
+
+ for (i = 0; i < _INPUT_NUM; i++)
+ {
+ shapes_in[i] = inputs[i]->attr.size;
+ rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num;
+ }
+
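+    /* Collapse the condition/input/output shapes into a common
+       broadcast-compatible form (merging contiguous dims) so the CL kernel
+       only sees low-rank tensors; if no such reshape is found the original
+       tensors are used as-is. */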
+ ret = vsi_nn_kernel_optimize_broadcast_shape(
+ (const vsi_size_t**)shapes_in, rank_in, _INPUT_NUM,
+ outputs[0]->attr.size, outputs[0]->attr.dim_num,
+ shapes_ptr, shapes[_INPUT_NUM], &new_rank);
+
+ if ( ret )
+ {
+ for (i = 0; i < _INPUT_NUM; i++)
+ {
+ reshape_tensors[i] = vsi_nn_reshape_tensor( graph,
+ inputs[i], shapes[i], new_rank );
+ }
+
+ for (i = 0; i < _OUTPUT_NUM; i++)
+ {
+ reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( graph,
+ outputs[i], shapes[i + _INPUT_NUM], new_rank );
+ }
+ }
+ else
+ {
+ for (i = 0; i < _INPUT_NUM; i++)
+ {
+ reshape_tensors[i] = inputs[i];
+ }
+ for (i = 0; i < _OUTPUT_NUM; i++)
+ {
+ reshape_tensors[i + _INPUT_NUM] = outputs[i];
+ }
+ }
+
+ if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[3]->attr.size,
+ reshape_tensors[3]->attr.dim_num ) )
{
return NULL;
}
- image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1);
- status = _query_kernel( kernel, inputs, outputs, image_2d);
+ image_2d = (reshape_tensors[3]->attr.dim_num == 2);
+ status = _query_kernel( kernel, inputs, &reshape_tensors[3], image_2d);
if( VSI_SUCCESS == status)
{
@@ -268,7 +329,7 @@ static vsi_nn_kernel_node_t _setup
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM,
- inputs, input_num, outputs, output_num );
+ &reshape_tensors[0], input_num, &reshape_tensors[3], output_num );
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail );
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale );
@@ -283,6 +344,15 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] );
}
}
+
+ if (ret)
+ {
+ for (i = 0; i < _IO_NUM; i++)
+ {
+ vsi_safe_release_tensor( reshape_tensors[i] );
+ }
+ }
+
return node;
} /* _setup() */
diff --git a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c
index d65200d33..4c620f4ce 100644
--- a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c
@@ -116,6 +116,8 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@@ -155,7 +157,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -204,6 +206,8 @@ static int32_t _optimize_mask_shape
vsi_size_t new_rank = 0;
uint32_t i = 0;
+ VSI_UNREFERENCED(outputs);
+
for(i = 0; i < inputs[0]->attr.dim_num; i++)
{
in_shape[i] = inputs[0]->attr.size[i];
@@ -253,6 +257,9 @@ static vsi_nn_kernel_node_t _setup
float input_zpScale = 0;
float outputVal1 = 1.0f;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c b/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c
index 7aee0e0af..7a2bef62f 100644
--- a/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c
@@ -95,6 +95,8 @@ DEF_KERNEL_INITIALIZER(_signal_frame_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -199,6 +201,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
for (i = 0; i < axis; i++)
{
inner *= inputs[0]->attr.size[i];
diff --git a/src/tim/vx/internal/src/kernel/cl/slice_cl.c b/src/tim/vx/internal/src/kernel/cl/slice_cl.c
index 4900bb129..d3379bbfe 100644
--- a/src/tim/vx/internal/src/kernel/cl/slice_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/slice_cl.c
@@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_slice_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
@@ -245,6 +247,8 @@ static vsi_nn_kernel_node_t _setup
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f;
+ VSI_UNREFERENCED(params);
+
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num,
diff --git a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c
index 7c7a59a2f..3bca54f63 100644
--- a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c
@@ -114,6 +114,8 @@ DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer)
vsi_ssize_t height = 0;
vsi_ssize_t chn = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@@ -155,7 +157,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -215,6 +217,9 @@ static vsi_nn_kernel_node_t _setup
float scaleInOut = 1.0f;
float zpInOut = 0.0f;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
scaleInOut = inputScale / outputScale;
zpInOut = outputZp - inputZp * scaleInOut;
diff --git a/src/tim/vx/internal/src/kernel/cl/swish_cl.c b/src/tim/vx/internal/src/kernel/cl/swish_cl.c
index b616a84ac..97d0db96b 100644
--- a/src/tim/vx/internal/src/kernel/cl/swish_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/swish_cl.c
@@ -167,11 +167,13 @@ DEF_KERNEL_INITIALIZER(_swish_initializer)
{0, 0, 0}
};
- vx_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[1];
vsi_nn_kernel_tensor_attr_t * attr_out = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -293,6 +295,9 @@ static vsi_nn_kernel_node_t _setup
vx_float32 logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f));
vsi_bool ret = FALSE;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
#if (VX_ACTIVATION_EXT_SUPPORT)
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
{
diff --git a/src/tim/vx/internal/src/kernel/cl/tile_cl.c b/src/tim/vx/internal/src/kernel/cl/tile_cl.c
index 63816947e..266b8ed6a 100644
--- a/src/tim/vx/internal/src/kernel/cl/tile_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/tile_cl.c
@@ -106,6 +106,7 @@ static const struct {
TENSOR_TILE_AXIS0_UINT32(U32, U32)
TENSOR_TILE_AXIS0_FLOAT(F16, F16)
TENSOR_TILE_AXIS0_FLOAT(F32, F32)
+ TENSOR_TILE_AXIS0_KERNELS(F32, U32)
TENSOR_TILE_AXIS0_INT32_2D(I8, I8)
TENSOR_TILE_AXIS0_INT32_2D(I16, I16)
@@ -114,6 +115,7 @@ static const struct {
TENSOR_TILE_AXIS0_UINT32_2D(U32, U32)
TENSOR_TILE_AXIS0_FLOAT_2D(F16, F16)
TENSOR_TILE_AXIS0_FLOAT_2D(F32, F32)
+ TENSOR_TILE_AXIS0_KERNELS_2D(F32, U32)
};
/*
@@ -130,6 +132,8 @@ static vx_param_description_t kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def)
@@ -140,6 +144,8 @@ static vx_param_description_t kernel_param_def[] =
#define SCALAR_INPUT_MULTIPLES_1 (6)
#define SCALAR_INPUT_MULTIPLES_2 (7)
#define SCALAR_INPUT_MULTIPLES_3 (8)
+#define IN_OUT_SCALE (9)
+#define IN_OUT_TAIL (10)
/*
* Kernel initializer
@@ -163,6 +169,8 @@ DEF_KERNEL_INITIALIZER(_tile_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * in_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -205,10 +213,29 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+ if (input_dtype == F16)
+ {
+ input_dtype = F32;
+ }
+ else if (input_dtype == U8)
+ {
+ input_dtype = U32;
+ }
+
+ if (output_dtype == F16)
+ {
+ output_dtype = F32;
+ }
+ else if (output_dtype == U8)
+ {
+ output_dtype = U32;
+ }
+
+
key = HASH_TILE_AXIS0_KEY( input_dtype, output_dtype, image_2d );
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
@@ -280,6 +307,16 @@ static vsi_nn_kernel_node_t _setup
vsi_bool ret = FALSE;
uint32_t dim = inputs[0]->attr.dim_num;
vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 };
+ float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
+ float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
+ float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
+ float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
+ float inoutScale = inputScale / outputScale;
+ float inoutTail = outputTail - inputTail * inoutScale;
+
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
for ( i = 0; i < dim; i++)
{
@@ -299,10 +336,34 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
- reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
- inputs[0], shapes[0], new_rank );
- reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
- outputs[0], shapes[2], new_rank );
+ if ( new_rank == 4)
+ {
+ vsi_size_t newshapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
+ newshapes[0][0] = shapes[0][0];
+ newshapes[2][0] = shapes[2][0];
+ newshapes[0][1] = shapes[0][1];
+ newshapes[2][1] = shapes[2][1];
+ newshapes[0][2] = shapes[0][2] * shapes[0][3];
+ newshapes[2][2] = shapes[2][2] * shapes[2][3];
+
+ if (newshapes[0][2] >= GPU_TENSOR_MAX_WIDTH ||
+ newshapes[2][2] >= GPU_TENSOR_MAX_WIDTH)
+ {
+ return NULL;
+ }
+
+ reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
+ inputs[0], newshapes[0], 3 );
+ reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
+ outputs[0], newshapes[2], 3 );
+ }
+ else
+ {
+ reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
+ inputs[0], shapes[0], new_rank );
+ reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
+ outputs[0], shapes[2], new_rank );
+ }
}
else
{
@@ -315,7 +376,7 @@ static vsi_nn_kernel_node_t _setup
goto final;
}
- image_2d = ((reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1));
+ image_2d = reshape_tensors[0]->attr.dim_num == 2;
status = _query_kernel( &reshape_tensors[0], &reshape_tensors[1], image_2d, kernel );
if( VSI_SUCCESS == status)
{
@@ -323,13 +384,16 @@ static vsi_nn_kernel_node_t _setup
if( node )
{
- uint32_t depthIn = (uint32_t)(new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1);
- uint32_t depthOut = (uint32_t)(new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1);
- uint32_t batchIn = (uint32_t)(new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1);
+ uint32_t depthIn = (uint32_t)(new_rank > 2 ? shapes[0][2] : 1);
+ uint32_t depthOut = (uint32_t)(new_rank > 2 ? shapes[2][2] : 1);
+ uint32_t batchIn = (uint32_t)(new_rank > 3 ? shapes[0][3] : 1);
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
&reshape_tensors[0], 1, &reshape_tensors[1], 1 );
+ shapes[1][2] = shapes[1][2] == 0 ? 1 : shapes[1][2];
+ shapes[1][3] = shapes[1][3] == 0 ? 1 : shapes[1][3];
+
/* Pass parameters to node. */
node_params[SCALAR_INPUT_BATCH_IN] = vsi_nn_kernel_scalar_create(
graph, I32, &batchIn );
@@ -338,14 +402,17 @@ static vsi_nn_kernel_node_t _setup
node_params[SCALAR_INPUT_DEPTH_OUT] = vsi_nn_kernel_scalar_create(
graph, I32, &depthOut );
node_params[SCALAR_INPUT_MULTIPLES_0] = vsi_nn_kernel_scalar_create(
- graph, I32, &multiples[0] );
+ graph, I32, &shapes[1][0] );
node_params[SCALAR_INPUT_MULTIPLES_1] = vsi_nn_kernel_scalar_create(
- graph, I32, &multiples[1] );
+ graph, I32, &shapes[1][1] );
node_params[SCALAR_INPUT_MULTIPLES_2] = vsi_nn_kernel_scalar_create(
- graph, I32, &multiples[2] );
+ graph, I32, &shapes[1][2] );
node_params[SCALAR_INPUT_MULTIPLES_3] = vsi_nn_kernel_scalar_create(
- graph, I32, &multiples[3] );
-
+ graph, I32, &shapes[1][3] );
+ node_params[IN_OUT_SCALE] = vsi_nn_kernel_scalar_create(
+ graph, F32, &inoutScale );
+ node_params[IN_OUT_TAIL] = vsi_nn_kernel_scalar_create(
+ graph, F32, &inoutTail );
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
@@ -356,6 +423,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_1] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_2] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_3] );
+ vsi_nn_kernel_scalar_release( &node_params[IN_OUT_SCALE] );
+ vsi_nn_kernel_scalar_release( &node_params[IN_OUT_TAIL] );
}
}
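
Note: the new IN_OUT_SCALE / IN_OUT_TAIL scalars fold the input and output quantization parameters into a single affine requantization, so the tile kernel can map a quantized input value straight into the output's quantized space. A worked sketch of that math, with illustrative names:

    /* (q_in - zp_in) * s_in / s_out + zp_out
     *     == q_in * inoutScale + inoutTail
     * where inoutScale = s_in / s_out and inoutTail = zp_out - zp_in * inoutScale,
     * matching inoutScale/inoutTail computed in the hunk above. */
    static float requantize(float q_in, float s_in, float zp_in,
                            float s_out, float zp_out)
    {
        float inoutScale = s_in / s_out;
        float inoutTail  = zp_out - zp_in * inoutScale;
        return q_in * inoutScale + inoutTail;
    }
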
diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c
index 0354a1e3f..3d6884065 100644
--- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c
@@ -181,6 +181,8 @@ DEF_KERNEL_INITIALIZER(_topk_initializer)
vsi_size_array_t * in_shape = NULL;
int32_t num_stages = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@@ -222,6 +224,8 @@ DEF_KERNEL_INITIALIZER(_topk_odd_even_sort_initializer)
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * in_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@@ -424,7 +428,7 @@ static vsi_nn_kernel_node_t _setup
)
{
vsi_status status = VSI_FAILURE;
- vsi_nn_kernel_node_param_t node_params[_TOPK_ODD_EVEN_SORT_PARAM_NUM];
+ vsi_nn_kernel_node_param_t node_params[_TOPK_ODD_EVEN_SORT_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_size_t block_size = inputs[0]->attr.size[0];
vsi_size_t block_num = 1;
@@ -473,8 +477,10 @@ static vsi_nn_kernel_node_t _setup
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
+ CHECK_PTR_FAIL_GOTO(rs_tensors[1], "Create tensor failed", final);
rs_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[1], shape[1], 2 );
+ CHECK_PTR_FAIL_GOTO(rs_tensors[2], "Create tensor failed", final);
}
else
{
@@ -484,14 +490,17 @@ static vsi_nn_kernel_node_t _setup
memcpy( &attr, &(rs_tensors[0]->attr), sizeof(vsi_nn_tensor_attr_t) );
rs_tensors[1] = vsi_nn_CreateTensor( graph, &attr );
+ CHECK_PTR_FAIL_GOTO(rs_tensors[1], "Create tensor failed", final);
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
rs_tensors[2] = vsi_nn_CreateTensor( graph, &attr );
-
+ CHECK_PTR_FAIL_GOTO(rs_tensors[2], "Create tensor failed", final);
rs_tensors[3] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
+ CHECK_PTR_FAIL_GOTO(rs_tensors[3], "Create tensor failed", final);
rs_tensors[4] = vsi_nn_reshape_tensor( graph,
outputs[1], shape[1], 2 );
+ CHECK_PTR_FAIL_GOTO(rs_tensors[4], "Create tensor failed", final);
input_num = 3;
}
@@ -505,10 +514,10 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( node_params, param_num,
rs_tensors, input_num, &rs_tensors[input_num], output_num );
/* Pass parameters to node. */
- node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputScale );
- node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputTail );
+ node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputScale );
+ node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputTail );
node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &outputScale );
- node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &outputTail );
+ node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &outputTail );
if (is_odd_even_sort)
{
node_params[SCALAR_INPUT_SIZE] = vsi_nn_kernel_scalar_create(
diff --git a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c
index 6f469883a..d2c33870a 100644
--- a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c
+++ b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c
@@ -123,12 +123,14 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer)
{0, 0, 0}
};
- vx_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
vx_tensor input = (vx_tensor)param[0];
vsi_nn_kernel_tensor_attr_t * attr_in = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_bool image_2d = FALSE;
+ VSI_UNREFERENCED(param_size);
+
attr_in = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input );
CHECK_PTR_FAIL_GOTO( attr_in, "vsi_nn_kernel_tensor_attr_create fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c
index c241e1e16..e0b4517a2 100644
--- a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c
@@ -109,7 +109,7 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer)
{
#define _PACK_A_TIMES_B_PLUS_C_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \
(( IN2_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE))
- vsi_status status = VX_SUCCESS;
+ vsi_status status = VSI_FAILURE;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
3,
@@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer)
vsi_size_array_t *output_shape = NULL;
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0);
CHECK_PTR_FAIL_GOTO( attr[0], "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -331,6 +333,8 @@ static vsi_nn_kernel_node_t _setup
vsi_bool ret = FALSE;
vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL };
+ VSI_UNREFERENCED(params);
+
for (i = 0; i < _IO_NUM; i++)
{
shapes_ptr[i] = shapes[i];
diff --git a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c
index 679a07d9a..e1861a262 100644
--- a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c
@@ -90,7 +90,7 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
size_t param_size
)
{
- vsi_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
2,
@@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
float dimRatio = 0.0f;
int32_t width = 0;
+ VSI_UNREFERENCED(param_size);
+
input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0);
CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1);
diff --git a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c
index 3fe4185ba..f5010111c 100644
--- a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c
@@ -173,6 +173,8 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer)
vsi_size_array_t * output_shape = NULL;
uint32_t packedArgIdx[4] = {0};
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -413,7 +415,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -469,6 +471,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
axis = vsi_nn_kernel_param_get_int32(params, "axis");
if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
diff --git a/src/tim/vx/internal/src/kernel/evis/argmin_evis.c b/src/tim/vx/internal/src/kernel/evis/argmin_evis.c
index bce04ac52..90713e08b 100644
--- a/src/tim/vx/internal/src/kernel/evis/argmin_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/argmin_evis.c
@@ -166,6 +166,8 @@ DEF_KERNEL_INITIALIZER(_argmin_initializer)
vsi_size_array_t * output_shape = NULL;
uint32_t packedArgIdx[4] = {0};
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -351,7 +353,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -397,6 +399,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
axis = vsi_nn_kernel_param_get_int32(params, "axis");
if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
diff --git a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c
index a794ee542..80a1b21ea 100644
--- a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c
@@ -188,7 +188,7 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer)
#define _PACK_BATCH_NORM_KEY( IN_TYPE, OUT_TYPE ) \
( ( IN_TYPE << 16) | ( OUT_TYPE ) )
- vsi_status status = VX_SUCCESS;
+ vsi_status status = VSI_FAILURE;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
3,
@@ -208,6 +208,8 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer)
float output_zp = 0;
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input);
CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
diff --git a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c
index 01ea2ab4d..553f8b739 100644
--- a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c
@@ -58,8 +58,8 @@ typedef enum
#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
{ \
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
- CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
- _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
+ CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
+ _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
}
typedef struct
@@ -139,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
float output_scale = 1.0;
int32_t outputZP = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr[0] =
vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
CHECK_PTR_FAIL_GOTO(
@@ -418,14 +420,17 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP", &input0ZP);
status |= vsi_nn_kernel_gpu_add_param(node, "uint8Scale", &uint8Scale);
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &uint8ZP_out);
- status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_left_4x4",
+ &uniU8SubZPtoFp32_left_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
if (U8 == input1_dtype) {
status |= vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP);
status |= vsi_nn_kernel_gpu_add_param(node, "input1Scale", &input1_scale);
- status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part0_4x4", &uniU8SubZPtoFp32_part0_4x4);
- status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part1_4x4", &uniU8SubZPtoFp32_part1_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part0_4x4",
+ &uniU8SubZPtoFp32_part0_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part1_4x4",
+ &uniU8SubZPtoFp32_part1_4x4);
}
else if (F16 == input1_dtype) {
status |= vsi_nn_kernel_gpu_add_param(
@@ -552,9 +557,9 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
gpu_param.global_scale[2] = 1;
gpu_param.dim = 2;
- gpu_param.global_size[0] = gpu_align_p2(
+ gpu_param.global_size[0] =
(out_width + gpu_param.global_scale[0] - 1) /
- gpu_param.global_scale[0], 4);
+ gpu_param.global_scale[0];
gpu_param.global_size[1] = ((out_height + gpu_param.global_scale[1] - 1) /
gpu_param.global_scale[1]);
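
Note: this hunk drops the power-of-two alignment on global_size[0], leaving the plain ceiling division of the output width by the per-workitem scale. A small sketch of the two computations, assuming gpu_align_p2(n, a) rounds n up to a multiple of a:

    static size_t ceil_div(size_t n, size_t d)  { return (n + d - 1) / d; }
    static size_t align_up(size_t n, size_t a)  { return ((n + a - 1) / a) * a; }
    /* before: global_size[0] = align_up(ceil_div(out_width, scale), 4);
     * after:  global_size[0] = ceil_div(out_width, scale);              */
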
diff --git a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c
index d7074c3db..75623dda3 100644
--- a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c
@@ -109,6 +109,8 @@ DEF_KERNEL_INITIALIZER(_bucketize_initializer)
vsi_size_array_t * input0_shape = NULL;
vsi_size_array_t * input1_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input0_attr, "Create tensor attr buffer fail.", final );
input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/cast_evis.c b/src/tim/vx/internal/src/kernel/evis/cast_evis.c
index f36e100b1..7908dd581 100644
--- a/src/tim/vx/internal/src/kernel/evis/cast_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/cast_evis.c
@@ -150,6 +150,8 @@ DEF_KERNEL_INITIALIZER(_cast_initializer)
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@@ -289,6 +291,8 @@ static vsi_nn_kernel_node_t _setup
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
+ VSI_UNREFERENCED(params);
+
if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/evis/clip_evis.c b/src/tim/vx/internal/src/kernel/evis/clip_evis.c
index 87784bf31..add96c2c0 100644
--- a/src/tim/vx/internal/src/kernel/evis/clip_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/clip_evis.c
@@ -142,6 +142,8 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c
index 2fb8330de..4547dfb11 100644
--- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c
@@ -308,6 +308,8 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
float input1Scale = 1.0f;
float input1Tail = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -365,7 +367,6 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
- if (1)
{
gpu_dp_inst_t uniExtractInteger_2x8 = {{
0x33333333, // TCfg
@@ -475,7 +476,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -527,6 +528,9 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t new_rank = 0;
vsi_bool ret = FALSE;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num,
@@ -543,11 +547,11 @@ static vsi_nn_kernel_node_t _setup
outputs[0], shapes[2], new_rank );
#define _swap_tensor(a, b, tmp) \
- do { \
+ { \
tmp = a; \
a = b; \
b = tmp; \
- } while(0)
+ }
if (shapes[1][3] > shapes[0][3] && new_rank == 4)
{
diff --git a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c
index 8e5d05e6c..e5669b0fd 100644
--- a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c
@@ -134,6 +134,8 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer)
int32_t input_width = 0;
int32_t output_width = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c
index cad8476a6..dbdd513ab 100644
--- a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c
@@ -36,6 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
+#include "utils/vsi_nn_dtype_util.h"
__BEGIN_DECLS
@@ -47,21 +48,29 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_2 "cumsum_2d"
#define KERNEL_SOURCE_3 "cumsum_bf16"
#define KERNEL_SOURCE_4 "cumsum_f16_u8"
+#define KERNEL_SOURCE_5 "cumsum_ex_rev_axis0"
+#define KERNEL_SOURCE_6 "cumsum_ex_rev_axis1"
+#define KERNEL_SOURCE_7 "cumsum_ex_rev_axis2"
// Add kernel hashtable here
-#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
- ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
+#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d) \
+ ((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
- { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
+ { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \
CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
- { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \
+ { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \
CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
SOURCE },
+#define HASH_CUMSUM_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
+ { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \
+ CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
+ SOURCE },
+
static const struct {
uint32_t key;
char* function_name;
@@ -108,6 +117,24 @@ static const struct {
HASH_CUMSUM_KERNELS_2D(1, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(1, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_4)
+ HASH_CUMSUM_EX_REV_KERNELS(0, U8, U8, KERNEL_SOURCE_5)
+ HASH_CUMSUM_EX_REV_KERNELS(0, I8, I8, KERNEL_SOURCE_5)
+ HASH_CUMSUM_EX_REV_KERNELS(0, I16, I16, KERNEL_SOURCE_5)
+ HASH_CUMSUM_EX_REV_KERNELS(0, F16, F16, KERNEL_SOURCE_5)
+ HASH_CUMSUM_EX_REV_KERNELS(1, U8, U8, KERNEL_SOURCE_6)
+ HASH_CUMSUM_EX_REV_KERNELS(1, I8, I8, KERNEL_SOURCE_6)
+ HASH_CUMSUM_EX_REV_KERNELS(1, I16, I16, KERNEL_SOURCE_6)
+ HASH_CUMSUM_EX_REV_KERNELS(1, F16, F16, KERNEL_SOURCE_6)
+ HASH_CUMSUM_EX_REV_KERNELS(2, U8, U8, KERNEL_SOURCE_7)
+ HASH_CUMSUM_EX_REV_KERNELS(2, I8, I8, KERNEL_SOURCE_7)
+ HASH_CUMSUM_EX_REV_KERNELS(2, I16, I16, KERNEL_SOURCE_7)
+ HASH_CUMSUM_EX_REV_KERNELS(2, F16, F16, KERNEL_SOURCE_7)
+ HASH_CUMSUM_EX_REV_KERNELS(1, F16, U8, KERNEL_SOURCE_4)
+ HASH_CUMSUM_EX_REV_KERNELS(1, F16, I8, KERNEL_SOURCE_4)
+ HASH_CUMSUM_EX_REV_KERNELS(1, F16, I16, KERNEL_SOURCE_4)
+ HASH_CUMSUM_EX_REV_KERNELS(2, F16, U8, KERNEL_SOURCE_4)
+ HASH_CUMSUM_EX_REV_KERNELS(2, F16, I8, KERNEL_SOURCE_4)
+ HASH_CUMSUM_EX_REV_KERNELS(2, F16, I16, KERNEL_SOURCE_4)
};
/*
@@ -143,6 +170,8 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
{0, 0, 0}}; // globalWorkSize: image size in thread
int32_t axis = 0;
+ int32_t exclusive = 0;
+ int32_t reverse = 0;
int32_t width = 0;
int32_t height = 0;
int32_t channel = 0;
@@ -161,6 +190,8 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -168,6 +199,10 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &exclusive);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
@@ -204,7 +239,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
}
in_out_scale = input_scale * output_scale;
- in_out_zp_scale = (float)in_out_scale * input_zp;
+ in_out_zp_scale = (float)in_out_scale * input_zp * (-1);
input_shape = attr[0]->shape;
dim = (uint32_t)input_shape->size;
@@ -460,14 +495,121 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniSumHorzRevF16toF16A_4x4 = {{
+ 0x01051555, // TCfg
+ 0x00000000, // ASelt
+ 0x05674567, 0x00070067, // ABin
+ 0x020a2aaa, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000100, // AccumType, ConstantType, and PostShift
+ 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00003c00,
+ 0x3c003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniSumHorzRevF16toF16B_4x4 = {{
+ 0x01051555, // TCfg
+ 0x00000000, // ASelt
+ 0x01230123, 0x00030023, // ABin
+ 0x020a2aaa, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000100, // AccumType, ConstantType, and PostShift
+ 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00003c00,
+ 0x3c003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniSumHorzRevF16toF16C_2x8 = {{
+ 0x11115555, // TCfg
+ 0x00000000, // ASelt
+ 0x43424140, 0x07060504, // ABin
+ 0x2222aaaa, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000100, // AccumType, ConstantType, and PostShift
+ 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00,
+ 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniAccSumHorzRevF16toF16_2x8 = {{
+ 0x55555555, // TCfg
+ 0x44444444, // ASelt
+ 0x03020100, 0x07060504, // ABin
+ 0xaaaaaaaa, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000100, // AccumType, ConstantType, and PostShift
+ 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00,
+ 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniSumHorzRevU8toI16A_4x4 = {{
+ 0x01051555, // TCfg
+ 0x00000000, // ASelt
+ 0x05674567, 0x00070067, // ABin
+ 0x020a2aaa, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00010001, 0x00010001, 0x00010001, 0x00000001,
+ 0x00010001, 0x00000000, 0x00000001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniSumHorzRevU8toI16B_8x4 = {{
+ 0x15555555, 0x01550555, // TCfg
+ 0x443214c7, 0x3214c700, 0x14c70044, 0xc7000432, 0x00003214, // BinSelect
+ 0x00000700, // AccumType, ConstantType, and PostShift
+ 0x01010101, 0x01010101, 0x01010101, 0x00010101,
+ 0x01010101, 0x00000101, 0x01010101, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniSubZpRevI16toI16_2x8 = {{
+ 0x55555555, // TCfg
+ 0x44444444, // ASelt
+ 0x03020100, 0x07060504, // ABin
+ 0xaaaaaaaa, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00080001, 0x00070001, 0x00060001, 0x00050001,
+ 0x00040001, 0x00030001, 0x00020001, 0x00010001 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniAccSumHorzRevI16toI32A_4x4 = {{
+ 0x0d0d0d0d, // TCfg
+ 0x04040404, // ASelt
+ 0x00050004, 0x00070006, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002600, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000,
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniAccSumHorzRevI16toI32B_4x4 = {{
+ 0x0d0d0d0d, // TCfg
+ 0x04040404, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002600, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000,
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+
gpu_quantize_multiplier_16bit( (double)input_scale * output_scale, &M0, &postShift);
multAndoutZP0[0] = (uint32_t)(M0);
multAndoutZP0[1] = (uint32_t)((attr[1]->asymm.zero_point << postShift) - input_zp * M0);
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift );
- status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
- status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
+ if ((exclusive || reverse) && axis == 0)
+ {
+ status = vsi_nn_kernel_gpu_add_param( node,
+ "uniSumHorzRevF16toF16A_4x4", &uniSumHorzRevF16toF16A_4x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniSumHorzRevF16toF16B_4x4", &uniSumHorzRevF16toF16B_4x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniSumHorzRevF16toF16C_2x8", &uniSumHorzRevF16toF16C_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniAccSumHorzRevF16toF16_2x8", &uniAccSumHorzRevF16toF16_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniSumHorzRevU8toI16A_4x4", &uniSumHorzRevU8toI16A_4x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniSumHorzRevU8toI16B_8x4", &uniSumHorzRevU8toI16B_8x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniSubZpRevI16toI16_2x8", &uniSubZpRevI16toI16_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniAccSumHorzRevI16toI32A_4x4", &uniAccSumHorzRevI16toI32A_4x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniAccSumHorzRevI16toI32B_4x4", &uniAccSumHorzRevI16toI32B_4x4 );
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
switch( pack_key )
{
@@ -477,7 +619,6 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
case _PACK_SELECT_KEY( F16, F16, 2, 3):
{
status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
- status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale);
@@ -493,47 +634,21 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
"uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 );
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
- case _PACK_SELECT_KEY( U8, U8, 0, 2):
- case _PACK_SELECT_KEY( U8, U8, 1, 2):
- case _PACK_SELECT_KEY( U8, U8, 0, 3):
case _PACK_SELECT_KEY( U8, U8, 1, 3):
- case _PACK_SELECT_KEY( I8, I8, 0, 2):
- case _PACK_SELECT_KEY( I8, I8, 1, 2):
- case _PACK_SELECT_KEY( I8, I8, 0, 3):
case _PACK_SELECT_KEY( I8, I8, 1, 3):
- case _PACK_SELECT_KEY( I16, I16, 0, 2):
- case _PACK_SELECT_KEY( I16, I16, 1, 2):
- case _PACK_SELECT_KEY( I16, I16, 0, 3):
case _PACK_SELECT_KEY( I16, I16, 1, 3):
- case _PACK_SELECT_KEY( F16, F16, 0, 2):
- case _PACK_SELECT_KEY( F16, F16, 1, 2):
- case _PACK_SELECT_KEY( F16, F16, 0, 3):
case _PACK_SELECT_KEY( F16, F16, 1, 3):
+ case _PACK_SELECT_KEY( U8, U8, 1, 2):
+ case _PACK_SELECT_KEY( I8, I8, 1, 2):
+ case _PACK_SELECT_KEY( I16, I16, 1, 2):
+ case _PACK_SELECT_KEY( F16, F16, 1, 2):
{
- status = vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp);
+ status = vsi_nn_kernel_gpu_add_param(node, "height", &height);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale);
@@ -547,6 +662,26 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
"uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+ break;
+ case _PACK_SELECT_KEY( U8, U8, 0, 2):
+ case _PACK_SELECT_KEY( U8, U8, 0, 3):
+ case _PACK_SELECT_KEY( I8, I8, 0, 2):
+ case _PACK_SELECT_KEY( I8, I8, 0, 3):
+ case _PACK_SELECT_KEY( I16, I16, 0, 2):
+ case _PACK_SELECT_KEY( I16, I16, 0, 3):
+ case _PACK_SELECT_KEY( F16, F16, 0, 2):
+ case _PACK_SELECT_KEY( F16, F16, 0, 3):
+ {
+ status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
+ status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp);
+ status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
+ status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale);
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
@@ -578,7 +713,9 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
case _PACK_SELECT_KEY( BF16, BF16, 1, 3):
case _PACK_SELECT_KEY( BF16, BF16, 2, 3):
{
- status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
+ status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
+ status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
+ status |= vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(
@@ -604,7 +741,9 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
case _PACK_SELECT_KEY( F16, I16, 1, 3):
case _PACK_SELECT_KEY( F16, I16, 2, 3):
{
- status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
+ status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
+ status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
+ status |= vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8);
status |= vsi_nn_kernel_gpu_add_param(
@@ -655,21 +794,24 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params,
int32_t axis,
- int32_t is_2d
+ int32_t is_2d,
+ int32_t is_ex_rev
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
+
+ VSI_UNREFERENCED(params);
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
- key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d);
+ key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d);
- for( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
+ for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
{
if ( cumsum_map[i].key == key )
{
@@ -716,17 +858,35 @@ static vsi_nn_kernel_node_t _setup
int32_t axis_new = 0;
int32_t is_2d = 0;
uint32_t rs_dim = 2;
- int32_t i = 0;
+ uint32_t i = 0;
+ int32_t is_ex_or_rev = exclusive || reverse;
- vsi_nn_kernel_optimize_softmax_shape(
- inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
- shapes[0], &rs_dim, &axis_new);
- if (exclusive || reverse || rs_dim > 3)
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
+ if (axis < 0)
+ {
+ axis_new = 0;
+ shapes[0][0] = 1;
+ shapes[0][1] = 1;
+ for (i = 0; i < inputs[0]->attr.dim_num; i++)
+ {
+ shapes[0][0] *= inputs[0]->attr.size[i];
+ }
+ rs_dim = 2;
+ }
+ else
+ {
+ vsi_nn_kernel_optimize_softmax_shape(
+ inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
+ shapes[0], &rs_dim, &axis_new);
+ }
+ if (rs_dim > 3)
{
return NULL;
}
- if (rs_dim == 2)
+ if (rs_dim == 2 && is_ex_or_rev == 0)
{
is_2d = 1;
}
@@ -736,7 +896,7 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], (vsi_size_t)rs_dim );
- status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d);
+ status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
@@ -754,6 +914,14 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &tmp_params[2] );
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
+ {
+ // Set default border mode.
+ vx_border_t border;
+ border.mode = VX_BORDER_CONSTANT;
+ vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &outputs[0]->attr.dtype);
+ status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
+ CHECK_STATUS(status);
+ }
}
}
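
Note: the cumsum hash key now carries an exclusive/reverse bit above the axis field, which is how the new cumsum_ex_rev_* table entries are selected; when exclusive or reverse is set the 2D fast path is also disabled. A small worked example of the packing, with field widths taken from the updated macro:

    #include <stdint.h>
    /* (EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | image_2d */
    static uint32_t cumsum_key(uint32_t axis, uint32_t in_dt, uint32_t out_dt,
                               uint32_t ex_rev, uint32_t image_2d)
    {
        return (ex_rev << 24) | (axis << 20) | (in_dt << 12) | (out_dt << 4) | image_2d;
    }
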
diff --git a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c
index de5aa8326..9d464623f 100644
--- a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c
@@ -152,6 +152,8 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -363,7 +365,9 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
+
+ VSI_UNREFERENCED(params);
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -422,6 +426,9 @@ static vsi_nn_kernel_node_t _setup
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t blk_flg = block_size == 2 ? 1 : 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c
index 45c4073fd..a2f10ce82 100644
--- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c
@@ -197,6 +197,8 @@ DEF_KERNEL_INITIALIZER(_depthwise_conv1d_initializer)
vx_context ctx = vxGetContext((vx_reference)node);
uint64_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t));
status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t));
CHECK_STATUS_FAIL_GOTO(status, final);
@@ -729,7 +731,9 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[0] = inputs[0];
- if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
+ if (inputs[1]->attr.dtype.qnt_type !=
+ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC &&
+ inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8)
{
shape[0] = inputs[1]->attr.size[0];
shape[1] = 1;
@@ -811,7 +815,9 @@ static vsi_nn_kernel_node_t _setup
}
final:
- if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
+ if (inputs[1]->attr.dtype.qnt_type !=
+ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC &&
+ inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8)
{
vsi_nn_ReleaseTensor( &reshape_tensors[1] );
}
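
Note: the weight reshape (and its matching release) is now skipped for both per-channel quantization formats rather than only the affine per-channel one. A minimal helper sketch of the condition, assuming the quantization-type enum named in the hunk:

    static int is_per_channel_quant(vsi_nn_qnt_type_e type)
    {
        return type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC ||
               type == VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8;
    }
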
diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c
index ee5faf1c3..aa781c8d8 100644
--- a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c
@@ -122,6 +122,8 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
int32_t input1_ZP = 0;
int32_t input0_ZP = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c
index bc849b4da..5359233ba 100644
--- a/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c
@@ -145,7 +145,13 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_t * kernel
)
{
-
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(kernel);
return NULL;
} /* _setup() */
diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c
index 23b1433a7..5d383a15e 100644
--- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c
@@ -223,6 +223,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
float beta = 0;
uint32_t pack_key;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -467,7 +469,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -518,6 +520,9 @@ static vsi_nn_kernel_node_t _setup
float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" );
float beta = vsi_nn_kernel_param_get_float32( params, "beta" );
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
diff --git a/src/tim/vx/internal/src/kernel/evis/erf_evis.c b/src/tim/vx/internal/src/kernel/evis/erf_evis.c
index a4203164a..ebc8ad8f2 100644
--- a/src/tim/vx/internal/src/kernel/evis/erf_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/erf_evis.c
@@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_erf_initializer)
float outputZP = 0;
uint32_t pack_key;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -376,6 +378,10 @@ static vsi_nn_kernel_node_t _setup
vsi_bool image_2d = FALSE;
vsi_bool ret = FALSE;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
diff --git a/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c b/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c
index 627e48b58..eec0f08e0 100644
--- a/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c
@@ -97,7 +97,10 @@ DEF_KERNEL_INITIALIZER(_extra_ending_initializer)
vsi_nn_kernel_tensor_attr_t * attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
out_shape = attr->shape;
gpu_param.global_scale[0] = 8;
@@ -136,6 +139,8 @@ static vsi_status _query_kernel
uint32_t key = 0;
uint32_t i = 0;
+ VSI_UNREFERENCED(inputs);
+
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = EXTRA_ENDING_HASH_KEY( out_dtype );
@@ -186,6 +191,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
int32_t i = 0;
+ VSI_UNREFERENCED(params);
+
vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num,
shapes[0], &rank[0]);
vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num,
diff --git a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c
index be1bd1714..86d4d585b 100644
--- a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c
@@ -120,7 +120,7 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
{0, 0, 0},
{0, 0, 0}
};
- vx_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
vx_tensor input0 = (vx_tensor)param[0];
vx_tensor input1 = (vx_tensor)param[1];
vx_tensor output = (vx_tensor)param[2];
@@ -139,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
float in1Tail = 0;
float outZp = 0;
+ VSI_UNREFERENCED(param_size);
+
input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0 );
CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -402,6 +404,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
+ VSI_UNREFERENCED(params);
+
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c
index 0554d1124..07f159311 100644
--- a/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c
@@ -51,18 +51,31 @@ typedef enum
#define STR(a) #a
// Add kernel hashtable here
-#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D ) \
- (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ))
+#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D , BEYOND_MAXWIDTH) \
+ (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ) |\
+ (BEYOND_MAXWIDTH << 28))
#define PACK_KERNEL_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
- { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 0), \
CVIVANTE_NAMESPACE("evis.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
_GATHER_ELEMENTS_KERNEL_SOURCE}
#define PACK_KERNEL_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
- { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 0), \
CVIVANTE_NAMESPACE("evis.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
_GATHER_ELEMENTS_KERNEL_SOURCE}
+#define PACK_KERNEL_BEYOND_MAXWIDTH_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 1), \
+ CVIVANTE_NAMESPACE("evis.gather_elements_beyond_maxwidth_axis"STR(AXIS)\
+ "_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
+ _GATHER_ELEMENTS_KERNEL_SOURCE}
+
+#define PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
+ { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 1), \
+ CVIVANTE_NAMESPACE("evis.gather_elements_beyond_maxwidth_axis"STR(AXIS)\
+ "_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
+ _GATHER_ELEMENTS_KERNEL_SOURCE}
+
typedef struct
{
uint32_t key;
@@ -94,6 +107,32 @@ static const _kernel_map_type _gather_elements_kernel_map[] =
PACK_KERNEL_2D_MAP( 1, I16, I32, I16 ),
PACK_KERNEL_2D_MAP( 1, I8, I32, I8 ),
PACK_KERNEL_2D_MAP( 1, U8, I32, U8 ),
+
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, U8, I32, U8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, U8, I32, U8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, U8, I32, U8 ),
+
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, U8, I32, U8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, U8, I32, U8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, F16, I32, F16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I16, I32, I16 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I8, I32, I8 ),
+ PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, U8, I32, U8 ),
};
@@ -128,26 +167,48 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer)
{0, 0, 0},
{0, 0, 0}
};
- vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
+ vsi_nn_kernel_tensor_attr_t * input_attr0 = NULL;
+ vsi_nn_kernel_tensor_attr_t * input_attr1 = NULL;
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
int32_t axis = 0;
int32_t axis_size = 0;
-
- input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
- CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
+ uint32_t width0 = 0;
+ uint32_t height0 = 0;
+ uint32_t width1 = 0;
+ uint32_t height1 = 0;
+ uint32_t width_out = 0;
+ uint32_t height_out = 0;
+ uint32_t depth0 = 0;
+ uint32_t depth1 = 0;
+
+ VSI_UNREFERENCED(param_size);
+
+ input_attr0 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+ CHECK_PTR_FAIL_GOTO( input_attr0, "Create tensor attr buffer fail.", final );
+ input_attr1 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ CHECK_PTR_FAIL_GOTO( input_attr1, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis);
out_shape = output_attr->shape;
- axis_size = (int32_t)input_attr->shape->data[axis];
+ axis_size = (int32_t)input_attr0->shape->data[axis];
if (axis == 0)
{
gpu_param.global_scale[0] = 4;
}
+ width0 = (uint32_t)input_attr0->shape->data[0];
+ height0 = (uint32_t)input_attr0->shape->data[1];
+ depth0 = input_attr0->shape->size > 2 ? (uint32_t)input_attr0->shape->data[2] : 1;
+ width1 = (uint32_t)input_attr1->shape->data[0];
+ height1 = (uint32_t)input_attr1->shape->data[1];
+ depth1 = input_attr1->shape->size > 2 ? (uint32_t)input_attr1->shape->data[2] : 1;
+ width_out = (uint32_t)output_attr->shape->data[0];
+ height_out = (uint32_t)output_attr->shape->data[1];
+
gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
@@ -157,13 +218,31 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
+ if (width0 >= GPU_TENSOR_MAX_WIDTH ||
+ width1 >= GPU_TENSOR_MAX_WIDTH ||
+ height0 >= GPU_TENSOR_MAX_WIDTH ||
+ height1 >= GPU_TENSOR_MAX_WIDTH ||
+ depth0 >= GPU_TENSOR_MAX_WIDTH ||
+ depth1 >= GPU_TENSOR_MAX_WIDTH)
+ {
+ gpu_param.global_scale[0] = 1;
+ gpu_param.global_size[0] = out_shape->data[0];
+ }
+
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
status |= vsi_nn_kernel_gpu_add_param( node, "axis_size", &axis_size );
+ status |= vsi_nn_kernel_gpu_add_param( node, "width0", &width0 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "height0", &height0 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "width1", &width1 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "height1", &height1 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "width_out", &width_out );
+ status |= vsi_nn_kernel_gpu_add_param( node, "height_out", &height_out );
CHECK_STATUS_FAIL_GOTO(status, final );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
- SAFE_FREE_TENSOR_ATTR(input_attr);
+ SAFE_FREE_TENSOR_ATTR(input_attr0);
+ SAFE_FREE_TENSOR_ATTR(input_attr1);
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _gather_elements_initializer() */
@@ -190,6 +269,9 @@ static vsi_status _query_kernel
vx_param_description_t * param_def = _gather_elements_kernel_param_def;
vx_kernel_initialize_f initializer = _gather_elements_initializer;
int32_t img_2d = (outputs[0]->attr.dim_num < 3 || outputs[0]->attr.size[2] == 1) ? 1 : 0;
+ int32_t beyond_maxwidth = 0;
+ vsi_size_t depth0 = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
+ vsi_size_t depth1 = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1;
uint32_t key;
uint32_t i;
@@ -207,7 +289,17 @@ static vsi_status _query_kernel
out_dtype = F16;
}
- key = GATHER_ELEMENTS_HASH_KEY( axis, in0_dtype, in1_dtype, out_dtype, img_2d );
+ if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH ||
+ inputs[0]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH ||
+ inputs[1]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH ||
+ inputs[1]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH ||
+ depth0 >= GPU_TENSOR_MAX_WIDTH ||
+ depth1 >= GPU_TENSOR_MAX_WIDTH)
+ {
+ beyond_maxwidth = 1;
+ }
+
+ key = GATHER_ELEMENTS_HASH_KEY( axis, in0_dtype, in1_dtype, out_dtype, img_2d, beyond_maxwidth );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
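Note: the beyond_maxwidth variants are selected whenever any width, height, or depth of either input reaches GPU_TENSOR_MAX_WIDTH; a condensed sketch of that check, mirroring the logic in _query_kernel and the initializer above (helper name hypothetical, constant from the SDK headers):

/* Hypothetical helper; when it returns non-zero the hash key uses beyond_maxwidth = 1
   and the initializer drops the x-vectorization (global_scale[0] = 1). */
static int needs_beyond_maxwidth( const vsi_size_t *shape0, vsi_size_t depth0,
                                  const vsi_size_t *shape1, vsi_size_t depth1 )
{
    return shape0[0] >= GPU_TENSOR_MAX_WIDTH || shape0[1] >= GPU_TENSOR_MAX_WIDTH ||
           shape1[0] >= GPU_TENSOR_MAX_WIDTH || shape1[1] >= GPU_TENSOR_MAX_WIDTH ||
           depth0    >= GPU_TENSOR_MAX_WIDTH || depth1    >= GPU_TENSOR_MAX_WIDTH;
}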
diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c
index 499bc5a28..ba7ad75f4 100644
--- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c
@@ -294,6 +294,8 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -491,6 +493,8 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -692,7 +696,9 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
+
+ VSI_UNREFERENCED(params);
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -768,6 +774,9 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3;
int32_t i = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if (axis == 0)
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, batch_dims, 0, &is_array);
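Note: the VSI_UNREFERENCED additions scattered through this patch exist only to silence unused-parameter warnings for arguments such as param_size, input_num, and output_num; a plausible definition, assuming the usual cast-to-void idiom rather than the project's actual one:

/* Assumed form; the real definition lives in the vsi_nn headers and may differ. */
#ifndef VSI_UNREFERENCED
#define VSI_UNREFERENCED(x) ((void)(x))
#endif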
diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c
index 355e90857..91c8f1744 100644
--- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c
@@ -148,7 +148,7 @@ static vsi_status get_gather_nd_tensor_reshape_size
vsi_size_t block_size,
uint32_t coordDim,
int32_t* newDim,
- int32_t batch_dims
+ uint32_t batch_dims
)
{
vsi_status status = VSI_FAILURE;
@@ -175,17 +175,23 @@ static vsi_status get_gather_nd_tensor_reshape_size
if (batch_dims)
{
+ int32_t rank = 1;
for (i = 0; i < offset; i++)
{
sizes[0] *= input_size[i];
}
- for (i = 0; i < coordDim; i++)
+ for (i = 0; i < coordDim - 1; i++)
{
- sizes[i + 1] = input_size[i + offset];
+ sizes[rank++] = input_size[i + offset];
}
- newDim[0] = coordDim == 1 ? 2 : 3;
+ for (i = 0; i < batch_dims; i++)
+ {
+ sizes[rank] *= input_size[dims_num - i - 1];
+ }
+
+ newDim[0] = rank + 1;
}
else
{
@@ -215,13 +221,27 @@ static vsi_status get_gather_nd_tensor_reshape_size
}
else // indices&output reshape
{
- if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
+ if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH && batch_dims == 0)
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
status = VSI_SUCCESS;
newDim[0] = 2;
}
+ else if (batch_dims > 0)
+ {
+ vsi_size_t batch_cnt = 1;
+ for (i = 0; i < batch_dims; ++i)
+ {
+ batch_cnt *= input_size[dims_num - i - 1];
+ }
+
+ sizes[0] = block_size;
+ sizes[1] = (elementCnt / block_size) / batch_cnt;
+ sizes[2] = batch_cnt;
+ status = VSI_SUCCESS;
+ newDim[0] = 3;
+ }
}
#undef VSI_NN_MAX_IMAGE_WIDTH
@@ -248,15 +268,18 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
};
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
- int32_t block_size = 0;
- int32_t indices_num = 1;
- int32_t src0ZP = 0;
- float src0Scale = 1;
- int32_t dstZP = 0;
- float dstScale = 1;
+ int32_t block_size = 0;
+ int32_t indices_num = 1;
+ int32_t batch_num = 1;
+ int32_t src0ZP = 0;
+ float src0Scale = 1;
+ int32_t dstZP = 0;
+ float dstScale = 1;
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -302,6 +325,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
}
indices_num = (int32_t)(attr[1]->shape->data[1]);
+ batch_num = (int32_t)(attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1);
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
@@ -310,7 +334,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = indices_num;
- gpu_param.global_size[2] = 1;
+ gpu_param.global_size[2] = batch_num;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
@@ -422,7 +446,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_nn_kernel_coord_type_e coord_type = _error;
uint32_t key = 0;
- int i = 0;
+ int32_t batch_flg = batch_dims > 0 ? 1 : 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -448,7 +473,7 @@ static vsi_status _query_kernel
coord_type = _3D;
}
- key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_dims );
+ key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg );
for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ )
{
@@ -495,6 +520,9 @@ static vsi_nn_kernel_node_t _setup
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims);
status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims);
status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims);
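Note: as a worked example of the new batch_dims > 0 branch of get_gather_nd_tensor_reshape_size, take a hypothetical indices tensor of 120 elements with block_size = 2 and a single trailing batch dimension of size 5:

vsi_size_t input_size[] = { 2, 3, 4, 5 };        /* hypothetical shape, batch dimension last */
vsi_size_t dims_num = 4, block_size = 2, batch_dims = 1;
vsi_size_t elementCnt = 2 * 3 * 4 * 5;           /* 120                                      */
vsi_size_t batch_cnt = 1, i;
vsi_size_t sizes[3];
for (i = 0; i < batch_dims; ++i)
{
    batch_cnt *= input_size[dims_num - i - 1];   /* 5                                        */
}
sizes[0] = block_size;                            /* 2                                       */
sizes[1] = (elementCnt / block_size) / batch_cnt; /* 60 / 5 = 12                             */
sizes[2] = batch_cnt;                             /* 5; feeds global_size[2] as batch_num    */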
diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c
index 8a9971fc6..ce13b84f7 100644
--- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c
@@ -246,6 +246,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer)
float sum_x2_tail1 = 1;
float work_item_pixels = 1;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -381,6 +383,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_means_initializer)
int32_t chn = 0;
int32_t group_stride = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -450,6 +454,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
int32_t height = 0, width = 0, chn = 0;
int32_t is2D = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
@@ -776,6 +782,9 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t group_size = inputs[0]->attr.size[2] / group_num;
float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
// Check if gpu can support the size
if ( !vsi_nn_kernel_gpu_check_shape(
outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
@@ -898,11 +907,11 @@ static vsi_nn_kernel_node_t _setup
if (node)
{
uint32_t index = 0;
- int32_t pStride = 0;
+ float pStride = 0;
if (!is2D_flg)
{
- pStride = (int32_t)(inputs[1]->attr.size[0] / new_shape[1]);
- rSpaceOrg = 1.0f / (new_shape[0] / pStride);
+ pStride = (float)inputs[1]->attr.size[0] / (float)new_shape[1];
+ rSpaceOrg = pStride < 1.0f ? 0.0f : 1.0f / (new_shape[0] / pStride);
}
node_params[index++] = rs_input;
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
@@ -912,7 +921,7 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rSpaceOrg );
- node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pStride );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &pStride );
status = vsi_nn_kernel_node_pass_param( node, node_params,
_GROUPNORM_PARAM_NUM );
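Note: the integer-to-float change for pStride avoids truncating the parameter stride when inputs[1] does not divide evenly into the reshaped height; a sketch of the new computation with hypothetical sizes:

/* Hypothetical: parameter length 6, reshaped tensor new_shape = { 8, 4, ... }. */
float pStride   = (float)6 / (float)4;                          /* 1.5, was (int)(6 / 4) == 1 before  */
float rSpaceOrg = pStride < 1.0f ? 0.0f : 1.0f / (8 / pStride); /* 1 / (8 / 1.5) = 0.1875             */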
diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c
index 9b5a2c1fb..1bfdb49fd 100644
--- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c
@@ -227,6 +227,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer)
vsi_size_array_t * output_shape = NULL;
vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL };
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -635,7 +637,7 @@ static vsi_status _query_kernel
int32_t input_category,
int32_t input_layout,
int32_t use_cudnn,
- int32_t* param_count,
+ vsi_size_t* param_count,
int32_t* input_count,
int32_t* output_count
/* Add extra params */
@@ -756,7 +758,7 @@ static vsi_nn_kernel_node_t _setup
int32_t k = 0;
vsi_size_t input_size = inputs[0]->attr.size[0];
vsi_size_t batch = inputs[0]->attr.size[1];
- int32_t param_count = 0;
+ vsi_size_t param_count = 0;
int32_t input_count = 0;
int32_t output_count = 0;
int32_t gate_activation = 0;
@@ -765,6 +767,8 @@ static vsi_nn_kernel_node_t _setup
int32_t use_cudnn = vsi_nn_kernel_param_get_int32( params, "use_cudnn_implementation" );
int32_t input_layout = vsi_nn_kernel_param_get_int32( params, "input_layout" );
+ VSI_UNREFERENCED(input_num);
+
gate_activation = vsi_nn_kernel_param_get_int32( params, "gate_activation" );
candidate_activation = vsi_nn_kernel_param_get_int32( params, "candidate_activation" );
@@ -783,7 +787,9 @@ static vsi_nn_kernel_node_t _setup
if( VSI_SUCCESS == status)
{
_inputs = (vsi_nn_tensor_t**)malloc(input_count * sizeof(vsi_nn_tensor_t**));
+ CHECK_PTR_FAIL_GOTO( _inputs, "Create buffer fail.", final );
node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count);
+ CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final );
if (use_cudnn)
{
@@ -896,6 +902,7 @@ static vsi_nn_kernel_node_t _setup
}
}
+final:
vsi_nn_safe_free(_inputs);
vsi_nn_safe_free(node_params);
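Note: the new CHECK_PTR_FAIL_GOTO calls plus the final: label turn the two allocations into the allocate, check, jump-to-cleanup pattern used by the other kernels; the resulting control flow, condensed as a sketch:

_inputs = (vsi_nn_tensor_t **)malloc( input_count * sizeof(vsi_nn_tensor_t *) );
CHECK_PTR_FAIL_GOTO( _inputs, "Create buffer fail.", final );        /* jumps to cleanup on NULL */
node_params = (vsi_nn_kernel_node_param_t *)malloc( sizeof(vsi_nn_kernel_node_param_t) * param_count );
CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final );
/* ... query kernel, create node, pass parameters ... */
final:
vsi_nn_safe_free( _inputs );        /* NULL-safe, so the early jump needs no extra bookkeeping */
vsi_nn_safe_free( node_params );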
diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c
index 75b6136e1..9ad5852c3 100644
--- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c
@@ -110,7 +110,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer)
{
#define _PACK_A_GRUCELL_ACTIVATION_SMA_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \
(( IN1_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE))
- vsi_status status = VX_SUCCESS;
+ vsi_status status = VSI_FAILURE;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
3,
@@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer)
vsi_size_array_t *output_shape = NULL;
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0);
CHECK_PTR_FAIL_GOTO( attr[0], "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -302,6 +304,8 @@ static vsi_nn_kernel_node_t _setup
vsi_bool ret = FALSE;
vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL };
+ VSI_UNREFERENCED(params);
+
for (i = 0; i < _IO_NUM; i++)
{
shapes_ptr[i] = shapes[i];
diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c
index 40e22e981..7adf6bfb7 100644
--- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c
@@ -124,6 +124,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
#define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \
(hstate_type | (fc_type << 8) | (output_type << 16))
+ VSI_UNREFERENCED(param_size);
+
output = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_IN_CNT + GRUCELL_ACT_Z_H_OUT_OUTPUT];
hstate_out = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_IN_CNT + GRUCELL_ACT_Z_H_OUT_HSTATE];
diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c
index 85220002f..afd872352 100644
--- a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c
@@ -117,6 +117,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer)
#define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \
(hstate_type | (fc_type << 8) | (output_type << 16))
+ VSI_UNREFERENCED(param_size);
+
output = (vsi_nn_kernel_tensor_t)param[3];
for (i = 0; i < 2; i++)
diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c
index 0c35aeaf9..60d932b80 100644
--- a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c
@@ -46,17 +46,19 @@ typedef enum _grucell_nn_activation_type_e
{
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
+ TANH = VSI_NN_ACT_TANH,
}grucell_nn_activation_type_e;
#define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation"
// Add kernel hashtable here
-#define GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \
- (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 ))
-#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \
- { GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \
-CVIVANTE_NAMESPACE("evis.grucell_reset_after_activation_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \
-_GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE }
+#define GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT, ACT ) \
+ (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 ) | ( ACT << 24 ))
+#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT, ACT ) \
+ { GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT, ACT ), \
+ CVIVANTE_NAMESPACE("evis.grucell_reset_after_activation_"\
+ #HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#ACT"_"#REC_ACT), \
+ _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE }
typedef struct
{
@@ -68,10 +70,14 @@ typedef struct
static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] =
{
// Register kernel here
- PACK_KERNEL_MAP( U8, F16, U8, SIGMOID ),
- PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ),
- PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ),
- PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
+ PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, TANH ),
+ PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, TANH ),
+ PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, TANH ),
+ PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, TANH ),
+ PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, SIGMOID ),
+ PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, SIGMOID ),
+ PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ),
+ PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ),
};
@@ -123,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer)
#define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \
(hstate_type | (fc_type << 8) | (output_type << 16))
+ VSI_UNREFERENCED(param_size);
+
output = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_IN_CNT + GRUCELL_ACT_OUT_OUTPUT];
hstate_out = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_IN_CNT + GRUCELL_ACT_OUT_H_STATE];
@@ -297,7 +305,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
- int32_t recurrent_activation
+ int32_t recurrent_activation,
+ int32_t activation
)
{
vsi_status status = VSI_FAILURE;
@@ -309,14 +318,15 @@ static vsi_status _query_kernel
vx_param_description_t * param_def = _grucell_reset_after_activation_kernel_param_def;
vx_kernel_initialize_f initializer = _grucell_reset_after_activation_initializer;
- uint32_t key;
- uint32_t i;
+ uint32_t key = 0;
+ uint32_t i = 0;
hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_H_STATE]->attr.dtype.vx_type );
fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_I_FC_Z]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dtype.vx_type );
- key = GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation );
+ key = GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( hstate_dtype, fc_dtype, out_dtype,
+ recurrent_activation, activation );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
@@ -362,12 +372,7 @@ static vsi_nn_kernel_node_t _setup
int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" );
int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" );
- if( activation != VSI_NN_ACT_TANH )
- {
- return NULL;
- }
-
- status = _query_kernel( kernel, inputs, outputs, recurrent_activation );
+ status = _query_kernel( kernel, inputs, outputs, recurrent_activation, activation );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c
index 48af7f85a..7e5a84650 100644
--- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c
@@ -246,6 +246,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer)
float sum_x2_tail1 = 1;
float work_item_pixels = 1;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -402,6 +404,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_means_initializer)
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
vsi_size_array_t * input_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -452,6 +456,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
vsi_size_array_t * input_shape = NULL;
vx_int32 width = 0, chn = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -808,6 +814,10 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t batch = 1;
vsi_bool ret = FALSE;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
+
memcpy(new_shape, inputs[0]->attr.size, sizeof(inputs[0]->attr.size));
if (new_shape[0] >= GPU_TENSOR_MAX_WIDTH || new_shape[1] >= GPU_TENSOR_MAX_WIDTH)
diff --git a/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c b/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c
index 00c31c319..ce097d624 100644
--- a/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c
@@ -168,6 +168,8 @@ DEF_KERNEL_INITIALIZER(_l1norm_initializer_axis)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis);
@@ -197,12 +199,12 @@ DEF_KERNEL_INITIALIZER(_l1norm_initializer_axis)
}
else if (axis == 1)
{
- gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];;
+ gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];
gpu_param.global_size[1] = depth;
}
else if (axis == 2)
{
- gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];;
+ gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];
gpu_param.global_size[1] = height;
}
diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c
index be4a29953..068257c43 100644
--- a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c
@@ -139,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
int32_t axis2Dflg = 0;
int32_t inputWidth = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c
index 966a6cdd8..0a477c525 100644
--- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c
@@ -250,6 +250,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
float inv_multiplier = 0;
int32_t height = 0, width = 0, chn = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
@@ -539,6 +541,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_axis01_sums_initializer)
int32_t height = 0;
int32_t chn = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -653,6 +657,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_axis01_initializer)
vx_uint32 group_num = 0;
vx_int32 height = 0, width = 0, chn = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
@@ -787,7 +793,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input2_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int32_t i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
@@ -832,7 +838,7 @@ static vsi_status _query_kernel_axis01
vsi_nn_kernel_dtype_e input2_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
@@ -917,6 +923,9 @@ static vsi_nn_kernel_node_t _setup_axis01
uint32_t axis_size = 0;
uint32_t rank_in = 0, rank_para = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
status = vsi_nn_kernel_optimize_tensor_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
axis, axis_num, new_shape[0], &rank_in, new_axis, &axis_size);
@@ -942,6 +951,7 @@ static vsi_nn_kernel_node_t _setup_axis01
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[0], rank_in);
kernel_sums = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
+ CHECK_PTR_FAIL_GOTO( kernel_sums, "Create kernel fail.", final );
// Assign unique_id
kernel_sums->unique_id = kernel->unique_id;
@@ -961,6 +971,7 @@ static vsi_nn_kernel_node_t _setup_axis01
attr.size[3] = new_shape[0][3];
attr.dim_num = rank_in;
tensor_sums = vsi_nn_CreateTensor( graph, &attr );
+ CHECK_PTR_FAIL_GOTO( tensor_sums, "Create tensor fail.", final );
status = _query_kernel_axis01(inputs, outputs, kernel_sums, kernel);
if ( VSI_SUCCESS != status )
@@ -972,6 +983,7 @@ static vsi_nn_kernel_node_t _setup_axis01
** sum(x) and sumsq(x*x)
*/
sums_node = vsi_nn_kernel_create_node(graph, kernel_sums);
+    CHECK_PTR_FAIL_GOTO( sums_node, "Create node fail.", final );

if (sums_node)
{
sums_node_params[0] = rs_input;
@@ -992,6 +1004,7 @@ static vsi_nn_kernel_node_t _setup_axis01
}
node = vsi_nn_kernel_create_node( graph, kernel );
+    CHECK_PTR_FAIL_GOTO( node, "Create node fail.", final );
if (node)
{
uint32_t index = 0;
@@ -1065,6 +1078,9 @@ static vsi_nn_kernel_node_t _setup_axis0
uint32_t rank_in = 0;
int32_t is_img2d_input = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
status = vsi_nn_kernel_optimize_tensor_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
axis, axis_num, new_shape[0], &rank_in, new_axis, &axis_size);
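Note: the int to size_t changes for the loop counters in this patch match the unsigned result of _cnt_of_array (a sizeof-based count), avoiding signed/unsigned comparison warnings; the idiom, with a hypothetical map name:

size_t i = 0;
for ( i = 0; i < _cnt_of_array( kernel_map ); i++ )   /* both operands unsigned */
{
    if ( kernel_map[i].key == key ) break;
}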
diff --git a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c
index 3ee30282d..4e7b8a087 100644
--- a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c
@@ -166,6 +166,8 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer)
float rlogE = (float)(log10(2.0f) / log10(exp(1.0f)));
float scaleLogE = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -482,7 +484,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -529,6 +531,9 @@ static vsi_nn_kernel_node_t _setup
int32_t axis = 0;
float beta = 1.0f;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
axis = vsi_nn_kernel_param_get_int32(params, "axis");
beta = vsi_nn_kernel_param_get_float32(params, "beta");
diff --git a/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c
index 890f7bc78..d59d851ed 100644
--- a/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c
@@ -98,7 +98,7 @@ DEF_KERNEL_INITIALIZER(_logical_not_initializer)
size_t param_size
)
{
- vsi_status status = VX_SUCCESS;
+ vsi_status status = VSI_FAILURE;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
3,
@@ -112,6 +112,8 @@ DEF_KERNEL_INITIALIZER(_logical_not_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -226,6 +228,8 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t new_rank = 0;
vsi_bool ret = FALSE;
+ VSI_UNREFERENCED(params);
+
ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
diff --git a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c
index 7e5476b74..54713cb08 100644
--- a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c
@@ -109,7 +109,7 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer)
size_t param_size
)
{
- vsi_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
3,
@@ -125,6 +125,8 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer)
vsi_nn_kernel_tensor_attr_t *input_attr = NULL, *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input);
CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -308,11 +310,11 @@ static vsi_nn_kernel_node_t _setup
outputs[0], shapes[2], new_rank );
#define _swap_tensor(a, b, tmp) \
- do { \
+ { \
tmp = a; \
a = b; \
b = tmp; \
- } while(0)
+ }
if (shapes[1][3] > shapes[0][3] && new_rank == 4)
{
diff --git a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c
index a99acc6cd..95232b9d1 100644
--- a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c
@@ -65,7 +65,8 @@ typedef enum _LSTMUNIT_nn_activation_e
#define LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \
_input_type, _output_type, _cell_type, _rec_act) \
((_is_ln << 31) | (_is_cifg << 30) | (_is_proj << 29) | (_is_hybrid << 28) | (_is_peephole << 27) \
-| (_input_type << 23) | (_output_type << 19) | (_cell_type << 15) | (_rec_act << 10))
+| (((uint32_t)_input_type) << 23) | (((uint32_t)_output_type) << 19) | (((uint32_t)_cell_type) << 15) \
+| (_rec_act << 10))
#define LSTMUNIT_ACTIVATION_SOURCE_NAME(_ln_cifg_proj_hybrid_, _input_type) \
"lstmunit_activation_"#_ln_cifg_proj_hybrid_"_"#_input_type
diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c
index 6e4ee41b1..f5dc60b1e 100644
--- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c
@@ -58,9 +58,12 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_14 "matrixmul_f16i16_i16"
#define KERNEL_SOURCE_15 "matrixmul_bf16"
#define KERNEL_SOURCE_16 "matrixmul_u8i16_i16"
+#define KERNEL_SOURCE_17 "matrixmul_merge"
+#define KERNEL_SOURCE_18 "matrixmul_cross"
+#define KERNEL_SOURCE_19 "matrixmul_cross_i16"
-#define HASH_MATRIX_MUL_KEY(_input0_type, _input1_type, _output_type, _trans_a, _trans_b) \
- ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_trans_a << 4) | (_trans_b))
+#define HASH_MATRIX_MUL_KEY(_type0, _type1, _type2, _trans_a, _trans_b, _cross) \
+ ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_trans_a << 4) | (_trans_b << 2) | (_cross))
#define HASH_MATRIX_MUL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE)
@@ -71,21 +74,37 @@ __BEGIN_DECLS
#define HASH_MATRIX_MUL_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.gemm_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE)
+#define HASH_MATRIX_MUL_SH_KERNEL_CROSS_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
+ CVIVANTE_NAMESPACE("evis.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_cross")
+
+#define HASH_MATRIX_MUL_SH_KERNEL_MERGE_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \
+ CVIVANTE_NAMESPACE("evis.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_merge")
+
#define TENSOR_MATRIX_MUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
- { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0), \
+ { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0, 0), \
HASH_MATRIX_MUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_MATRIX_MUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
- { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 1), \
+ { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 1, 0), \
HASH_MATRIX_MUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_MATRIX_MUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
- { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1, 0), \
+ { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1, 0, 0), \
HASH_MATRIX_MUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
SOURCE },
+#define TENSOR_MATRIX_MUL_CROSS_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
+ { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0, 1), \
+ HASH_MATRIX_MUL_SH_KERNEL_CROSS_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
+ SOURCE },
+
+#define TENSOR_MATRIX_MUL_MERGE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
+ { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0, 2), \
+ HASH_MATRIX_MUL_SH_KERNEL_MERGE_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \
+ SOURCE },
+
static const struct {
uint32_t key;
@@ -135,6 +154,14 @@ static const struct {
TENSOR_MATRIX_MUL_TRANSA_KERNELS(F16, F16, F16, KERNEL_SOURCE_7)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, I16, I16, KERNEL_SOURCE_7)
+ TENSOR_MATRIX_MUL_MERGE_KERNELS(U8, U8, U8, KERNEL_SOURCE_17)
+ TENSOR_MATRIX_MUL_MERGE_KERNELS(I8, I8, I8, KERNEL_SOURCE_17)
+ TENSOR_MATRIX_MUL_MERGE_KERNELS(I16, I16, I16, KERNEL_SOURCE_19)
+ TENSOR_MATRIX_MUL_MERGE_KERNELS(F16, F16, F16, KERNEL_SOURCE_17)
+ TENSOR_MATRIX_MUL_CROSS_KERNELS(U8, U8, U8, KERNEL_SOURCE_18)
+ TENSOR_MATRIX_MUL_CROSS_KERNELS(I8, I8, I8, KERNEL_SOURCE_18)
+ TENSOR_MATRIX_MUL_CROSS_KERNELS(I16, I16, I16, KERNEL_SOURCE_19)
+ TENSOR_MATRIX_MUL_CROSS_KERNELS(F16, F16, F16, KERNEL_SOURCE_18)
};
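Note: the rewritten HASH_MATRIX_MUL_KEY packs the new selector into the two lowest bits (0 = plain gemm, 1 = cross, 2 = merge) and moves _trans_b up to bit 2, so the old and new variants cannot collide; for example:

uint32_t k_transb = HASH_MATRIX_MUL_KEY( U8, U8, U8, 0, 1, 0 );   /* transposed-B kernel          */
uint32_t k_cross  = HASH_MATRIX_MUL_KEY( U8, U8, U8, 0, 0, 1 );   /* picks a *_cross entry/source */
uint32_t k_merge  = HASH_MATRIX_MUL_KEY( U8, U8, U8, 0, 0, 2 );   /* picks a *_merge entry/source */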
/*
@@ -154,7 +181,35 @@ static vx_param_description_t _matrix_mul_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kererl parameters here
};
+
+static vx_param_description_t _matrix_mul_kernel_cross_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    // Add kernel parameters here
+};
#define _MATRIX_MUL_PARAM_NUM _cnt_of_array( _matrix_mul_kernel_param_def )
+#define _MATRIX_MUL_CROSS_PARAM_NUM _cnt_of_array( _matrix_mul_kernel_cross_param_def )
/*
* Kernel initializer
@@ -180,7 +235,10 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
int32_t transB = 0;
int32_t width = 0;
int32_t height = 0;
- int32_t chn = 0;
+ vsi_size_t chn = 0;
+ int32_t a_depth = 0;
+ int32_t b_depth = 0;
+ vsi_size_t outer = 0;
int32_t src0ZP = 0;
float src0Scale = 0;
@@ -204,6 +262,8 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
uint32_t evis2 = 0;
vx_context ctx = vxGetContext((vx_reference)node);
vx_hardware_caps_params_t hw_param;
+
+ VSI_UNREFERENCED(param_size);
memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t));
status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t));
CHECK_STATUS_FAIL_GOTO(status, OnError );
@@ -294,22 +354,59 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP);
inOutScale = src0Scale * src1Scale / dstScale;
- if ((attr[0]->shape->size > attr[1]->shape->size) ||
- (attr[0]->shape->data[2] > attr[1]->shape->data[2]
- && attr[0]->shape->size > 2 && attr[1]->shape->size > 2))
+ a_depth = (int32_t)(attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1);
+ b_depth = (int32_t)(attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1);
+
+ if (b_depth == 1)
{
bc2zero = 1;
}
- else if ((attr[1]->shape->size > attr[0]->shape->size) ||
- (attr[1]->shape->data[2] > attr[0]->shape->data[2]
- && attr[0]->shape->size > 2 && attr[1]->shape->size > 2))
+ if (a_depth == 1)
{
ac2zero = 1;
}
width = (int32_t)(attr[2]->shape->data[0]);
height = (int32_t)(attr[2]->shape->data[1]);
- chn = (int32_t)(attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1);
+ chn = (attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1);
+
+ if (((attr[0]->shape->size == 4 && attr[1]->shape->size == 3) ||
+ (attr[0]->shape->size == 3 && attr[1]->shape->size == 4))
+ && attr[0]->shape->data[2] > 1 && attr[1]->shape->data[2] > 1
+ && chn != attr[0]->shape->data[2] * attr[1]->shape->data[2])
+ {
+ vsi_size_t iter = attr[0]->shape->data[2] * attr[1]->shape->data[2] / chn;
+ if (attr[0]->shape->size == 4)
+ {
+ ac2zero = 1;
+ bc2zero = 0;
+ chn = attr[1]->shape->data[2];
+ outer = attr[0]->shape->data[2] / iter;
+ }
+ else
+ {
+ ac2zero = 0;
+ bc2zero = 1;
+ chn = attr[0]->shape->data[2];
+ outer = attr[1]->shape->data[2] / iter;
+ }
+ }
+ else if (attr[0]->shape->size == 4 && attr[1]->shape->size == 3
+ && attr[0]->shape->data[2] != 1 && attr[1]->shape->data[2] != 1)
+ {
+ ac2zero = 1;
+ bc2zero = 0;
+ chn = attr[1]->shape->data[2];
+ outer = attr[0]->shape->data[2];
+ }
+ else if (attr[1]->shape->size == 4 && attr[0]->shape->size == 3
+ && attr[0]->shape->data[2] != 1 && attr[1]->shape->data[2] != 1)
+ {
+ ac2zero = 0;
+ bc2zero = 1;
+ chn = attr[0]->shape->data[2];
+ outer = attr[1]->shape->data[2];
+ }
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 4;
@@ -319,7 +416,7 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = gpu_align_p2((height + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1], 4);
- gpu_param.global_size[2] = chn;
+ gpu_param.global_size[2] = (size_t)chn;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
@@ -683,6 +780,12 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
uniI16MulI16SumtoI32_16x1.data[i] = multiplierZpB;
}
+ if (outer)
+ {
+ status = vsi_nn_kernel_gpu_add_param( node, "outer", &outer );
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+
switch( pack_key )
{
case _PACK_SELECT_KEY( U8, U8, F16, 0, 1, 0 ):
@@ -790,16 +893,19 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
"uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniI16MulI16SumtoI32_16x1", &uniI16MulI16SumtoI32_16x1 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniI16MulI16SumtoI32B_16x1", &uniI16MulI16SumtoI32B_16x1 );
status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP );
status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP );
status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP );
status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut );
- status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inScaledivOut );
- status |= vsi_nn_kernel_gpu_add_param( node, "inout_beta", &inout_beta );
+ if (outer == 0)
+ {
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniI16MulI16SumtoI32_16x1", &uniI16MulI16SumtoI32_16x1 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniI16MulI16SumtoI32B_16x1", &uniI16MulI16SumtoI32B_16x1 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inScaledivOut );
+ status |= vsi_nn_kernel_gpu_add_param( node, "inout_beta", &inout_beta );
+ }
}
break;
case _PACK_SELECT_KEY( F16, U8, F16, 0, 0, 0 ):
@@ -1093,6 +1199,308 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
return status;
} /* _matrix_mul_initializer() */
+DEF_KERNEL_INITIALIZER(_matrix_mul_cross_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t gpu_param = {
+ 3,
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0},
+ {0, 0, 0}
+ };
+
+ vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
+ int32_t transA = 0;
+ int32_t transB = 0;
+ int32_t width = 0;
+ int32_t height = 0;
+ int32_t axis_size = 0;
+
+ int32_t src0ZP = 0;
+ float src0Scale = 0;
+ int32_t src1ZP = 0;
+ float src1Scale = 0;
+ float dstZP = 0;
+ float dstScale = 0;
+
+ uint32_t pack_key = 0;
+
+ float mulKIn0In1Zp = 0;
+ float inOutScale = 0;
+ int32_t K = 0;
+
+ uint32_t evis2 = 0;
+ vx_context ctx = vxGetContext((vx_reference)node);
+ vx_hardware_caps_params_t hw_param;
+
+ VSI_UNREFERENCED(param_size);
+ memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t));
+ status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t));
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+
+ if (hw_param.evis2 == TRUE)
+ {
+ evis2 = 1;
+ }
+
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
+ attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
+ attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
+ CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
+
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &transA);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &transB);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &K);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &axis_size);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+
+ src0ZP = attr[0]->asymm.zero_point;
+ src0Scale = attr[0]->asymm.scale;
+ src1ZP = attr[1]->asymm.zero_point;
+ src1Scale = attr[1]->asymm.scale;
+ dstZP = (float)attr[2]->asymm.zero_point;
+ dstScale = attr[2]->asymm.scale;
+
+ if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
+ {
+ if (attr[0]->dfp.fl > 0)
+ {
+ src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
+ }
+ else
+ {
+ src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
+ }
+ src0ZP = 0;
+ }
+ else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
+ {
+ src0Scale = 1;
+ src0ZP = 0;
+ }
+
+ if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
+ {
+ if (attr[1]->dfp.fl > 0)
+ {
+ src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl)));
+ }
+ else
+ {
+ src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl));
+ }
+ src1ZP = 0;
+ }
+ else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
+ {
+ src1Scale = 1;
+ src1ZP = 0;
+ }
+
+ if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
+ {
+ if (attr[2]->dfp.fl > 0)
+ {
+ dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
+ }
+ else
+ {
+ dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
+ }
+ dstScale = 1.0f / dstScale;
+ dstZP = 0.0f;
+ }
+ else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
+ {
+ dstScale = 1;
+ dstZP = 0.0f;
+ }
+
+ mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP);
+ inOutScale = src0Scale * src1Scale / dstScale;
+
+ width = (int32_t)(attr[2]->shape->data[0]);
+ height = (int32_t)(attr[2]->shape->data[1]);
+
+ gpu_param.global_scale[0] = 4;
+ gpu_param.global_scale[1] = 4;
+ gpu_param.global_scale[2] = 1;
+
+ gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
+ / gpu_param.global_scale[0], 4);
+ gpu_param.global_size[1] = gpu_align_p2((height + gpu_param.global_scale[1] - 1)
+ / gpu_param.global_scale[1], 4);
+ gpu_param.global_size[2] = (size_t)axis_size;
+
+ status = vsi_nn_kernel_gpu_config( node, &gpu_param );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
+
+#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE, TRANSA, TRANSB, EVIS2) \
+ ((IN0_TYPE << 24) | (IN1_TYPE << 16) | (OUT_TYPE << 8) | (TRANSA << 4) | (TRANSB << 2) | (EVIS2))
+
+ pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype, transA, transB, evis2);
+ {
+ uint16_t M0 = 0;
+ uint16_t M1 = 0;
+ int32_t postShift0 = 0;
+ int32_t postShift1 = 0;
+ uint32_t multiplierA = 0;
+ uint32_t multiplierB = 0;
+ gpu_dp_inst_t uniGemmU8U8MulZptoFp32_8x4 = {{
+ 0xaaaaaaaa, 0xaaaaaaaa, // TCfg
+ 0xf02a0600, 0x2a8620e0, 0x0640e8f2, 0x60f0f42b, 0xf8f62b86, // BinSelect
+ 0x00000700, // AccumType, ConstantType, and PostShift
+ 0x03020302, 0x03020302, 0x03020302, 0x03020302,
+ 0x03020302, 0x03020302, 0x03020302, 0x03020302 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
+ 0x33333333, // TCfg
+ 0x11110000, // ASelt
+ 0x03020100, 0x03020100, // ABin
+ 0x00000000, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002400, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniGemmU8U8toFp32Block4_4x4 = {{
+ 0x55555555, // TCfg
+ 0x00000000, // ASelt
+ 0x32103210, 0x32103210, // ABin
+ 0x55555555, // BSelt
+ 0xd951c840, 0xfb73ea62, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniGemmU8F16toF32Lo_4x4b = {{
+ 0x55555555, // TCfg
+ 0x50505050, // ASelt
+ 0x51514040, 0x73736262, // ABin
+ 0x00000000, // BSelt
+ 0x32103210, 0x32103210, // BBin
+ 0x00000000, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+
+ gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4 = {{
+ 0x09090909, // TCfg
+ 0x04040404, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x0a0a0a0a, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00010001, 0x00000000, 0x00010001, 0x00000000,
+ 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniConvertUint8SubZpToFp32B_4x4 = {{
+ 0x09090909, // TCfg
+ 0x04040404, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x0a0a0a0a, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00010001, 0x00000000, 0x00010001, 0x00000000,
+ 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+
+ float reScaleOut = 1 / dstScale;
+ uint32_t multiplierU8ZpAB = (src0ZP << 24) | (src1ZP << 16) | (src0ZP << 8) | (src1ZP);
+ int32_t i = 8;
+ gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postShift0);
+ gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postShift1);
+
+ multiplierA = (M0 << 16) | M0;
+ multiplierB = (M1 << 16) | M1;
+
+ uniConvertUint8SubZpToFp32_4x4.data[7] |= (postShift0 & 0x1F);
+ uniConvertUint8SubZpToFp32B_4x4.data[7] |= (postShift1 & 0x1F);
+ for( i = 8; i < 16; i += 2)
+ {
+ uniConvertUint8SubZpToFp32_4x4.data[i] = multiplierA;
+ uniConvertUint8SubZpToFp32B_4x4.data[i] = multiplierB;
+ }
+ for( i = 8; i < 16; i++)
+ {
+ uniGemmU8U8MulZptoFp32_8x4.data[i] = multiplierU8ZpAB;
+ }
+
+ switch( pack_key )
+ {
+ case _PACK_SELECT_KEY( U8, U8, U8, 0, 0, 1 ):
+ case _PACK_SELECT_KEY( I8, I8, I8, 0, 0, 1 ):
+ {
+ status = vsi_nn_kernel_gpu_add_param( node,
+ "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniGemmU8U8toFp32Block4_4x4", &uniGemmU8U8toFp32Block4_4x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniGemmU8U8MulZptoFp32_8x4", &uniGemmU8U8MulZptoFp32_8x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale );
+ status |= vsi_nn_kernel_gpu_add_param( node, "mulKIn0In1Zp", &mulKIn0In1Zp );
+ status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP );
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+ break;
+ case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 0 ):
+ case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 1 ):
+ {
+ status = vsi_nn_kernel_gpu_add_param( node,
+ "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP );
+ status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP );
+ status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP );
+ status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut );
+ }
+ break;
+ case _PACK_SELECT_KEY( F16, F16, F16, 0, 0, 1 ):
+ {
+ status = vsi_nn_kernel_gpu_add_param( node,
+ "uniGemmU8F16toF32Lo_4x4b", &uniGemmU8F16toF32Lo_4x4b );
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+ break;
+ default:
+ break;
+ }
+ }
+#undef _PACK_SELECT_KEY
+
+OnError:
+ if (attr[0])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[0] );
+ attr[0] = NULL;
+ }
+ if (attr[1])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[1] );
+ attr[1] = NULL;
+ }
+ if (attr[2])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[2] );
+ attr[2] = NULL;
+ }
+ return status;
+} /* _matrix_mul_cross_initializer() */
+
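Note: the zero-point correction term in the cross initializer rounds K up to the next multiple of 4 (the dot-product block width) before multiplying by both zero points; with hypothetical values K = 10, src0ZP = 3, src1ZP = 5:

int32_t K = 10, src0ZP = 3, src1ZP = 5;                               /* hypothetical quantization params  */
float mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP); /* (13/4)*4 = 12; 12 * 5 * 3 = 180.0f */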
/*
* Query kernel
*/
@@ -1102,7 +1510,8 @@ static vsi_status _query_kernel
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
int32_t transa,
- int32_t transb
+ int32_t transb,
+ int32_t cross
)
{
vsi_status status = VSI_FAILURE;
@@ -1110,13 +1519,13 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input1_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
- key = HASH_MATRIX_MUL_KEY( input0_dtype, input1_dtype, output_dtype, transa, transb );
+ key = HASH_MATRIX_MUL_KEY( input0_dtype, input1_dtype, output_dtype, transa, transb, cross);
for( i = 0; i < _cnt_of_array(matrix_mul_map); i ++ )
{
@@ -1128,9 +1537,18 @@ static vsi_status _query_kernel
if ( i < _cnt_of_array(matrix_mul_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrix_mul_map[i].function_name );
- kernel->info.parameters = _matrix_mul_kernel_param_def;
- kernel->info.numParams = _cnt_of_array( _matrix_mul_kernel_param_def );
- kernel->info.initialize = _matrix_mul_initializer;
+ if (cross == 1)
+ {
+ kernel->info.parameters = _matrix_mul_kernel_cross_param_def;
+ kernel->info.numParams = _cnt_of_array( _matrix_mul_kernel_cross_param_def );
+ kernel->info.initialize = _matrix_mul_cross_initializer;
+ }
+ else
+ {
+ kernel->info.parameters = _matrix_mul_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _matrix_mul_kernel_param_def );
+ kernel->info.initialize = _matrix_mul_initializer;
+ }
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
@@ -1155,18 +1573,28 @@ static vsi_nn_kernel_node_t _setup
)
{
vsi_status status = VSI_FAILURE;
- vsi_nn_kernel_node_param_t tmp_params[_MATRIX_MUL_PARAM_NUM] = { NULL };
+ vsi_nn_kernel_node_param_t tmp_params[_MATRIX_MUL_CROSS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" );
int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" );
int32_t adjointA = vsi_nn_kernel_param_get_int32( params, "adjointA" );
int32_t adjointB = vsi_nn_kernel_param_get_int32( params, "adjointB" );
+ uint32_t cross_flg = vsi_nn_kernel_param_get_int32( params, "cross_flg" );
+ size_t tmp_size = 0;
+ uint32_t* size_axis_in_out = NULL;
+ uint32_t* stride_axis_in_out = NULL;
vsi_size_t M = inputs[0]->attr.size[1];
vsi_size_t K = inputs[0]->attr.size[0];
vsi_size_t N = inputs[1]->attr.size[0];
vsi_size_t depthA = 1, depthB = 1;
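+ /* When cross_flg == 1 the two buffers below are expected to supply three axis/inner/outer
+  * sizes and nine strides; they are forwarded further down as the twelve extra scalar
+  * parameters of the cross kernel. */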
+ size_axis_in_out = (uint32_t *)vsi_nn_kernel_param_get_buffer( params, "size_axis_inner_outer", &tmp_size);
+ stride_axis_in_out = (uint32_t *)vsi_nn_kernel_param_get_buffer( params, "stride_axis_inner_outer", &tmp_size);
+
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ((inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32
&& inputs[1]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32
&& outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32)
@@ -1209,13 +1637,14 @@ static vsi_nn_kernel_node_t _setup
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
}
- status = _query_kernel( inputs, outputs, kernel, transposeA, transposeB );
+ status = _query_kernel( inputs, outputs, kernel, transposeA, transposeB, cross_flg );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 3;
+ size_t param_num = cross_flg == 1 ? _MATRIX_MUL_CROSS_PARAM_NUM : _MATRIX_MUL_PARAM_NUM;
/* Pass parameters to node. */
if (rs_input)
{
@@ -1225,7 +1654,7 @@ static vsi_nn_kernel_node_t _setup
}
else
{
- vsi_nn_kernel_node_pack_io( tmp_params, _MATRIX_MUL_PARAM_NUM,
+ vsi_nn_kernel_node_pack_io( tmp_params, param_num,
inputs, 2, outputs, 1 );
}
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeA );
@@ -1235,7 +1664,22 @@ static vsi_nn_kernel_node_t _setup
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &M );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &K );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N );
- status = vsi_nn_kernel_node_pass_param( node, tmp_params, _MATRIX_MUL_PARAM_NUM );
+ if (cross_flg == 1)
+ {
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &size_axis_in_out[0] );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &size_axis_in_out[1] );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &size_axis_in_out[2] );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[0] );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[1] );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[2] );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[3] );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[4] );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[5] );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[6] );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[7] );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[8] );
+ }
+ status = vsi_nn_kernel_node_pass_param( node, tmp_params, param_num );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
@@ -1244,6 +1688,21 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &tmp_params[7] );
vsi_nn_kernel_scalar_release( &tmp_params[8] );
vsi_nn_kernel_scalar_release( &tmp_params[9] );
+ if (cross_flg == 1)
+ {
+ vsi_nn_kernel_scalar_release( &tmp_params[10] );
+ vsi_nn_kernel_scalar_release( &tmp_params[11] );
+ vsi_nn_kernel_scalar_release( &tmp_params[12] );
+ vsi_nn_kernel_scalar_release( &tmp_params[13] );
+ vsi_nn_kernel_scalar_release( &tmp_params[14] );
+ vsi_nn_kernel_scalar_release( &tmp_params[15] );
+ vsi_nn_kernel_scalar_release( &tmp_params[16] );
+ vsi_nn_kernel_scalar_release( &tmp_params[17] );
+ vsi_nn_kernel_scalar_release( &tmp_params[18] );
+ vsi_nn_kernel_scalar_release( &tmp_params[19] );
+ vsi_nn_kernel_scalar_release( &tmp_params[20] );
+ vsi_nn_kernel_scalar_release( &tmp_params[21] );
+ }
{
// Set default border mode.
vx_border_t border;
diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c
index 460ad87f7..d862eb752 100644
--- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c
@@ -153,6 +153,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
vsi_size_array_t * out_shape = NULL;
uint32_t pack_key;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -404,7 +406,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -453,6 +455,10 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type;
vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c
index 11478f544..cb9fc3563 100644
--- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c
@@ -153,6 +153,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
vsi_size_array_t * out_shape = NULL;
uint32_t pack_key;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -404,7 +406,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -453,6 +455,10 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type;
vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/evis/mod_evis.c b/src/tim/vx/internal/src/kernel/evis/mod_evis.c
index fe7edd7cc..70188f6e7 100644
--- a/src/tim/vx/internal/src/kernel/evis/mod_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/mod_evis.c
@@ -119,7 +119,7 @@ DEF_KERNEL_INITIALIZER(_mod_initializer)
{0, 0, 0},
{0, 0, 0}
};
- vx_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
vx_tensor input0 = (vx_tensor)param[0];
vx_tensor input1 = (vx_tensor)param[1];
vx_tensor output = (vx_tensor)param[2];
@@ -138,6 +138,8 @@ DEF_KERNEL_INITIALIZER(_mod_initializer)
float in1Tail = 0;
float outZp = 0;
+ VSI_UNREFERENCED(param_size);
+
input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0 );
CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c
index d79142617..9dc6eae47 100644
--- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c
@@ -162,7 +162,7 @@ static vx_param_description_t _moments_kernel_param_def[] =
};
#define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def )
-static int32_t set_constant_border
+static int32_t _set_constant_border
(
vsi_nn_kernel_node_t node,
int32_t value
@@ -172,9 +172,6 @@ static int32_t set_constant_border
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.S32 = value;
- border.constant_value.U32 = (vx_uint32)value;
- border.constant_value.S16 = (vx_int16)value;
- border.constant_value.U8 = (vx_uint8)value;
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
return status;
}
@@ -226,6 +223,8 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -797,7 +796,9 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
+
+ VSI_UNREFERENCED(params);
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -866,6 +867,9 @@ static vsi_nn_kernel_node_t _setup
vsi_bool image_2d = FALSE;
vsi_bool is_continue_axis = TRUE;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
axis_num = (int32_t)axis_num_temp;
for ( i = 1; i < axis_num; i++)
@@ -901,7 +905,7 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[1], shapes[1], rank_out );
- if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size,
+ if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size,
reshape_tensors[1]->attr.dim_num ) )
{
return NULL;
@@ -911,10 +915,10 @@ static vsi_nn_kernel_node_t _setup
axis_first = new_axis[0];
status = _query_kernel( inputs, outputs, kernel, params, new_axis, axis_size, image_2d );
- if( VSI_SUCCESS == status)
+ if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
- if( node )
+ if ( node )
{
uint32_t index = 3;
/* Pass parameters to node. */
@@ -926,17 +930,14 @@ static vsi_nn_kernel_node_t _setup
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
- status = set_constant_border(node, vsi_nn_get_tensor_zero_point(inputs[0]));
+ status = _set_constant_border(node, 0);
CHECK_STATUS(status);
}
}
- for(i = 0; i < 3; i++)
+ for (i = 0; i < 3; i++)
{
- if(reshape_tensors[i])
- {
- vsi_nn_ReleaseTensor(&reshape_tensors[i]);
- }
+ vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
diff --git a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c
new file mode 100644
index 000000000..28ff2d1ae
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c
@@ -0,0 +1,614 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+typedef enum
+{
+ INTERNAL_KERNEL_NEAREST_GRID_SAMPLE,
+} _internal_kernel_e;
+
+#define STR(a) #a
+
+#define _NEAREST_GRID_SAMPLE_KERNEL_SOURCE(_input_type, _output_type) \
+ "nearest_grid_sample_" #_input_type "_to_" #_output_type
+
+// Add kernel hashtable here
+#define NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
+ ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE))
+#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
+ { \
+ NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
+ CVIVANTE_NAMESPACE("evis.nearest_grid_sample_" STR( \
+ IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
+ _NEAREST_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
+ }
+
+typedef struct
+{
+ uint32_t key;
+ char * function_name;
+ const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _nearest_grid_sample_kernel_map[] =
+{
+ PACK_KERNEL_MAP(F16, F32, F16),
+ PACK_KERNEL_MAP(F16, U8, F16),
+ PACK_KERNEL_MAP(F16, F16, F16),
+ PACK_KERNEL_MAP(F16, F32, U8),
+ PACK_KERNEL_MAP(F16, F16, U8),
+ PACK_KERNEL_MAP(F16, U8, U8),
+ PACK_KERNEL_MAP(U8, U8, U8),
+ PACK_KERNEL_MAP(U8, F16, U8),
+ PACK_KERNEL_MAP(U8, F32, U8),
+ PACK_KERNEL_MAP(I16, I16, I16),
+ PACK_KERNEL_MAP(I8, I8, I8),
+ PACK_KERNEL_MAP(BF16, BF16, BF16),
+};
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _nearest_grid_sample_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+#define _NEAREST_GRID_SAMPLE_PARAM_NUM _cnt_of_array( _nearest_grid_sample_kernel_param_def )
+
+#define SCALAR_ALIGN_CORNERS (3)
+
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_nearest_grid_sample_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+#define MAX_POST_SHIFT_BITS (31)
+#define MAX_MULTIPLIER_NUM (65535)
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
+ vsi_nn_kernel_tensor_attr_t* output_attr = NULL;
+ vsi_nn_kernel_tensor_attr_t* input_attr[2] = {NULL};
+ vsi_size_array_t* out_shape = NULL;
+ vsi_size_array_t* in0_shape = NULL;
+ vsi_nn_kernel_dtype_e input0_dtype = F16;
+ vsi_nn_kernel_dtype_e input1_dtype = F16;
+ vsi_nn_kernel_dtype_e output_dtype = F16;
+
+ uint32_t depth = 0;
+ float half_input0_wh[2];
+ float add_float_value[2];
+ uint32_t in0_width;
+ uint32_t in0_height;
+ uint32_t out_width;
+ uint32_t out_height;
+ int32_t align_corners;
+
+ float input0_scale = 1.0;
+ int32_t input0ZP = 0;
+ float input1_scale = 1.0;
+ int32_t input1ZP = 0;
+ float output_scale = 1.0;
+ int32_t outputZP = 0;
+
+ VSI_UNREFERENCED(param_size);
+
+ input_attr[0] =
+ vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]);
+ CHECK_PTR_FAIL_GOTO(
+ input_attr[0], "Create tensor attr buffer fail.", final);
+
+ input_attr[1] =
+ vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]);
+ CHECK_PTR_FAIL_GOTO(
+ input_attr[1], "Create tensor attr buffer fail.", final);
+
+ output_attr =
+ vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
+ CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final);
+
+ status = vsi_nn_kernel_scalar_read_int32(
+ (vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners));
+ CHECK_STATUS_FAIL_GOTO(status, final);
+
+ out_shape = output_attr->shape;
+ in0_shape = input_attr[0]->shape;
+ input0_dtype = input_attr[0]->dtype;
+ input1_dtype = input_attr[1]->dtype;
+ output_dtype = output_attr->dtype;
+
+ input0_scale = input_attr[0]->scale;
+ input0ZP = input_attr[0]->zero_point;
+ input1_scale = input_attr[1]->scale;
+ input1ZP = input_attr[1]->zero_point;
+ output_scale = output_attr->scale;
+ outputZP = output_attr->zero_point;
+
+
+ in0_width = (uint32_t)(in0_shape->data[0]);
+ in0_height = (uint32_t)(in0_shape->data[1]);
+ depth = (uint32_t)(in0_shape->data[2]);
+ out_width = (uint32_t)(out_shape->data[0]);
+ out_height = (uint32_t)(out_shape->data[1]);
+
+ if (align_corners) {
+ half_input0_wh[0] = ((float)in0_width - 1.0f) * 0.5f;
+ half_input0_wh[1] = ((float)in0_height - 1.0f) * 0.5f;
+ add_float_value[0] = half_input0_wh[0] + 0.5f;
+ add_float_value[1] = half_input0_wh[1] + 0.5f;
+ } else {
+ half_input0_wh[0] = (float)in0_width * 0.5f;
+ half_input0_wh[1] = (float)in0_height * 0.5f;
+ add_float_value[0] = half_input0_wh[0];
+ add_float_value[1] = half_input0_wh[1];
+ }
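+ /* half_input0_wh maps a normalized grid coordinate into pixel space ((W-1)/2, (H-1)/2 with
+  * align_corners, W/2, H/2 otherwise); add_float_value also folds in the +0.5 rounding term,
+  * so the shader can presumably pick the nearest source pixel as
+  * floor(g * half_input0_wh + add_float_value). */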
+
+ status = vsi_nn_kernel_gpu_add_param(node, "half_input0_wh", half_input0_wh);
+ status |= vsi_nn_kernel_gpu_add_param(node, "add_float_value", add_float_value);
+ status |= vsi_nn_kernel_gpu_add_param(node, "depth", &depth);
+
+ {
+ gpu_dp_inst_t uniFp16toFp32_part0_4x4 = {
+ {
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000,
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+ },
+ GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniFp16toFp32_part1_4x4 = {
+ {
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00050004, 0x00070006, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000,
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+ },
+ GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniU8SubZPtoFp32_part0_4x4 = {
+ {
+ 0x09090909, // TCfg
+ 0x04040404, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x0a0a0a0a, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00010001, 0x00000000, 0x00010001, 0x00000000,
+ 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniU8SubZPtoFp32_part1_4x4 = {
+ {
+ 0x09090909, // TCfg
+ 0x04040404, // ASelt
+ 0x00050004, 0x00070006, // ABin
+ 0x0a0a0a0a, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00010001, 0x00000000, 0x00010001, 0x00000000,
+ 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniExtact8Bit_2x8 = {{
+ 0x33333333, // TCfg
+ 0x11110000, // ASelt
+ 0x03020100, 0x03020100, // ABin
+ 0x00000000, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002400, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ if (F16 == input0_dtype &&
+ (F16 == input1_dtype || F32 == input1_dtype ||
+ U8 == input1_dtype) &&
+ F16 == output_dtype) {
+ if (F16 == input1_dtype) {
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4);
+ } else if (U8 == input1_dtype) {
+ status |=
+ vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP);
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "input1Scale", &input1_scale);
+ status |=
+ vsi_nn_kernel_gpu_add_param(node,
+ "uniU8SubZPtoFp32_part0_4x4",
+ &uniU8SubZPtoFp32_part0_4x4);
+ status |=
+ vsi_nn_kernel_gpu_add_param(node,
+ "uniU8SubZPtoFp32_part1_4x4",
+ &uniU8SubZPtoFp32_part1_4x4);
+ }
+ } else if (F16 == input0_dtype &&
+ (F16 == input1_dtype || F32 == input1_dtype ||
+ U8 == input1_dtype) &&
+ U8 == output_dtype) {
+ float uint8Scale = 1.0f / output_scale;
+ float uint8ZP_out = (float)outputZP;
+ status |= vsi_nn_kernel_gpu_add_param(node, "uint8Scale", &uint8Scale);
+ status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &uint8ZP_out);
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8);
+ if (U8 == input1_dtype) {
+ status |=
+ vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP);
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "input1Scale", &input1_scale);
+ status |=
+ vsi_nn_kernel_gpu_add_param(node,
+ "uniU8SubZPtoFp32_part0_4x4",
+ &uniU8SubZPtoFp32_part0_4x4);
+ status |=
+ vsi_nn_kernel_gpu_add_param(node,
+ "uniU8SubZPtoFp32_part1_4x4",
+ &uniU8SubZPtoFp32_part1_4x4);
+ } else if (F16 == input1_dtype) {
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4);
+ }
+ }
+ else if (U8 == input0_dtype &&
+ (F16 == input1_dtype || F32 == input1_dtype ||
+ U8 == input1_dtype) &&
+ U8 == output_dtype) {
+ uint16_t M0 = 0;
+ int32_t postShift = 0;
+ uint32_t multAndoutZP[2] = {0};
+ gpu_dp_inst_t uniMultiplyAndPostShift_2x8 = {{
+ 0xdddddddd, // TCfg
+ 0x44444444, // ASelt
+ 0x13121110, 0x17161514, // ABin
+ 0x11111111, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002400, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+
+ gpu_quantize_multiplier_16bit(
+ (double)input0_scale / (double)output_scale, &M0, &postShift);
+
+ multAndoutZP[0] = (uint32_t)(M0);
+ multAndoutZP[1] =
+ (uint32_t)((outputZP << postShift) - input0ZP * M0);
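+ /* M0 / 2^postShift approximates input0_scale / output_scale, so the shader's
+  * multiply-and-post-shift presumably yields
+  * out ~= (in - input0ZP) * input0_scale / output_scale + outputZP. */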
+
+ uniMultiplyAndPostShift_2x8.data[7] |= (postShift & 0x1F);
+ status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP);
+ status |= vsi_nn_kernel_gpu_add_param( node, "uniMultiplyAndPostShift_2x8",
+ &uniMultiplyAndPostShift_2x8);
+ if (U8 == input1_dtype) {
+ status |= vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP);
+ status |= vsi_nn_kernel_gpu_add_param(node, "input1Scale", &input1_scale);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part0_4x4",
+ &uniU8SubZPtoFp32_part0_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part1_4x4",
+ &uniU8SubZPtoFp32_part1_4x4);
+ }
+ else if (F16 == input1_dtype) {
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4);
+ }
+ }
+ else if (BF16 == input0_dtype && BF16 == input1_dtype &&
+ BF16 == output_dtype) {
+ gpu_dp_inst_t uniBF16toFp32_part0_2x8 = {
+ {
+ 0x11111111, // TCfg
+ 0x01010101, // ASelt
+ 0x01050004, 0x03070206, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniBF16toFp32_part1_2x8 = {
+ {
+ 0x11111111, // TCfg
+ 0x01010101, // ASelt
+ 0x05050404, 0x07070606, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16};
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "uniBF16toFp32_part0_2x8", &uniBF16toFp32_part0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(
+ node, "uniBF16toFp32_part1_2x8", &uniBF16toFp32_part1_2x8);
+ }
+ else if (((I16 == input0_dtype && I16 == input1_dtype &&
+ I16 == output_dtype)) ||
+ ((I8 == input0_dtype && I8 == input1_dtype &&
+ I8 == output_dtype))) {
+ uint16_t M0 = 0;
+ int32_t postShift = 0;
+ uint32_t i = 0;
+ gpu_dp_inst_t uniDFPtoFp32_part0_4x4 = {{
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000300, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000,
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniDFPtoFp32_part1_4x4 = {{
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00050004, 0x00070006, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000300, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000,
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_dp_inst_t uniConvertI8toI8_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00000000, // ASelt
+ 0x03020100, 0x07060504, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16};
+ gpu_quantize_multiplier_16bit(
+ (double)input0_scale / (double)output_scale, &M0, &postShift);
+ uniConvertI8toI8_2x8.data[7] |= (postShift & 0x1F);
+ for (i = 0; i < 8; i++) {
+ uniConvertI8toI8_2x8.data[i + 8] = M0;
+ }
+
+ status |= vsi_nn_kernel_gpu_add_param(node, "input1_scale", &input1_scale);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniDFPtoFp32_part0_4x4", &uniDFPtoFp32_part0_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniDFPtoFp32_part1_4x4", &uniDFPtoFp32_part1_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertI8toI8_2x8", &uniConvertI8toI8_2x8);
+ }
+ else {
+ VSILOGE("input or output format is not supported");
+ status = VSI_FAILURE;
+ }
+ }
+ CHECK_STATUS_FAIL_GOTO(status, final);
+
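+ /* Each work-item handles global_scale[0] = 4 output pixels along x, so the global work
+  * size is the output extent divided by the per-item scale, rounded up. */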
+ gpu_param.global_scale[0] = 4;
+ gpu_param.global_scale[1] = 1;
+ gpu_param.global_scale[2] = 1;
+
+ gpu_param.dim = 2;
+ gpu_param.global_size[0] =
+ (out_width + gpu_param.global_scale[0] - 1) /
+ gpu_param.global_scale[0];
+ gpu_param.global_size[1] = ((out_height + gpu_param.global_scale[1] - 1) /
+ gpu_param.global_scale[1]);
+
+ status = vsi_nn_kernel_gpu_config(node, &gpu_param);
+
+#undef MAX_MULTIPLIER_NUM
+#undef MAX_POST_SHIFT_BITS
+
+ final:
+#define SAFE_FREE_TENSOR_ATTR(_PTR) \
+ if (_PTR) { \
+ vsi_nn_kernel_tensor_attr_release(&_PTR); \
+ _PTR = NULL; \
+ }
+ SAFE_FREE_TENSOR_ATTR(output_attr);
+ SAFE_FREE_TENSOR_ATTR(input_attr[0]);
+ SAFE_FREE_TENSOR_ATTR(input_attr[1]);
+
+ return status;
+} /* _nearest_grid_sample_initializer() */
+
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_kernel_t * kernel,
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_dtype_e in0_dtype, in1_dtype;
+ vsi_nn_kernel_dtype_e out_dtype;
+ const _kernel_map_type * kernel_map = _nearest_grid_sample_kernel_map;
+ size_t kernel_map_size = _cnt_of_array( _nearest_grid_sample_kernel_map );
+ vx_param_description_t * param_def = _nearest_grid_sample_kernel_param_def;
+ vx_kernel_initialize_f initializer = _nearest_grid_sample_initializer;
+
+ uint32_t key;
+ uint32_t i;
+
+ in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
+ in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type);
+ out_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type);
+
+ key = NEAREST_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype);
+
+ for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
+ {
+ if ( kernel_map[i].key == key )
+ {
+ break;
+ }
+ }
+ if ( i < (uint32_t)kernel_map_size )
+ {
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
+ kernel->info.parameters = param_def;
+ kernel->info.numParams = _cnt_of_array( _nearest_grid_sample_kernel_param_def );
+ kernel->info.initialize = initializer;
+ // Register code source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+ "vsi_nn_kernel_header",
+ kernel_map[i].source_name );
+ // Register binary source
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ kernel_map[i].source_name );
+ status = VSI_SUCCESS;
+ }
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t node_params[_NEAREST_GRID_SAMPLE_PARAM_NUM];
+ vsi_nn_kernel_node_t node = NULL;
+ vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
+ uint32_t final_in1_rank = 0;
+ vsi_nn_tensor_t* rs_tensors = NULL;
+ vsi_nn_tensor_t* final_tensors[3] = {NULL};
+ vsi_nn_kernel_dtype_e in0_dtype;
+ uint32_t pad_val = 0;
+ int32_t align_corners =
+ vsi_nn_kernel_param_get_int32(params, "align_corners");
+
+ // Check if gpu can support the size
+ if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
+ inputs[0]->attr.dim_num)) {
+ return NULL;
+ }
+
+ if (!vsi_nn_kernel_gpu_check_shape(inputs[1]->attr.size,
+ inputs[1]->attr.dim_num)) {
+ return NULL;
+ }
+
+ final_tensors[0] = inputs[0];
+
+ if (inputs[1]->attr.dim_num >= 3) {
+ final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0];
+ final_shape[1] = inputs[1]->attr.size[2];
+ final_shape[2] = 1;
+ final_shape[3] =
+ inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1;
+ final_in1_rank =
+ inputs[1]->attr.dim_num == 3 ? 2 : inputs[1]->attr.dim_num;
+ if (!vsi_nn_kernel_gpu_check_shape(final_shape, final_in1_rank)) {
+ return NULL;
+ }
+
+ rs_tensors = vsi_nn_reshape_tensor(
+ graph, inputs[1], final_shape, final_in1_rank);
+ final_tensors[1] = rs_tensors;
+ } else {
+ final_tensors[1] = inputs[1];
+ }
+ final_tensors[2] = outputs[0];
+
+ in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
+ if (U8 == in0_dtype) {
+ pad_val = inputs[0]->attr.dtype.zero_point;
+ }
+
+ status = _query_kernel( kernel, inputs, outputs );
+ if ( VSI_SUCCESS == status)
+ {
+ node = vsi_nn_kernel_create_node( graph, kernel );
+ if ( node )
+ {
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io( node_params, _NEAREST_GRID_SAMPLE_PARAM_NUM,
+ final_tensors, input_num, &final_tensors[2], output_num );
+ node_params[SCALAR_ALIGN_CORNERS] =
+ vsi_nn_kernel_scalar_create(graph, I32, &align_corners);
+ /* Pass parameters to node. */
+ status = vsi_nn_kernel_node_pass_param( node, node_params, _NEAREST_GRID_SAMPLE_PARAM_NUM );
+ VSI_ASSERT(status == VSI_SUCCESS);
+ vsi_nn_kernel_scalar_release(&node_params[SCALAR_ALIGN_CORNERS]);
+ {
+ // Set default border mode.
+ vx_border_t border;
+ border.mode = VX_BORDER_CONSTANT;
+ border.constant_value.U32 = pad_val;
+ status = vxSetNodeAttribute(
+ (vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
+ CHECK_STATUS(status);
+ }
+ }
+ }
+ vsi_safe_release_tensor(rs_tensors);
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_EVIS( nearest_grid_sample, _setup )
+
diff --git a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c
index 5dc05023c..de2d35add 100644
--- a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c
@@ -148,6 +148,8 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer)
int32_t srcFixPointPos = 0;
vsi_nn_kernel_dtype_e input_dtype = F16;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -422,6 +424,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
int32_t i = 0;
+ size_t j = 0;
vsi_bool image_2d = FALSE;
vsi_size_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr);
vsi_size_t prefix_dim_size = 1;
@@ -505,11 +508,11 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_ReleaseTensor( &rs_tensors[1] );
}
- for (i = SCALAR_INPUT_SUFFIX_SIZE; i < _ONE_HOT_PARAM_NUM; i++)
+ for (j = SCALAR_INPUT_SUFFIX_SIZE; j < _ONE_HOT_PARAM_NUM; j++)
{
- if (node_params[i])
+ if (node_params[j])
{
- vsi_nn_kernel_scalar_release( &node_params[i] );
+ vsi_nn_kernel_scalar_release( &node_params[j] );
}
}
diff --git a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c
index a625d97f8..e45704fe6 100644
--- a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c
@@ -146,6 +146,8 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer)
int32_t output_ZP = 0;
vsi_bool image_2d = FALSE;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c
index b4d4f218c..679526e6a 100644
--- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c
@@ -149,6 +149,8 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -377,7 +379,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -423,6 +425,10 @@ static vsi_nn_kernel_node_t _setup
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c
index 498ee4528..52588a4d4 100644
--- a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c
@@ -84,6 +84,8 @@ static vx_param_description_t vxPreProcessBgraKernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _EVIS_PRE_PROCESS_BGRA_PARAM_NUM _cnt_of_array(vxPreProcessBgraKernel_param_def)
@@ -115,6 +117,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -391,7 +395,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_convert_type_e convert_type = SCALE;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
@@ -449,6 +453,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
@@ -469,7 +476,9 @@ static vsi_nn_kernel_node_t _setup
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
- float bgra_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" );
+ float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" );
+ float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" );
+ float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
@@ -496,9 +505,11 @@ static vsi_nn_kernel_node_t _setup
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
- tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &bgra_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_BGRA_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[2] );
@@ -511,6 +522,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &tmp_params[9] );
vsi_nn_kernel_scalar_release( &tmp_params[10] );
vsi_nn_kernel_scalar_release( &tmp_params[11] );
+ vsi_nn_kernel_scalar_release( &tmp_params[12] );
+ vsi_nn_kernel_scalar_release( &tmp_params[13] );
}
}
diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c
index 797c925b2..1973eb2a3 100644
--- a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c
@@ -124,6 +124,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -224,6 +226,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -388,6 +392,8 @@ DEF_KERNEL_INITIALIZER(_resize_gray_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -516,7 +522,7 @@ static vsi_status _query_kernel
vsi_nn_gray_convert_type_e convert_type = SCALE;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int32_t i = 0;
+ size_t i = 0;
vsi_bool is_4_over_3 = FALSE;
vsi_bool is_half_scale = FALSE;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
@@ -605,6 +611,9 @@ static vsi_nn_kernel_node_t _setup
float scale = vsi_nn_kernel_param_get_float32( params, "scale" );
vsi_bool is_no_range_change = FALSE;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c
index fe39a5cfb..a0d76f4ba 100644
--- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c
@@ -112,6 +112,8 @@ static vx_param_description_t vxPreProcessNv12Kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _EVIS_PRE_PROCESS_NV12_PARAM_NUM _cnt_of_array(vxPreProcessNv12Kernel_param_def)
@@ -136,13 +138,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
int32_t order1 = 2;
uint32_t width = 0;
uint32_t height = 0;
- float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f;
- float outputScaleVar = 0.0f;
+ float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f;
+ float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f;
+ float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -152,10 +157,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &var);
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
output_scale = 1.0f / attr[0]->scale;
@@ -169,10 +178,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
order1 = 0;
}
- outputScaleVar = output_scale * var;
- bMeanScaleVarZp = output_zp - bMean * outputScaleVar;
- gMeanScaleVarZp = output_zp - gMean * outputScaleVar;
- rMeanScaleVarZp = output_zp - rMean * outputScaleVar;
+ outputScaleVar_b = output_scale * b_scale;
+ outputScaleVar_g = output_scale * g_scale;
+ outputScaleVar_r = output_scale * r_scale;
+ bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b;
+ gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g;
+ rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r;
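+ /* Per-channel normalization folded into one multiply-add:
+  * out = in * outputScaleVar_c + cMeanScaleVarZp
+  *     = (in - cMean) * c_scale * output_scale + output_zp. */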
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
@@ -255,7 +266,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8);
- status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
@@ -317,14 +330,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
uint32_t yrIntFloat_16 = 0;
int32_t xRatio = 0;
int32_t yRatio = 0;
- float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f;
- float outputScaleVar = 0.0f;
+ float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f;
+ float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f;
+ float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
float resize = 0.0f;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -341,10 +357,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &var);
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[1]->shape;
output_scale = 1.0f / attr[1]->scale;
@@ -364,10 +384,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1);
yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1);
- outputScaleVar = output_scale * var;
- bMeanScaleVarZp = output_zp - bMean * outputScaleVar;
- gMeanScaleVarZp = output_zp - gMean * outputScaleVar;
- rMeanScaleVarZp = output_zp - rMean * outputScaleVar;
+ outputScaleVar_b = output_scale * b_scale;
+ outputScaleVar_g = output_scale * g_scale;
+ outputScaleVar_r = output_scale * r_scale;
+ bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b;
+ gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g;
+ rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r;
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
@@ -472,7 +494,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16);
- status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
@@ -537,7 +561,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_convert_type_e convert_type = SCALE;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_size_t dstWidth = outputs[0]->attr.size[0];
float scaleVal = (float)dstWidth / ((scale_x * dstWidth) >> 15);
@@ -611,6 +635,9 @@ static vsi_nn_kernel_node_t _setup
int32_t trans = 0;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
@@ -630,7 +657,9 @@ static vsi_nn_kernel_node_t _setup
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
- float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" );
+ float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" );
+ float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" );
+ float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
int32_t nv_type = vsi_nn_kernel_param_get_int32( params, "nv_type" );
@@ -645,10 +674,12 @@ static vsi_nn_kernel_node_t _setup
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
- tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &nv_type );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[3] );
@@ -662,6 +693,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &tmp_params[11] );
vsi_nn_kernel_scalar_release( &tmp_params[12] );
vsi_nn_kernel_scalar_release( &tmp_params[13] );
+ vsi_nn_kernel_scalar_release( &tmp_params[14] );
+ vsi_nn_kernel_scalar_release( &tmp_params[15] );
}
}
vsi_safe_release_tensor(reshape_tensors[0]);
diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c
index ddfc9b5a8..256f7e5ce 100644
--- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c
@@ -143,8 +143,10 @@ static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
- {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
- {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
@@ -162,8 +164,10 @@ static vx_param_description_t _pre_process_rgb888_planar_sep_kernel_param_def[]
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
- {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
- {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
@@ -195,8 +199,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer)
float output_zp = 0;
float output_scale = 1;
+ int32_t reverse = 0;
+ int32_t rgb_order[4] = {0};
uint32_t width = 0;
- uint32_t height = 0;
+ int32_t height = 0;
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
@@ -210,30 +216,28 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer)
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
}
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale);
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 4], &reverse);
+ status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &height);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- out_shape = attr[0]->shape;
- width = (uint32_t)(out_shape->data[0]);
- height = (uint32_t)(out_shape->data[1]);
-
- if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
+ if (reverse)
{
- if ( attr[0]->dfp.fl > 0 )
- {
- output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl);
- }
- else
- {
- output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
- }
+ rgb_order[0] = 2 * height;
+ rgb_order[1] = height;
+ rgb_order[2] = 0;
}
- else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
+ else
{
- output_zp = (float)attr[0]->asymm.zero_point;
- output_scale /= attr[0]->asymm.scale;
+ rgb_order[0] = 0;
+ rgb_order[1] = height;
+ rgb_order[2] = 2 * height;
}
+ out_shape = attr[0]->shape;
+ width = (uint32_t)(out_shape->data[0]);
+ output_scale /= attr[0]->scale;
+ output_zp = (float)attr[0]->zero_point;
+
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
@@ -322,7 +326,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4);
-
+ status |= vsi_nn_kernel_gpu_add_param(node, "rgb_order", &rgb_order);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
@@ -363,8 +367,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
float output_zp = 0;
float output_scale = 1;
- uint32_t width = 0;
- uint32_t height = 0;
+ uint32_t width = 0;
+ int32_t height = 0;
+ int32_t reverse = 0;
+ int32_t rgb_order[4] = {0};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
@@ -378,12 +384,25 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
}
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale);
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 4], &reverse);
+ status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &height);
CHECK_STATUS_FAIL_GOTO(status, OnError );
+ if (reverse)
+ {
+ rgb_order[0] = 2 * height;
+ rgb_order[1] = height;
+ rgb_order[2] = 0;
+ }
+ else
+ {
+ rgb_order[0] = 0;
+ rgb_order[1] = height;
+ rgb_order[2] = 2 * height;
+ }
+
out_shape = attr[0]->shape;
width = (uint32_t)(out_shape->data[0]);
- height = (uint32_t)(out_shape->data[1]);
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
@@ -435,6 +454,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
status = vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevLo_2x8", &uniDataMeanStddevLo_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevHi_2x8", &uniDataMeanStddevHi_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "rgb_order", &rgb_order);
status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
CHECK_STATUS_FAIL_GOTO(status, OnError );
@@ -464,11 +484,13 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer)
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread
- uint32_t width = 0;
- uint32_t height = 0;
- vsi_bool is_4_over_3 = 0;
+ uint32_t width = 0;
+ int32_t height = 0;
+ vsi_bool is_4_over_3 = 0;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ int32_t reverse = 0;
+ int32_t rgb_order[4] = {0};
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -482,12 +504,28 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer)
}
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 4], &reverse);
+ status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &height);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+
+ if (reverse)
+ {
+ rgb_order[0] = 2 * height;
+ rgb_order[1] = height;
+ rgb_order[2] = 0;
+ }
+ else
+ {
+ rgb_order[0] = 0;
+ rgb_order[1] = height;
+ rgb_order[2] = 2 * height;
+ }
+
out_shape = attr[1]->shape;
width = (uint32_t)(out_shape->data[0]);
- height = (uint32_t)(out_shape->data[1]);
is_4_over_3 = (attr[0]->shape->data[0] * 3 == width * 4) &&
- (attr[0]->shape->data[1] * 3 == height * 4);
+ (attr[0]->shape->data[1] * 3 == (vsi_size_t)height * 4);
if (is_4_over_3)
{
@@ -570,7 +608,7 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l01_4x4", &uniBilinear_4over3_l01_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l11_4x4", &uniBilinear_4over3_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l21_4x4", &uniBilinear_4over3_l21_4x4);
-
+ status |= vsi_nn_kernel_gpu_add_param(node, "rgb_order", &rgb_order);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
@@ -609,7 +647,7 @@ static vsi_status _query_kernel
_internal_scale_e scale_type = SCALE;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int32_t i = 0;
+ size_t i = 0;
vsi_bool is_4_over_3 = FALSE;
vsi_bool is_half_scale = FALSE;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
@@ -679,8 +717,7 @@ static vsi_status _query_kernel
{
kernel->info.initialize = _pre_process_rgb888_planar_initializer;
}
- vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
- "vsi_nn_kernel_header",
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
pre_process_rgb888_planar_kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
pre_process_rgb888_planar_kernel_map[i].source_name );
@@ -705,19 +742,31 @@ static vsi_nn_kernel_node_t _setup
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t* node_params = NULL;
vsi_nn_kernel_node_t node = NULL;
- int32_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM;
+ vsi_nn_tensor_t* reshape_tensor = NULL;
+ vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+ size_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM;
int32_t width = vsi_nn_kernel_param_get_int32( params, "width" );
int32_t height = vsi_nn_kernel_param_get_int32( params, "height" );
+ int32_t output_height = (int32_t)outputs[0]->attr.size[1];
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
- float scale = vsi_nn_kernel_param_get_float32( params, "scale" );
+ float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" );
+ int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
+ float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" );
+ float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" );
vsi_bool is_no_range_change = FALSE;
input_num = inputs[1] == NULL ? 1 : input_num;
param_count = inputs[1] == NULL ? _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM : param_count;
- if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
+ memcpy(shape, outputs[0]->attr.size, outputs[0]->attr.dim_num * sizeof(shape[0]));
+ shape[1] *= shape[2];
+ shape[2] = 1;
+ reshape_tensor = vsi_nn_reshape_tensor( graph,
+ outputs[0], shape, outputs[0]->attr.dim_num );
+
+ if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
@@ -727,7 +776,9 @@ static vsi_nn_kernel_node_t _setup
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 &&
outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC &&
(float)outputs[0]->attr.dtype.zero_point == r_mean && r_mean == g_mean && r_mean == b_mean &&
- vsi_nn_abs(outputs[0]->attr.dtype.scale - scale) < 1e-8 )
+ vsi_nn_abs(outputs[0]->attr.dtype.scale - r_scale) < 1e-8 &&
+ vsi_nn_abs(outputs[0]->attr.dtype.scale - g_scale) < 1e-8 &&
+ vsi_nn_abs(outputs[0]->attr.dtype.scale - b_scale) < 1e-8)
{
is_no_range_change = TRUE;
}
@@ -736,10 +787,11 @@ static vsi_nn_kernel_node_t _setup
if ( VSI_SUCCESS == status)
{
node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count);
+ CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final );
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
- uint32_t index = inputs[1] == NULL ? 4 : 6;
+ uint32_t index = inputs[1] == NULL ? 2 : 4;
uint32_t scalar_index = index;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
@@ -748,7 +800,7 @@ static vsi_nn_kernel_node_t _setup
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, param_count,
- inputs, input_num, outputs, output_num );
+ inputs, input_num, &reshape_tensor, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
@@ -757,7 +809,11 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
- node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_height );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, param_count );
index = scalar_index;
@@ -769,9 +825,14 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
}
}
+final:
vsi_nn_safe_free(node_params);
return node;
diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c
new file mode 100644
index 000000000..ae559dac1
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c
@@ -0,0 +1,1002 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+#define RGB888_SEP_SOURCE_0 "pre_process_rgb888_planar_sep_nhwc_0",
+#define RGB888_SEP_SOURCE_1 "pre_process_rgb888_planar_sep_nhwc_1",
+#define RGB888_SEP_SOURCE_2 "pre_process_rgb888_planar_sep_nhwc_2",
+#define RGB888_SOURCE_0 "pre_process_rgb888_planar_nhwc_0",
+#define RGB888_SOURCE_1 "pre_process_rgb888_planar_nhwc_1",
+#define RGB888_SOURCE_2 "pre_process_rgb888_planar_nhwc_2",
+
+#define STR(a) #a
+
+typedef enum
+{
+ COPY = 0,
+ SCALE,
+ FOUR_OVER_THREE,
+ HALF
+} _internal_scale_e;
+
+// Add kernel hashtable here
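+// key bits: input dtype << 16 | output dtype << 8 | sep flag << 4 | scale type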
+#define PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, SEP, SCALE_FLAG ) \
+ (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | ( SEP << 4 ) | (SCALE_FLAG))
+
+#define PACK_KERNEL_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, SCALE ), \
+ CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \
+ RGB888_SOURCE_0 }
+
+#define PACK_KERNEL_SEP_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, SCALE ), \
+ CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \
+ RGB888_SEP_SOURCE_0 }
+
+#define PACK_KERNEL_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, COPY ), \
+ CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \
+ RGB888_SOURCE_1 }
+
+#define PACK_KERNEL_SEP_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, COPY ), \
+ CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \
+ RGB888_SEP_SOURCE_1 }
+
+#define PACK_KERNEL_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, HALF ), \
+ CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \
+ RGB888_SOURCE_2 }
+
+#define PACK_KERNEL_SEP_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \
+ { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, HALF ), \
+ CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \
+ RGB888_SEP_SOURCE_2 }
+typedef struct
+{
+ uint32_t key;
+ char * function_name;
+ const char * source_name;
+} _kernel_map_type;
+
+static const _kernel_map_type _pre_process_rgb888_planar_nhwc_kernel_map[] =
+{
+ // Register kernel here
+ PACK_KERNEL_SCALE_MAP( U8, F16 ),
+ PACK_KERNEL_SCALE_MAP( U8, I16 ),
+ PACK_KERNEL_SCALE_MAP( U8, I8 ),
+ PACK_KERNEL_SCALE_MAP( U8, U8 ),
+
+ PACK_KERNEL_COPY_MAP( U8, F16 ),
+ PACK_KERNEL_COPY_MAP( U8, I16 ),
+ PACK_KERNEL_COPY_MAP( U8, I8 ),
+ PACK_KERNEL_COPY_MAP( U8, U8 ),
+
+ PACK_KERNEL_HALF_MAP( U8, U8 ),
+
+ PACK_KERNEL_SEP_SCALE_MAP( U8, F16 ),
+ PACK_KERNEL_SEP_SCALE_MAP( U8, I16 ),
+ PACK_KERNEL_SEP_SCALE_MAP( U8, I8 ),
+ PACK_KERNEL_SEP_SCALE_MAP( U8, U8 ),
+
+ PACK_KERNEL_SEP_COPY_MAP( U8, F16 ),
+ PACK_KERNEL_SEP_COPY_MAP( U8, I16 ),
+ PACK_KERNEL_SEP_COPY_MAP( U8, I8 ),
+ PACK_KERNEL_SEP_COPY_MAP( U8, U8 ),
+
+ PACK_KERNEL_SEP_HALF_MAP( U8, U8 ),
+};
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def )
+
+static vx_param_description_t _pre_process_rgb888_planar_sep_kernel_param_def[] =
+{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+};
+#define _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )
+
+/*
+ * Kernel initializer
+ */
+DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t shaderParam = {
+ 2, // workdim
+        {0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
+ {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
+ {0, 0, 0}, // localWorkSize: local group size in thread
+ {0, 0, 0}}; // globalWorkSize: image size in thread
+
+ float output_zp = 0;
+ float output_scale = 1;
+ int32_t reverse = 0;
+ uint32_t width = 0;
+ uint32_t height = 0;
+
+ vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
+ vsi_size_array_t * out_shape = NULL;
+
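+    /* The sep variant takes three separate input planes, so the output tensor is param[3]; otherwise it is param[1]. */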
+ if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
+ {
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
+ }
+ else
+ {
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ }
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &reverse);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+
+ out_shape = attr[0]->shape;
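+    /* dim0 of the reshaped output carries the interleaved RGB channels, so divide by 3 to get the pixel width */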
+ width = (uint32_t)(out_shape->data[0] / 3);
+ height = (uint32_t)(out_shape->data[1]);
+ output_scale /= attr[0]->scale;
+ output_zp = (float)attr[0]->zero_point;
+
+ shaderParam.global_scale[0] = 4;
+ shaderParam.global_scale[1] = 1;
+ shaderParam.global_scale[2] = 1;
+ shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
+ / shaderParam.global_scale[0], 4);
+ shaderParam.global_size[1] = height;
+ shaderParam.global_size[2] = 1;
+
+ status = vsi_nn_kernel_gpu_config( node, &shaderParam );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
+
+ {
+ gpu_dp_inst_t uniVecShift10 = {{
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00020000, 0x00060004, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00000400, 0x00000000, 0x00000400, 0x00000000,
+ 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniAddRShift = {{
+ 0x0f0f0f0f, // TCfg
+ 0x04040404, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x00000000, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002405, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniGetTempVal = {{
+ 0x09090909, // TCfg
+ 0x00000000, // ASelt
+ 0x00230001, 0x00670045, // ABin
+ 0x05050505, // BSelt
+ 0x00110000, 0x00330022, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniExtractBytes = {{
+ 0x0f0f0f0f, // TCfg
+ 0x04040404, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x00000000, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002414, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniConvertIntergetoF32_4x4 = {{
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000,
+ 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniExtractHalf8_2x8 = {{
+ 0x11111111, // TCfg
+ 0x11110000, // ASelt
+ 0x06040200, 0x06040200, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000100, // AccumType, ConstantType, and PostShift
+ 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
+ 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniExtractInteger_2x8 = {{
+ 0x33333333, // TCfg
+ 0x11110000, // ASelt
+ 0x03020100, 0x03020100, // ABin
+ 0x00000000, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00002400, // AccumType, ConstantType, and PostShift
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni16BitsDataInterleaveRGB_0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00100100, // ASelt
+ 0x01000400, 0x06020105, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni16BitsDataInterleaveRGB_1_2x8 = {{
+ 0x00001111, // TCfg
+ 0x00001001, // ASelt
+ 0x03070302, 0x00000000, // ABin
+ 0x00002222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni16BitsDataInterleaveBGR_0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x01001001, // ASelt
+ 0x01000400, 0x06020105, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni16BitsDataInterleaveBGR_1_2x8 = {{
+ 0x00001111, // TCfg
+ 0x00000010, // ASelt
+ 0x03070302, 0x00000000, // ABin
+ 0x00002222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uni8BitsDataInterleaveRGB_0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00000000, // ASelt
+ 0x01080400, 0x06020905, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BitsDataInterleaveRGB_1_2x8 = {{
+ 0x00001111, // TCfg
+ 0x00000000, // ASelt
+ 0x0b07030a, 0x00000000, // ABin
+ 0x00002222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BitsDataInterleaveBGR_0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00000000, // ASelt
+ 0x09000408, 0x060a0105, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BitsDataInterleaveBGR_1_2x8 = {{
+ 0x00001111, // TCfg
+ 0x00000000, // ASelt
+ 0x03070b02, 0x00000000, // ABin
+ 0x00002222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+
+ status = vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4);
+ status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
+ status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
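+        /* reverse != 0 means BGR output order, so load the BGR interleave patterns instead of RGB */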
+ if (reverse)
+ {
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8",
+ &uni16BitsDataInterleaveBGR_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8",
+ &uni16BitsDataInterleaveBGR_1_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+ &uni8BitsDataInterleaveBGR_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+ &uni8BitsDataInterleaveBGR_1_2x8);
+ }
+ else
+ {
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8",
+ &uni16BitsDataInterleaveRGB_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8",
+ &uni16BitsDataInterleaveRGB_1_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+ &uni8BitsDataInterleaveRGB_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+ &uni8BitsDataInterleaveRGB_1_2x8);
+ }
+
+        if (attr[0]->dtype == F16)
+        {
+            status |= vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
+        }
+        else
+        {
+            status |= vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8);
+        }
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+
+OnError:
+ if (attr[0])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[0] );
+ attr[0] = NULL;
+ }
+ return status;
+} /* _pre_process_rgb888_planar_initializer() */
+
+DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t shaderParam = {
+ 2, // workdim
+        {0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
+ {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
+ {0, 0, 0}, // localWorkSize: local group size in thread
+ {0, 0, 0}}; // globalWorkSize: image size in thread
+
+ float output_zp = 0;
+ float output_scale = 1;
+ uint32_t width = 0;
+ uint32_t height = 0;
+ int32_t reverse = 0;
+
+ vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
+ vsi_size_array_t * out_shape = NULL;
+
+ if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
+ {
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
+ }
+ else
+ {
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ }
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &reverse);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+
+ out_shape = attr[0]->shape;
+ width = (uint32_t)(out_shape->data[0] / 3);
+ height = (uint32_t)(out_shape->data[1]);
+
+ if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
+ {
+ if ( attr[0]->dfp.fl > 0 )
+ {
+ output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl);
+ }
+ else
+ {
+ output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
+ }
+ }
+ else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
+ {
+ output_zp = (float)attr[0]->asymm.zero_point;
+ output_scale /= attr[0]->asymm.scale;
+ }
+
+ if (attr[0]->dtype == F16 || attr[0]->dtype == I16)
+ {
+ shaderParam.global_scale[0] = 4;
+ }
+ else
+ {
+ shaderParam.global_scale[0] = 8;
+ }
+ shaderParam.global_scale[1] = 1;
+ shaderParam.global_scale[2] = 1;
+ shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
+ / shaderParam.global_scale[0], 4);
+ shaderParam.global_size[1] = height;
+ shaderParam.global_size[2] = 1;
+
+ status = vsi_nn_kernel_gpu_config( node, &shaderParam );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
+
+ {
+ gpu_dp_inst_t uniDataMeanStddevLo_2x8 = {{
+ 0x99999999, // TCfg
+ 0x44444444, // ASelt
+ 0x03020100, 0x07060504, // ABin
+ 0x99999999, // BSelt
+ 0x06060606, 0x06060606, // BBin
+ 0x00000100, // AccumType, ConstantType, and PostShift
+ 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000,
+ 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni16BitsDataInterleaveRGB_0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00100100, // ASelt
+ 0x01000400, 0x06020105, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni16BitsDataInterleaveRGB_1_2x8 = {{
+ 0x00001111, // TCfg
+ 0x00001001, // ASelt
+ 0x03070302, 0x00000000, // ABin
+ 0x00002222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni16BitsDataInterleaveBGR_0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x01001001, // ASelt
+ 0x01000400, 0x06020105, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni16BitsDataInterleaveBGR_1_2x8 = {{
+ 0x00001111, // TCfg
+ 0x00000010, // ASelt
+ 0x03070302, 0x00000000, // ABin
+ 0x00002222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BDataInterleaveRGB_0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00100100, // ASelt
+ 0x01000800, 0x0a020109, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BDataInterleaveRGB_1_2x8 = {{
+ 0x11111111, // TCfg
+ 0x01001001, // ASelt
+ 0x030b0302, 0x05040c04, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BDataInterleaveRGB_2_2x8 = {{
+ 0x11111111, // TCfg
+ 0x10010010, // ASelt
+ 0x0e06050d, 0x070f0706, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BDataInterleaveBGR_0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x01001001, // ASelt
+ 0x01000800, 0x0a020109, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BDataInterleaveBGR_1_2x8 = {{
+ 0x11111111, // TCfg
+ 0x10010010, // ASelt
+ 0x030b0302, 0x05040c04, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BDataInterleaveBGR_2_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00100100, // ASelt
+ 0x0e06050d, 0x070f0706, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+
+ status = vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevLo_2x8", &uniDataMeanStddevLo_2x8);
+ if (reverse)
+ {
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8",
+ &uni16BitsDataInterleaveBGR_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8",
+ &uni16BitsDataInterleaveBGR_1_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+ &uni8BDataInterleaveBGR_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+ &uni8BDataInterleaveBGR_1_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+ &uni8BDataInterleaveBGR_2_2x8);
+ }
+ else
+ {
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8",
+ &uni16BitsDataInterleaveRGB_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8",
+ &uni16BitsDataInterleaveRGB_1_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+ &uni8BDataInterleaveRGB_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+ &uni8BDataInterleaveRGB_1_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+ &uni8BDataInterleaveRGB_2_2x8);
+ }
+ status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
+ status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+
+OnError:
+ if (attr[0])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[0] );
+ attr[0] = NULL;
+ }
+ return status;
+} /* _pre_process_rgb888_planar_copy_initializer() */
+
+DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer)
+ (
+ vsi_nn_kernel_node_t node,
+ const vsi_nn_kernel_node_param_t * param,
+ size_t param_size
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ gpu_param_t shaderParam = {
+ 2, // workdim
+        {0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
+ {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
+ {0, 0, 0}, // localWorkSize: local group size in thread
+ {0, 0, 0}}; // globalWorkSize: image size in thread
+
+ vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
+ int32_t reverse = 0;
+
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+ CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
+ if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
+ {
+ attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
+ }
+ else
+ {
+ attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ }
+ CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
+
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &reverse);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+
+ {
+ shaderParam.global_scale[0] = 16;
+ shaderParam.global_scale[1] = 2;
+ shaderParam.global_size[0] = gpu_align_p2((attr[0]->shape->data[0] + shaderParam.global_scale[0] - 1)
+ / shaderParam.global_scale[0], 4);
+ shaderParam.global_size[1] = (attr[0]->shape->data[1] + shaderParam.global_scale[1] - 1)
+ / shaderParam.global_scale[1];
+ }
+
+ status = vsi_nn_kernel_gpu_config( node, &shaderParam );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
+
+ {
+ gpu_dp_inst_t uni8BDataInterleaveRGB_0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00100100, // ASelt
+ 0x01000800, 0x0a020109, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BDataInterleaveRGB_1_2x8 = {{
+ 0x11111111, // TCfg
+ 0x01001001, // ASelt
+ 0x030b0302, 0x05040c04, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BDataInterleaveRGB_2_2x8 = {{
+ 0x11111111, // TCfg
+ 0x10010010, // ASelt
+ 0x0e06050d, 0x070f0706, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BDataInterleaveBGR_0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x01001001, // ASelt
+ 0x01000800, 0x0a020109, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BDataInterleaveBGR_1_2x8 = {{
+ 0x11111111, // TCfg
+ 0x10010010, // ASelt
+ 0x030b0302, 0x05040c04, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uni8BDataInterleaveBGR_2_2x8 = {{
+ 0x11111111, // TCfg
+ 0x00100100, // ASelt
+ 0x0e06050d, 0x070f0706, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16 };
+
+ if (reverse)
+ {
+ status = vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+ &uni8BDataInterleaveBGR_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+ &uni8BDataInterleaveBGR_1_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+ &uni8BDataInterleaveBGR_2_2x8);
+ }
+ else
+ {
+ status = vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+ &uni8BDataInterleaveRGB_0_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+ &uni8BDataInterleaveRGB_1_2x8);
+ status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+ &uni8BDataInterleaveRGB_2_2x8);
+ }
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+
+OnError:
+ if (attr[0])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[0] );
+ attr[0] = NULL;
+ }
+ if (attr[1])
+ {
+        vsi_nn_kernel_tensor_attr_release( &attr[1] );
+        attr[1] = NULL;
+ }
+
+ return status;
+} /* _resize_rgb888_planar_initializer() */
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+ (
+ vsi_nn_tensor_t * const * const inputs,
+ vsi_nn_tensor_t * const * const outputs,
+ vsi_nn_kernel_t* kernel,
+ const vsi_nn_kernel_param_t * params,
+ vsi_bool is_no_range_change,
+ int32_t width,
+ int32_t height
+ )
+{
+ vsi_nn_kernel_dtype_e input0_dtype = U8;
+ vsi_nn_kernel_dtype_e output_dtype = U8;
+ _internal_scale_e scale_type = SCALE;
+ vsi_status status = VSI_FAILURE;
+ uint32_t key = 0;
+ size_t i = 0;
+ vsi_bool is_half_scale = FALSE;
+ vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
+ vsi_bool is_rgb888_sep = (vsi_bool)(inputs[1] != NULL);
+
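+    /* half-scale: the cropped source region is exactly twice the output size in both dimensions */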
+ is_half_scale = (width == (int32_t)outputs[0]->attr.size[0] * 2) &&
+ (height == (int32_t)outputs[0]->attr.size[1] * 2);
+ input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
+ output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
+
+ if (enable_copy)
+ {
+ scale_type = COPY;
+ }
+ else
+ {
+ if (is_no_range_change && is_half_scale)
+ {
+ scale_type = HALF;
+ }
+ else
+ {
+ scale_type = SCALE;
+ }
+ }
+
+ key = PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( input0_dtype, output_dtype, is_rgb888_sep, scale_type);
+
+ for ( i = 0; i < _cnt_of_array(_pre_process_rgb888_planar_nhwc_kernel_map); i ++ )
+ {
+ if ( _pre_process_rgb888_planar_nhwc_kernel_map[i].key == key )
+ {
+ break;
+ }
+ }
+ if ( i < _cnt_of_array(_pre_process_rgb888_planar_nhwc_kernel_map) )
+ {
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s",
+ _pre_process_rgb888_planar_nhwc_kernel_map[i].function_name );
+
+ if (is_rgb888_sep)
+ {
+ kernel->info.parameters = _pre_process_rgb888_planar_sep_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def );
+ }
+ else
+ {
+ kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def;
+ kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def );
+ }
+
+ if (enable_copy)
+ {
+ kernel->info.initialize = _pre_process_rgb888_planar_copy_initializer;
+ }
+ else if (scale_type == HALF)
+ {
+ kernel->info.initialize = _resize_rgb888_planar_initializer;
+ }
+ else
+ {
+ kernel->info.initialize = _pre_process_rgb888_planar_initializer;
+ }
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
+ _pre_process_rgb888_planar_nhwc_kernel_map[i].source_name );
+ vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ _pre_process_rgb888_planar_nhwc_kernel_map[i].source_name );
+ status = VSI_SUCCESS;
+ }
+
+ return status;
+} /* _query_kernel() */
+
+
+static vsi_nn_kernel_node_t _setup
+ (
+ vsi_nn_graph_t * graph,
+ vsi_nn_tensor_t ** inputs,
+ size_t input_num,
+ vsi_nn_tensor_t ** outputs,
+ size_t output_num,
+ const vsi_nn_kernel_param_t * params,
+ vsi_nn_kernel_t * kernel
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_node_param_t* node_params = NULL;
+ vsi_nn_kernel_node_t node = NULL;
+ vsi_nn_tensor_t* reshape_tensor = NULL;
+ vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+ size_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM;
+ int32_t width = vsi_nn_kernel_param_get_int32( params, "width" );
+ int32_t height = vsi_nn_kernel_param_get_int32( params, "height" );
+ float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
+ float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
+ float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
+ float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" );
+ float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" );
+ float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" );
+ int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
+ vsi_bool is_no_range_change = FALSE;
+
+ input_num = inputs[1] == NULL ? 1 : input_num;
+ param_count = inputs[1] == NULL ? _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM : param_count;
+
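+    /* Merge dims 0 and 1 of the output so the kernel sees one interleaved RGB row per output line */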
+ memcpy(shape, outputs[0]->attr.size, outputs[0]->attr.dim_num * sizeof(shape[0]));
+ shape[0] *= shape[1];
+ shape[1] = shape[2];
+ shape[2] = 1;
+ reshape_tensor = vsi_nn_reshape_tensor( graph,
+ outputs[0], shape, outputs[0]->attr.dim_num );
+
+ if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
+ outputs[0]->attr.dim_num ) )
+ {
+ return NULL;
+ }
+
+ if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] &&
+ outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 &&
+ outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC &&
+ (float)outputs[0]->attr.dtype.zero_point == r_mean && r_mean == g_mean && r_mean == b_mean &&
+ vsi_nn_abs(outputs[0]->attr.dtype.scale - r_scale) < 1e-8 &&
+ vsi_nn_abs(outputs[0]->attr.dtype.scale - g_scale) < 1e-8 &&
+ vsi_nn_abs(outputs[0]->attr.dtype.scale - b_scale) < 1e-8)
+ {
+ is_no_range_change = TRUE;
+ }
+
+ status = _query_kernel( inputs, outputs, kernel, params, is_no_range_change, width, height );
+ if ( VSI_SUCCESS == status)
+ {
+ node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count);
+ CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final );
+ node = vsi_nn_kernel_create_node( graph, kernel );
+ if ( node )
+ {
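+            /* tensors are packed first: 1 input + 1 output, or 3 separate planes + 1 output; scalars follow */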
+ uint32_t index = inputs[1] == NULL ? 2 : 4;
+ uint32_t scalar_index = index;
+ int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
+ int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
+ int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
+ int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );
+
+ /* Set inputs and outputs */
+ vsi_nn_kernel_node_pack_io( node_params, param_count,
+ inputs, input_num, &reshape_tensor, output_num );
+
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale );
+ /* Pass parameters to node. */
+ status = vsi_nn_kernel_node_pass_param( node, node_params, param_count );
+ index = scalar_index;
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ vsi_nn_kernel_scalar_release( &node_params[index++] );
+ }
+ }
+
+final:
+ vsi_nn_safe_free(node_params);
+
+ return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_EVIS( pre_process_rgb888_planar_nhwc, _setup )
+
diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c
index 5fda28142..984293bcb 100644
--- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c
@@ -106,6 +106,8 @@ static vx_param_description_t vxPreProcessRgbKernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _EVIS_PRE_PROCESS_RGB_PARAM_NUM _cnt_of_array(vxPreProcessRgbKernel_param_def)
@@ -126,19 +128,24 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
float outputZP = 0;
float outputScale = 1;
- int32_t reorder = 0;
- int32_t trans = 0;
- int32_t xRatio = 0;
- int32_t yRatio = 0;
- int32_t order1 = 2;
- uint32_t width = 0;
- uint32_t height = 0;
- int32_t enable_copy= 0;
- uint32_t pack_key = 0;
+ int32_t reorder = 0;
+ int32_t trans = 0;
+ int32_t xRatio = 0;
+ int32_t yRatio = 0;
+ int32_t order1 = 2;
+ uint32_t width = 0;
+ uint32_t height = 0;
+ int32_t enable_copy = 0;
+ uint32_t pack_key = 0;
+ float rgb_mean[4] = {0};
+ float rgb_scale[4] = {0};
+ float param_data[4] = {0};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -148,6 +155,18 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &rgb_mean[0]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rgb_mean[1]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &rgb_mean[2]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &rgb_scale[0]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[12], &rgb_scale[1]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &rgb_scale[2]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
width = (uint32_t)(out_shape->data[0]);
@@ -417,6 +436,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
case _PACK_SELECT_KEY( 1, 0, 0): // copy
case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder
{
+ int32_t i = 0;
+ for (i = 0; i < 3; i++)
+ {
+ rgb_scale[i] *= outputScale;
+ param_data[i] = rgb_mean[i] * rgb_scale[i] - outputZP;
+ }
if (attr[0]->dtype == I8 || attr[0]->dtype == U8)
{
shaderParam.global_scale[0] = 16;
@@ -454,6 +479,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part3_4x4", &uniExtractBtoF32_part3_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1);
+ status |= vsi_nn_kernel_gpu_add_param(node, "rgb_scale", &rgb_scale);
+ status |= vsi_nn_kernel_gpu_add_param(node, "param_data", &param_data);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
@@ -486,6 +513,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes);
status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP);
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
break;
@@ -493,10 +522,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
break;
}
- status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale);
- status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
-
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
@@ -523,7 +548,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_convert_type_e convert_type = SCALE;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
@@ -580,6 +605,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
@@ -600,7 +628,9 @@ static vsi_nn_kernel_node_t _setup
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
- float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" );
+ float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" );
+ float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" );
+ float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
@@ -616,9 +646,11 @@ static vsi_nn_kernel_node_t _setup
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
- tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[2] );
@@ -631,6 +663,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &tmp_params[9] );
vsi_nn_kernel_scalar_release( &tmp_params[10] );
vsi_nn_kernel_scalar_release( &tmp_params[11] );
+ vsi_nn_kernel_scalar_release( &tmp_params[12] );
+ vsi_nn_kernel_scalar_release( &tmp_params[13] );
}
}
diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c
index 8e5f77949..eb9d16056 100644
--- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c
@@ -99,6 +99,8 @@ static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _EVIS_PRE_PROCESS_YUV420_PARAM_NUM _cnt_of_array(vxPreProcessYuv420Kernel_param_def)
@@ -128,6 +130,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -496,6 +500,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -833,7 +839,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_convert_type_e convert_type = SCALE;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
@@ -900,6 +906,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
@@ -920,7 +929,9 @@ static vsi_nn_kernel_node_t _setup
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
- float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" );
+ float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" );
+ float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" );
+ float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
@@ -935,9 +946,11 @@ static vsi_nn_kernel_node_t _setup
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
- tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[4] );
@@ -950,6 +963,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &tmp_params[11] );
vsi_nn_kernel_scalar_release( &tmp_params[12] );
vsi_nn_kernel_scalar_release( &tmp_params[13] );
+ vsi_nn_kernel_scalar_release( &tmp_params[14] );
+ vsi_nn_kernel_scalar_release( &tmp_params[15] );
}
}
if (reshape_tensors[0])
diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c
index ca397de23..61d421d27 100644
--- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c
@@ -99,6 +99,8 @@ static vx_param_description_t vxPreProcessyuv422Kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _EVIS_PRE_PROCESS_YUV422_PARAM_NUM _cnt_of_array(vxPreProcessyuv422Kernel_param_def)
@@ -126,13 +128,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer)
int32_t order1 = 2;
uint32_t width = 0;
uint32_t height = 0;
- float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f;
- float outputScaleVar = 0.0f;
+ float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f;
+ float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f;
+ float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -142,10 +147,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var);
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &r_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &g_scale);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &b_scale);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
output_scale = 1.0f / attr[0]->scale;
@@ -159,10 +168,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer)
order1 = 0;
}
- outputScaleVar = output_scale * var;
- bMeanScaleVarZp = output_zp - bMean * outputScaleVar;
- gMeanScaleVarZp = output_zp - gMean * outputScaleVar;
- rMeanScaleVarZp = output_zp - rMean * outputScaleVar;
+ outputScaleVar_b = output_scale * b_scale;
+ outputScaleVar_g = output_scale * g_scale;
+ outputScaleVar_r = output_scale * r_scale;
+ bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b;
+ gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g;
+ rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r;
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
@@ -245,7 +256,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYUVtoShortSub_2x8", &uniExtractYUVtoShortSub_2x8);
- status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
@@ -308,13 +321,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer)
uint32_t yrIntFloat_16 = 0;
int32_t xRatio = 0;
int32_t yRatio = 0;
- float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f;
- float outputScaleVar = 0.0f;
+ float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f;
+ float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f;
+ float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f;
float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f;
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -328,10 +344,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var);
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &r_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder);
CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &g_scale);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &b_scale);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
output_scale = 1.0f / attr[0]->scale;
@@ -350,10 +370,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer)
xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1);
yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1);
- outputScaleVar = output_scale * var;
- bMeanScaleVarZp = output_zp - bMean * outputScaleVar;
- gMeanScaleVarZp = output_zp - gMean * outputScaleVar;
- rMeanScaleVarZp = output_zp - rMean * outputScaleVar;
+ outputScaleVar_b = output_scale * b_scale;
+ outputScaleVar_g = output_scale * g_scale;
+ outputScaleVar_r = output_scale * r_scale;
+ bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b;
+ gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g;
+ rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r;
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
@@ -445,7 +467,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toR_4x4", &uniConvertYUV422toR_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16);
status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16);
- status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g);
+ status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r);
status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp);
status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp);
@@ -503,9 +527,11 @@ static vsi_status _query_kernel
vsi_nn_kernel_convert_type_e convert_type = SCALE;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
+ VSI_UNREFERENCED(scale_x);
+
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -570,6 +596,9 @@ static vsi_nn_kernel_node_t _setup
int32_t trans = 0;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
@@ -589,7 +618,9 @@ static vsi_nn_kernel_node_t _setup
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
- float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" );
+ float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" );
+ float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" );
+ float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
int32_t yuv422_type = vsi_nn_kernel_param_get_int32( params, "yuv422_type" );
@@ -604,10 +635,12 @@ static vsi_nn_kernel_node_t _setup
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
- tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &yuv422_type );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV422_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[2] );
@@ -621,6 +654,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &tmp_params[10] );
vsi_nn_kernel_scalar_release( &tmp_params[11] );
vsi_nn_kernel_scalar_release( &tmp_params[12] );
+ vsi_nn_kernel_scalar_release( &tmp_params[13] );
+ vsi_nn_kernel_scalar_release( &tmp_params[14] );
}
}
vsi_safe_release_tensor(reshape_tensors[0]);
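
The r_scale/g_scale/b_scale plumbing above folds each per-channel scale into the output tensor's affine quantization inside the initializers: outputScaleVar_c = output_scale * c_scale and cMeanScaleVarZp = output_zp - cMean * outputScaleVar_c, so the per-pixel work reduces to a single multiply-add per channel. A minimal standalone C sketch of that arithmetic follows; the example scale and mean values are assumptions for illustration, not values taken from the patch.

#include <stdio.h>

/* Fold one channel's preprocessing scale and mean into the output tensor's
 * affine quantization, mirroring the initializer math in the hunks above. */
typedef struct {
    float scale_var;      /* output_scale * channel_scale           */
    float mean_scale_zp;  /* output_zp - channel_mean * scale_var   */
} channel_coeff_t;

static channel_coeff_t fold_channel(float output_scale, float output_zp,
                                    float channel_scale, float channel_mean)
{
    channel_coeff_t c;
    c.scale_var     = output_scale * channel_scale;
    c.mean_scale_zp = output_zp - channel_mean * c.scale_var;
    return c;
}

int main(void)
{
    /* Assumed example values: a quantized output with scale 0.125 and zero
     * point 3; the kernels above use output_scale = 1.0f / attr->scale. */
    float output_scale = 1.0f / 0.125f;
    float output_zp    = 3.0f;

    channel_coeff_t r = fold_channel(output_scale, output_zp, 0.017f, 123.68f);
    channel_coeff_t g = fold_channel(output_scale, output_zp, 0.017f, 116.78f);
    channel_coeff_t b = fold_channel(output_scale, output_zp, 0.017f, 103.94f);

    printf("r: %f %f\n", r.scale_var, r.mean_scale_zp);
    printf("g: %f %f\n", g.scale_var, g.mean_scale_zp);
    printf("b: %f %f\n", b.scale_var, b.mean_scale_zp);
    return 0;
}

With the two folded constants per channel, the GPU program only needs out = in * outputScaleVar_c + cMeanScaleVarZp, which is why just these values are passed as kernel parameters instead of the raw means and scales.
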
diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c
index 7c7efc765..4c322a8fc 100644
--- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c
@@ -95,6 +95,8 @@ static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _EVIS_PRE_PROCESS_YUV444_PARAM_NUM _cnt_of_array(vxPreProcessYuv444Kernel_param_def)
@@ -123,6 +125,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -488,6 +492,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -845,7 +851,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_convert_type_e convert_type = SCALE;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
@@ -910,6 +916,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_t* reshape_tensors[1] = {NULL};
int32_t trans = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
@@ -930,7 +939,9 @@ static vsi_nn_kernel_node_t _setup
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" );
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" );
- float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" );
+ float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" );
+ float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" );
+ float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Pass parameters to node. */
@@ -944,9 +955,11 @@ static vsi_nn_kernel_node_t _setup
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
- tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale );
+ tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &tmp_params[4] );
@@ -959,6 +972,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &tmp_params[11] );
vsi_nn_kernel_scalar_release( &tmp_params[12] );
vsi_nn_kernel_scalar_release( &tmp_params[13] );
+ vsi_nn_kernel_scalar_release( &tmp_params[14] );
+ vsi_nn_kernel_scalar_release( &tmp_params[15] );
}
}
if(reshape_tensors[0])
diff --git a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c
index c007a088e..bed0b6c46 100644
--- a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c
@@ -142,6 +142,8 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
vx_context ctx = vxGetContext((vx_reference)node);
vx_hardware_caps_params_t hw_param;
+ VSI_UNREFERENCED(param_size);
+
memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t));
status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t));
CHECK_STATUS_FAIL_GOTO(status, final);
@@ -531,7 +533,7 @@ static vsi_status _query_kernel
vsi_nn_shader_type_e sh_type = image_2d ? (input_fl >= output_fl ? _2D_OPT : _2D) : _3D;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int i;
+ size_t i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -583,6 +585,9 @@ static vsi_nn_kernel_node_t _setup
vsi_bool ret;
int32_t is_per_channel_alpha = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha");
if (is_per_channel_alpha)
diff --git a/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c b/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c
index daa40605e..cac4e3b13 100644
--- a/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c
@@ -35,7 +35,6 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
@@ -151,6 +150,8 @@ DEF_KERNEL_INITIALIZER(_multinomial_initializer)
vsi_nn_kernel_tensor_attr_t * attr = NULL;
vsi_size_array_t * in_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
@@ -196,6 +197,8 @@ DEF_KERNEL_INITIALIZER(_cdf_initializer)
uint32_t class_size = 0;
uint32_t batch = 0;
+ VSI_UNREFERENCED(param_size);
+
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
@@ -292,6 +295,8 @@ DEF_KERNEL_INITIALIZER(_seed_initializer)
float rand_max = (float)(pow(2.0,32));
float re_rand_max = 1 / rand_max;
+ VSI_UNREFERENCED(param_size);
+
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
@@ -425,20 +430,24 @@ static vsi_nn_kernel_node_t _setup
uint32_t hashkey = 0;
int32_t i;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+
// Check if gpu can support the size
- if( !vsi_nn_kernel_gpu_check_shape(
+ if ( !vsi_nn_kernel_gpu_check_shape(
outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
{
return NULL;
}
- for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
+ for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
{
ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
// Assign unique_id
ikernels[i]->unique_id = kernel->unique_id;
}
- if( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 )
+ if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 )
{
class_max_stride = (int32_t)gpu_align_p2(inputs[0]->attr.size[0], 4);
}
@@ -453,17 +462,20 @@ static vsi_nn_kernel_node_t _setup
attr.is_const = FALSE;
attr.vtl = TRUE;
tensors[SEED_INDEX] = vsi_nn_CreateTensor( graph, &attr );
+ CHECK_PTR_FAIL_GOTO(tensors[SEED_INDEX], "Create tensor failed", final);
attr.size[0] = class_max_stride * inputs[0]->attr.size[1];
attr.size[1] = inputs[0]->attr.size[1];
attr.dim_num = 2;
tensors[CDF_INDEX] = vsi_nn_CreateTensor( graph, &attr );
+ CHECK_PTR_FAIL_GOTO(tensors[CDF_INDEX], "Create tensor failed", final);
memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) );
attr.size[1] = 1;
attr.dim_num = 2;
tensors[SEEDS_INDEX] = vsi_nn_reshape_tensor( graph,
inputs[1], attr.size, attr.dim_num );
+ CHECK_PTR_FAIL_GOTO(tensors[SEEDS_INDEX], "Create tensor failed", final);
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -474,17 +486,17 @@ static vsi_nn_kernel_node_t _setup
hashkey = MULTINOMIAL_HASH_KEY( F32, F32, out_dtype );
status = _query_kernel( ikernels[SEED_INDEX], hashkeys[SEED_INDEX], INTERNAL_KERNEL_SEED );
- if( VSI_SUCCESS != status )
+ if ( VSI_SUCCESS != status )
{
goto final;
}
status = _query_kernel( ikernels[CDF_INDEX], hashkeys[CDF_INDEX], INTERNAL_KERNEL_CDF );
- if( VSI_SUCCESS != status )
+ if ( VSI_SUCCESS != status )
{
goto final;
}
status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_MULTINOMIAL );
- if( VSI_SUCCESS != status )
+ if ( VSI_SUCCESS != status )
{
goto final;
}
@@ -518,13 +530,13 @@ static vsi_nn_kernel_node_t _setup
/* Pass parameters to node. */
final:
- for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
+ for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
{
- if( ikernels[i] )
+ if ( ikernels[i] )
{
vsi_nn_kernel_release( &ikernels[i] );
}
- if( tensors[i] )
+ if ( tensors[i] )
{
vsi_nn_ReleaseTensor( &tensors[i] );
}
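
The CHECK_PTR_FAIL_GOTO additions above follow the create-then-check idiom used throughout these kernels: each tensor creation is validated immediately, failures jump to the shared final: label, and the cleanup there releases whatever was actually created. A self-contained C sketch of the same pattern follows; the resource type and helper names are illustrative stand-ins, not the library's API.

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-ins for the library's create/release calls. */
static int *create_resource(int fail)  { return fail ? NULL : (int *)malloc(sizeof(int)); }
static void release_resource(int **p)  { if (p != NULL && *p != NULL) { free(*p); *p = NULL; } }

/* Stand-in for the library's CHECK_PTR_FAIL_GOTO: report and jump on NULL. */
#define CHECK_PTR_FAIL_GOTO(ptr, msg, label) \
    do { if ((ptr) == NULL) { fprintf(stderr, "%s\n", (msg)); goto label; } } while (0)

static int setup(void)
{
    int status = -1;
    int *a = NULL;
    int *b = NULL;

    a = create_resource(0);
    CHECK_PTR_FAIL_GOTO(a, "Create resource a failed", final);
    b = create_resource(0);
    CHECK_PTR_FAIL_GOTO(b, "Create resource b failed", final);

    status = 0; /* success path */

final:
    /* Shared cleanup: every pointer is either valid or still NULL. */
    release_resource(&b);
    release_resource(&a);
    return status;
}

int main(void) { return setup(); }

Releasing through pointers that may still be NULL keeps the single cleanup path valid for every early exit, which is what makes the added checks safe to drop in without restructuring the setup functions.
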
diff --git a/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c
index caf40b973..a133a121e 100644
--- a/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c
@@ -111,6 +111,8 @@ DEF_KERNEL_INITIALIZER(_reduceall_internal_initializer)
vsi_size_array_t * output_shape = NULL;
int32_t axisSize = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c
index df45307c9..11aa099ec 100644
--- a/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c
@@ -111,6 +111,8 @@ DEF_KERNEL_INITIALIZER(_reduceany_internal_initializer)
vsi_size_array_t * output_shape = NULL;
int32_t axisSize = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c
index e70b58a52..efb52f080 100644
--- a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c
@@ -159,6 +159,8 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer)
float outputScale = 1.0f;
float output_offset_asymmetric = 0.0f;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c
index b1149fd59..d9bd40d8a 100644
--- a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c
@@ -161,6 +161,8 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer)
float outputScale = 1.0f;
float output_offset_asymmetric = 0.0f;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c
index 6fd1b7d63..3c710f599 100644
--- a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c
@@ -167,6 +167,8 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer)
float outputScale = 1.0f;
float output_offset_asymmetric = 0.0f;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c
index d7cb58d43..131111732 100644
--- a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c
@@ -141,6 +141,8 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer)
int32_t srcFixPointPos = 0;
int32_t dstFixPointPos = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c
index 7fe19bc70..164ab495c 100644
--- a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c
@@ -147,6 +147,8 @@ DEF_KERNEL_INITIALIZER(_preprocess_initializer)
vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL};
int32_t width = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -212,6 +214,8 @@ DEF_KERNEL_INITIALIZER(_repeat_initializer)
int32_t is1d = 0;
int32_t axis = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -303,7 +307,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int32_t is1d = inputs[0]->attr.dim_num == 1 ? 1 : 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@@ -453,6 +457,9 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t new_rank[2] = {0, 0};
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
// Check if gpu can support the size
if ( !vsi_nn_kernel_gpu_check_shape(
outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
@@ -497,7 +504,7 @@ static vsi_nn_kernel_node_t _setup
attr.size[1] = 1;
attr.dim_num = 2;
tensor_preprocess = vsi_nn_CreateTensor( graph, &attr );
-
+ CHECK_PTR_FAIL_GOTO( tensor_preprocess, "Create tensor fail.", final );
// preprocess
tmp_node = vsi_nn_kernel_create_node( graph, kernel_preprocess );
if (tmp_node)
diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c
index f893feaf2..95c33b80b 100644
--- a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
-#include "libnnext/vx_lib_nnext.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
@@ -855,7 +854,6 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer)
else if (F16 == output_dtype)
{
status = vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8);
- status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertFp2FP32_left_4x4",
&uniConvertFp2FP32_left_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertFp2FP32_right_4x4",
@@ -1187,7 +1185,7 @@ static vsi_nn_tensor_t* _create_scale_tensor
uint32_t dims = output->attr.dim_num;
vsi_size_t batch = dims > 3 ? output->attr.size[3] : 1;
vsi_size_t width = output->attr.size[0];
- vsi_size_t sizes[4] = {width * 2, 1, 1, batch};
+ vsi_size_t sizes[4] = { 0, 0, 0, 0 };
vsi_size_t item_count = width * 2 * batch;
vsi_size_t input_width = input->attr.size[0];
vsi_size_t x = 0;
@@ -1195,6 +1193,10 @@ static vsi_nn_tensor_t* _create_scale_tensor
float width_scale = 1.0f;
uint16_t *scale_data_ptr = NULL;
+ sizes[0] = width * 2;
+ sizes[1] = 1;
+ sizes[2] = 1;
+ sizes[3] = batch;
if (align_corners && width > 1)
{
width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(width - 1);
@@ -1310,6 +1312,7 @@ static vsi_nn_kernel_node_t _setup
if (is_run_opt_kernel)
{
scale = _create_scale_tensor(graph, inputs[0], outputs[0], align_corners, half_pixel_centers);
+ CHECK_PTR_FAIL_GOTO( scale, "Create tensor fail.", final );
node_params[SCALAR_TENSOR_SCALE] = (vsi_nn_kernel_node_param_t)(scale->t);
node_params_num = _RESIZE_1D_BILINEAR_PARAM_NUM;
}
@@ -1325,16 +1328,18 @@ static vsi_nn_kernel_node_t _setup
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_TYPE] );
}
+ }
+ }
- if (is_run_opt_kernel)
- {
- if (scale)
- {
- vsi_nn_ReleaseTensor(&scale);
- }
- }
+final:
+ if (is_run_opt_kernel)
+ {
+ if (scale)
+ {
+ vsi_nn_ReleaseTensor(&scale);
}
}
+
return node;
} /* _setup() */
diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c
index be1cd0972..fddd1e381 100644
--- a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c
@@ -144,6 +144,8 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer)
float half_pixel_value = 0.0f;
float round_value = 0.0f;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c
index 1e79cbfe3..ebfe9ed38 100644
--- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c
@@ -868,6 +868,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -1167,6 +1169,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_align_corners_opt_initializer)
uint32_t out_height = 0;
vsi_bool is_8x_align_corners = FALSE;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -1490,7 +1494,7 @@ static vsi_nn_tensor_t* _create_scale_tensor
vsi_size_t width = output->attr.size[0];
vsi_size_t height = output->attr.size[1];
vsi_size_t batch = dims > 3 ? output->attr.size[3] : 1;
- vsi_size_t sizes[4] = {width * 4, height, 1, batch};
+ vsi_size_t sizes[4] = { 0, 0, 0, 0 };
vsi_size_t item_count = width * 4 * height * batch;
vsi_size_t input_width = input->attr.size[0];
vsi_size_t input_height = input->attr.size[1];
@@ -1501,6 +1505,10 @@ static vsi_nn_tensor_t* _create_scale_tensor
float height_scale = 1.0f;
uint16_t *scale_data_ptr = NULL;
+ sizes[0] = width * 4;
+ sizes[1] = height;
+ sizes[2] = 1;
+ sizes[3] = batch;
if (align_corners && width > 1)
{
width_scale = ((float)(input_width - 1) * 1.0f) / (float)(width - 1);
diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c
index b8e634e4e..596d528f7 100644
--- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c
@@ -137,6 +137,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_nhwc_initializer)
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -433,6 +435,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_nhwc_bound_initializer)
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
diff --git a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c
index 4d0189327..6bf9ba87c 100644
--- a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c
@@ -145,6 +145,8 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer)
float half_pixel_value = 0.0f;
float round_value = 0.0f;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c
index 9876ebc71..bba21eabb 100644
--- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c
@@ -188,6 +188,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer)
int32_t coord_dim = 0;
int32_t offsetX = 0, offsetY = 0, offsetZ = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -345,6 +347,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer)
int32_t coord_dim = 0;
int32_t offsetX = 0, offsetY = 0, offsetZ = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -457,7 +461,9 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input1_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
+
+ VSI_UNREFERENCED(coord_dim);
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -517,6 +523,9 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t width = 0, area = 0;
int32_t big_flg = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if (coord_dim > 3)
{
return NULL;
diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c
index e9d6d5dd0..43ea15c3f 100644
--- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c
@@ -45,64 +45,82 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_2 "scatter_nd_update_big"
#define KERNEL_SOURCE_3 "scatter_nd_update_atom"
#define KERNEL_SOURCE_4 "scatter_nd_update_special"
+#define KERNEL_SOURCE_5 "scatter_nd_update_qint"
+#define KERNEL_SOURCE_6 "scatter_nd_update_fp"
-#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _pre_op, _large_type) \
- ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_pre_op << 4) | (_large_type))
+#define HASH_SCATTER_ND_UPDATE_KEY(_in0_type, _in2_type, _out_type, _stage, _coord_type, _opt_flg) \
+ ((_in0_type << 24) | (_in2_type << 16) | (_out_type << 8) | (_stage << 4) | (_coord_type << 2) | (_opt_flg))
-#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(SRC0_TYPE, SRC2_TYPE, DST_TYPE) \
- CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE#SRC2_TYPE"to"#DST_TYPE)
+#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME(SRC0_TYPE, DST_TYPE) \
+ CVIVANTE_NAMESPACE("evis.scatter_nd_update_reset_"#SRC0_TYPE"to"#DST_TYPE)
-#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_BIG_NAME(SRC0_TYPE, SRC2_TYPE, DST_TYPE) \
- CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE#SRC2_TYPE"to"#DST_TYPE"_big")
+#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(SRC2_TYPE) \
+ CVIVANTE_NAMESPACE("evis.scatter_nd_update_update_"#SRC2_TYPE)
-#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PRE_NAME(SRC0_TYPE) \
- CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE"_pre")
+#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_4X_NAME(SRC2_TYPE) \
+ CVIVANTE_NAMESPACE("evis.scatter_nd_update_update_"#SRC2_TYPE"_4X")
- #define HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME() \
- CVIVANTE_NAMESPACE("evis.scatter_nd_update_reset")
+#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(SRC2_TYPE, DST_TYPE) \
+ CVIVANTE_NAMESPACE("evis.scatter_nd_update_ref_"#SRC2_TYPE"to"#DST_TYPE)
-#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(SRC0_TYPE, DST_TYPE) \
+#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_4X_NAME(SRC2_TYPE, DST_TYPE) \
+ CVIVANTE_NAMESPACE("evis.scatter_nd_update_ref_"#SRC2_TYPE"to"#DST_TYPE"_4X")
+
+#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(DST_TYPE) \
+ CVIVANTE_NAMESPACE("evis.scatter_nd_update_copy_"#DST_TYPE)
+
+#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_REF_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.scatter_nd_update_ref2out_"#SRC0_TYPE"to"#DST_TYPE)
-#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(SRC2_TYPE, DST_TYPE) \
+#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_UPDATE_NAME(SRC2_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.scatter_nd_update_update2ref_"#SRC2_TYPE"to"#DST_TYPE"_16x")
-#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(DST_TYPE) \
+#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_COPY_NAME(DST_TYPE) \
CVIVANTE_NAMESPACE("evis.scatter_nd_update_cpy2out_"#DST_TYPE"to"#DST_TYPE)
-#define TENSOR_SCATTER_ND_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \
- { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0, 0), \
- HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \
+#define TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \
+ { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 4, 1, 0), \
+ HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_REF_NAME(IN0_TYPE, OUT_TYPE), \
+ SOURCE },
+
+#define TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \
+ { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 5, 1, 0), \
+ HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_UPDATE_NAME(IN2_TYPE, OUT_TYPE), \
+ SOURCE },
+
+#define TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \
+ { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 6, 1, 0), \
+ HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_COPY_NAME(IN0_TYPE), \
SOURCE },
-#define TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \
- { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0, 1), \
- HASH_SCATTER_ND_UPDATE_SH_KERNEL_BIG_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \
+#define TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
+ { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, OUT_TYPE, 0, 0, 0), \
+ HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
-#define TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(IN0_TYPE, SOURCE) \
- { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, I32, I32, 1, 1), \
- HASH_SCATTER_ND_UPDATE_SH_KERNEL_PRE_NAME(IN0_TYPE), \
+#define TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(IN2_TYPE, SOURCE) \
+ { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, 0, 0), \
+ HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(IN2_TYPE), \
SOURCE },
- #define TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(SOURCE) \
- { HASH_SCATTER_ND_UPDATE_KEY(I32, I32, I32, 2, 1), \
- HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME(), \
+#define TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(IN2_TYPE, SOURCE) \
+ { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, 0, 1), \
+ HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_4X_NAME(IN2_TYPE), \
SOURCE },
-#define TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \
- { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 3, 1), \
- HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(IN0_TYPE, OUT_TYPE), \
+#define TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(IN2_TYPE, OUT_TYPE, SOURCE) \
+ { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, OUT_TYPE, 2, 0, 0), \
+ HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(IN2_TYPE, OUT_TYPE), \
SOURCE },
-#define TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \
- { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 4, 1), \
- HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(IN2_TYPE, OUT_TYPE), \
+#define TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(IN2_TYPE, OUT_TYPE, SOURCE) \
+ { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, OUT_TYPE, 2, 0, 1), \
+ HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_4X_NAME(IN2_TYPE, OUT_TYPE), \
SOURCE },
-#define TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \
- { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 5, 1), \
- HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(IN0_TYPE), \
+#define TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(OUT_TYPE, SOURCE) \
+ { HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 3, 0, 0), \
+ HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(OUT_TYPE), \
SOURCE },
typedef struct
@@ -112,93 +130,118 @@ typedef struct
const char * source_name;
} _kernel_map_type;
-static const _kernel_map_type scatter_nd_update_map[] =
+static const _kernel_map_type scatter_nd_update_reset_map[] =
{
- TENSOR_SCATTER_ND_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_1)
- TENSOR_SCATTER_ND_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_1)
- TENSOR_SCATTER_ND_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_1)
- TENSOR_SCATTER_ND_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_1)
- TENSOR_SCATTER_ND_UPDATE_KERNELS(BF16, I32, BF16, BF16, KERNEL_SOURCE_1)
- TENSOR_SCATTER_ND_UPDATE_KERNELS(U8, I32, U8, F16, KERNEL_SOURCE_1)
- TENSOR_SCATTER_ND_UPDATE_KERNELS(I8, I32, I8, F16, KERNEL_SOURCE_1)
- TENSOR_SCATTER_ND_UPDATE_KERNELS(I16, I32, I16, F16, KERNEL_SOURCE_1)
- TENSOR_SCATTER_ND_UPDATE_KERNELS(F16, I32, F16, U8, KERNEL_SOURCE_1)
- TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_2)
- TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(F16, I32, F16, U8, KERNEL_SOURCE_2)
+ TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(U8, U8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I8, I8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I16, I16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(F16, F16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(BF16, BF16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(U8, F16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I8, F16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I16, F16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(F16, U8, KERNEL_SOURCE_5)
};
-static const _kernel_map_type scatter_nd_update_reset_map[] =
+static const _kernel_map_type scatter_nd_update_update_map[] =
{
- TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(KERNEL_SOURCE_3)
+ TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(U8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(I8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(I16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(F16, KERNEL_SOURCE_6)
+ TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(BF16, KERNEL_SOURCE_6)
+ TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(U8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(I8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(I16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(F16, KERNEL_SOURCE_6)
+ TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(BF16, KERNEL_SOURCE_6)
};
-static const _kernel_map_type scatter_nd_update_pre_map[] =
+static const _kernel_map_type scatter_nd_update_ref_map[] =
{
- TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(U8, KERNEL_SOURCE_3)
- TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(I8, KERNEL_SOURCE_3)
- TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(I16, KERNEL_SOURCE_3)
+ TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, U8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, I8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, I16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, F16, KERNEL_SOURCE_6)
+ TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(F32, F16, KERNEL_SOURCE_6)
+ TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(F32, BF16, KERNEL_SOURCE_6)
+ TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, U8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, I8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, I16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, F16, KERNEL_SOURCE_6)
+ TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(F32, F16, KERNEL_SOURCE_6)
+ TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(F32, BF16, KERNEL_SOURCE_6)
};
-static const _kernel_map_type scatter_nd_update_post_map[] =
+static const _kernel_map_type scatter_nd_update_copy_map[] =
{
- TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(U8, I32, U8, F16, KERNEL_SOURCE_3)
- TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I8, I32, I8, F16, KERNEL_SOURCE_3)
- TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I16, I32, I16, F16, KERNEL_SOURCE_3)
- TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_3)
- TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_3)
- TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_3)
+ TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(U8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(I8, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(I16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(F16, KERNEL_SOURCE_5)
+ TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(BF16, KERNEL_SOURCE_5)
};
-static const _kernel_map_type scatter_nd_update_ref_map[] =
+static const _kernel_map_type scatter_nd_update_special_ref_map[] =
{
- TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
- TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
+ TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
+ TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
};
-static const _kernel_map_type scatter_nd_update_update_map[] =
+static const _kernel_map_type scatter_nd_update_special_update_map[] =
{
- TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
- TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
+ TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
+ TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
};
-static const _kernel_map_type scatter_nd_update_copy_map[] =
+static const _kernel_map_type scatter_nd_update_special_copy_map[] =
{
- TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
- TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
+ TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4)
+ TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4)
};
/*
* Kernel params
*/
-static vx_param_description_t _scatter_nd_update_kernel_param_def[] =
+static vx_param_description_t _scatter_nd_update_reset_kernel_param_def[] =
{
- {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
- {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
- {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
-static vx_param_description_t _scatter_nd_update_reset_kernel_param_def[] =
+static vx_param_description_t _scatter_nd_update_update_kernel_param_def[] =
{
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
- {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
-static vx_param_description_t _scatter_nd_update_pre_kernel_param_def[] =
+static vx_param_description_t _scatter_nd_update_ref_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
- //{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+ {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
@@ -206,22 +249,17 @@ static vx_param_description_t _scatter_nd_update_pre_kernel_param_def[] =
    // Add kernel parameters here
};
-static vx_param_description_t _scatter_nd_update_post_kernel_param_def[] =
+static vx_param_description_t _scatter_nd_update_copy_kernel_param_def[] =
{
- {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
- {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
- {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
- {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
- {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
-static vx_param_description_t _scatter_nd_update_ref_kernel_param_def[] =
+static vx_param_description_t _scatter_nd_update_special_ref_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@@ -229,7 +267,7 @@ static vx_param_description_t _scatter_nd_update_ref_kernel_param_def[] =
    // Add kernel parameters here
};
-static vx_param_description_t _scatter_nd_update_update_kernel_param_def[] =
+static vx_param_description_t _scatter_nd_update_special_update_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@@ -243,7 +281,7 @@ static vx_param_description_t _scatter_nd_update_update_kernel_param_def[] =
    // Add kernel parameters here
};
-static vx_param_description_t _scatter_nd_update_copy_kernel_param_def[] =
+static vx_param_description_t _scatter_nd_update_special_copy_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@@ -251,13 +289,14 @@ static vx_param_description_t _scatter_nd_update_copy_kernel_param_def[] =
    // Add kernel parameters here
};
-#define _SCATTER_ND_UPDATE_PARAM_NUM _cnt_of_array( _scatter_nd_update_kernel_param_def )
-#define _SCATTER_ND_UPDATE_PRE_PARAM_NUM _cnt_of_array( _scatter_nd_update_pre_kernel_param_def )
-#define _SCATTER_ND_UPDATE_POST_PARAM_NUM _cnt_of_array( _scatter_nd_update_post_kernel_param_def )
#define _SCATTER_ND_UPDATE_RESET_PARAM_NUM _cnt_of_array( _scatter_nd_update_reset_kernel_param_def )
-#define _SCATTER_ND_UPDATE_REF_PARAM_NUM _cnt_of_array( _scatter_nd_update_ref_kernel_param_def )
-#define _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM _cnt_of_array( _scatter_nd_update_update_kernel_param_def )
-#define _SCATTER_ND_UPDATE_COPY_PARAM_NUM _cnt_of_array( _scatter_nd_update_copy_kernel_param_def )
+#define _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM _cnt_of_array(_scatter_nd_update_update_kernel_param_def)
+#define _SCATTER_ND_UPDATE_REF_PARAM_NUM _cnt_of_array(_scatter_nd_update_ref_kernel_param_def)
+#define _SCATTER_ND_UPDATE_COPY_PARAM_NUM _cnt_of_array(_scatter_nd_update_copy_kernel_param_def)
+
+#define _SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM _cnt_of_array(_scatter_nd_update_special_ref_kernel_param_def)
+#define _SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM _cnt_of_array(_scatter_nd_update_special_update_kernel_param_def)
+#define _SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM _cnt_of_array(_scatter_nd_update_special_copy_kernel_param_def)
static vsi_status get_scatter_nd_update_tensor_reshape_size
(
@@ -265,24 +304,17 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
uint32_t block_size,
uint32_t coordDim,
- vsi_size_t* width,
- vsi_size_t* area,
- vsi_size_t* vol,
+ vsi_size_t strides[VSI_NN_MAX_DIM_NUM],
int32_t* newDim,
int32_t* isBig
)
{
- vsi_status status = VSI_FAILURE;
+ vsi_status status = VSI_SUCCESS;
uint32_t dims_num = inputs[0]->attr.dim_num;
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
- if (coordDim != 0 && (width == NULL || area == NULL))
- {
- return status;
- }
-
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
@@ -305,36 +337,30 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size
isBig[0] |= 1;
}
- if (coordDim == 1) // index shape
- {
- *width = 0;
- *area = 0;
- }
- else if (coordDim == 2)
+ if (coordDim == 1 && strides) // index shape
{
- *width = input_size[dims_num - 2];
- *area = 0;
- }
- else if (coordDim == 3)
- {
- *width = input_size[dims_num - 3];
- *area = input_size[dims_num - 3] * input_size[dims_num - 2];
- }
- else if (coordDim == 4)
- {
- *width = input_size[dims_num - 4];
- *area = input_size[dims_num - 4] * input_size[dims_num - 3];
- *vol = input_size[dims_num - 4] * input_size[dims_num - 3] * input_size[dims_num - 2];
+ for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
+ {
+ strides[i] = 0;
+ }
}
- else if (coordDim == 5)
+ else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides)
{
- *width = input_size[dims_num - 5];
- *area = input_size[dims_num - 5] * input_size[dims_num - 4];
- *vol = input_size[dims_num - 5] * input_size[dims_num - 4] * input_size[dims_num - 3];
+ for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
+ {
+ strides[i] = 0;
+ }
+
+ strides[0] = input_size[dims_num - coordDim];
+ for (i = 1; i < coordDim - 1; i++)
+ {
+ strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i];
+ }
}
+
#undef VSI_NN_MAX_IMAGE_WIDTH
- return VSI_SUCCESS;
+ return status;
} /* get_scatter_nd_update_tensor_reshape_size */
static vsi_status check_scatter_nd_update_index_repeat
@@ -458,7 +484,8 @@ static vsi_status check_scatter_nd_update_index_repeat
/*
* Kernel initializer
*/
-DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer)
+
+DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
@@ -474,157 +501,68 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer)
{0, 0, 0}
};
- vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL };
- int32_t block_size = 1;
- int32_t height = 1;
- int32_t index_num = 1;
- int32_t width = 0, area = 0, vol = 0;
- int32_t coord_dim = 0;
- int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0;
- int32_t src0ZP = 0;
- float src0Scale = 1;
- int32_t src2ZP = 0;
- float src2Scale = 1;
- int32_t dstZP = 0;
- float dstScale = 1;
+ vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
+ int32_t block_size = 1;
+ int32_t width = 0;
+ int32_t height = 0;
+
+ int32_t input0_zp = 0;
+ float input0_scale = 1.0f;
+ int32_t output_zp = 0;
+ float output_scale = 1.0f;
+
+ uint32_t pack_key = 0;
+
+ VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
- attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
- CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
- attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
- CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError );
-
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &width);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &area);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &vol);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &coord_dim);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
-
- block_size = (int32_t)(attr[3]->shape->data[0]);
- height = (int32_t)(attr[3]->shape->data[1]);
- index_num = (int32_t)(attr[1]->shape->data[1]);
- if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
- {
- src0ZP = attr[0]->asymm.zero_point;
- src0Scale = attr[0]->asymm.scale;
- }
- else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
- {
- if (attr[0]->dfp.fl > 0)
- {
- src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
- }
- else
- {
- src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
- }
- }
-
- if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
- {
- src2ZP = attr[2]->asymm.zero_point;
- src2Scale = attr[2]->asymm.scale;
- }
- else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
- {
- if (attr[2]->dfp.fl > 0)
- {
- src2Scale = (1.0f / ((float) ((int64_t)1 << attr[2]->dfp.fl)));
- }
- else
- {
- src2Scale = ((float) ((int64_t)1 << -attr[2]->dfp.fl));
- }
- }
-
- if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
+ block_size = (int32_t)(attr[0]->shape->data[0]);
+ height = (int32_t)(attr[0]->shape->data[1]);
+ width = (int32_t)(block_size * height);
+ if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16)
{
- dstZP = attr[3]->asymm.zero_point;
- dstScale = attr[3]->asymm.scale;
+ width = (width + 7) / 8;
}
- else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP )
+ else if (attr[0]->dtype == U8 || attr[0]->dtype == I8)
{
- if (attr[3]->dfp.fl > 0)
- {
- dstScale = (float)((int64_t)1 << attr[3]->dfp.fl);
- }
- else
- {
- dstScale = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl));
- }
- dstScale = 1.0f/dstScale;
+ width = (width + 15) / 16;
}
- if (coord_dim == 5)
- {
- offset_idx = 1;
- }
- if (coord_dim == 4 || coord_dim == 5)
- {
- offsetX = vol;
- offsetY = area;
- offsetZ = width;
- offsetW = 1;
- }
- else if (coord_dim == 3)
- {
- offsetX = area;
- offsetY = width;
- offsetZ = 1;
- offsetW = 0;
- }
- else if (coord_dim == 2)
- {
- offsetX = width;
- offsetY = 1;
- offsetZ = 0;
- offsetW = 0;
- }
- else if (coord_dim == 1)
- {
- offsetX = 1;
- offsetY = 0;
- offsetZ = 0;
- offsetW = 0;
- }
+ input0_zp = attr[0]->asymm.zero_point;
+ input0_scale = attr[0]->asymm.scale;
+ output_zp = attr[1]->asymm.zero_point;
+ output_scale = 1.0f / attr[1]->asymm.scale;
- gpu_param.global_scale[0] = 8;
+ gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
- gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1)
- / gpu_param.global_scale[0], 4);
- gpu_param.global_size[1] = height;
+ gpu_param.global_size[0] = width;
+ gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
+#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \
+ (IN0_TYPE | ( OUT_TYPE << 16))
+
+ pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype );
+
+ switch( pack_key )
{
- uint16_t M0 = 0;
- uint16_t M1 = 0;
- int32_t postShift0 = 0;
- int32_t postShift1 = 0;
- uint32_t multAndoutZP0[2] = {0};
- uint32_t multAndoutZP1[2] = {0};
- gpu_dp_inst_t uniAccumulateSum_2x8 = {{
- 0x55555555, // TCfg
- 0x44444444, // ASelt
- 0x33221100, 0x77665544, // ABin
- 0xaaaaaaaa, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00010001, 0x00010001, 0x00010001, 0x00010001,
- 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{
+ case _PACK_SELECT_KEY( I8, I8 ):
+ case _PACK_SELECT_KEY( U8, U8 ):
+ {
+ uint16_t M0 = 0;
+ int32_t postShift0 = 0;
+ uint32_t multAndoutZP0[2] = {0};
+
+ gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
@@ -633,80 +571,40 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer)
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniU8MulAndPostShift_1_Lo_2x8 = {{
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
- 0x13121110, 0x17161514, // ABin
+ 0x1b1a1918, 0x1f1e1d1c, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
- 0x11111111, // TCfg
- 0x01010101, // ASelt
- 0x01050004, 0x03070206, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16};
- gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
- 0x11111111, // TCfg
- 0x01010101, // ASelt
- 0x05050404, 0x07070606, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16};
- gpu_dp_inst_t uniExtractOddData_2x8 = {{
- 0x11111111, // TCfg
- 0x11110000, // ASelt
- 0x07050301, 0x07050301, // ABin
- 0x22222222, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00000001, 0x00000001, 0x00000001, 0x00000001,
- 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
- }, GPU_DP_TYPE_16};
+ }, GPU_DP_TYPE_16 };
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0);
- gpu_quantize_multiplier_16bit( (double)src2Scale / dstScale, &M1, &postShift1);
- multAndoutZP0[0] = (uint32_t)(M0);
- multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0);
- multAndoutZP1[0] = (uint32_t)(M1);
- multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src2ZP * M1);
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 );
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_1_Lo_2x8, postShift1 );
+ gpu_quantize_multiplier_16bit( (double)input0_scale * output_scale, &M0, &postShift0);
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift_1_Lo_2x8", &uniU8MulAndPostShift_1_Lo_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
- status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniExtractOddData_2x8", &uniExtractOddData_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW );
- status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx );
- CHECK_STATUS_FAIL_GOTO(status, OnError);
+ multAndoutZP0[0] = (uint32_t)(M0);
+ multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input0_zp * M0);
+
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 );
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift0 );
+
+ status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
+ status |= vsi_nn_kernel_gpu_add_param( node,
+ "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+ break;
+ default:
+ break;
}
+#undef _PACK_SELECT_KEY
+
OnError:
if (attr[0])
{
@@ -718,20 +616,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer)
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
- if (attr[2])
- {
- vsi_nn_kernel_tensor_attr_release( &attr[2] );
- attr[2] = NULL;
- }
- if (attr[3])
- {
- vsi_nn_kernel_tensor_attr_release( &attr[3] );
- attr[3] = NULL;
- }
return status;
-} /* _scatter_nd_update_initializer() */
+} /* _scatter_nd_update_special_ref_initializer() */
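The special_ref path above folds both zero points into the single bias that travels with the 16-bit multiplier: gpu_quantize_multiplier_16bit() turns input0_scale * output_scale into a multiplier M0 plus a post-shift, and multAndoutZP0[1] becomes (output_zp << postShift0) - input0_zp * M0, so the DP instruction only needs one multiply, one add and one right shift per lane. Below is a minimal CPU sketch of that arithmetic; scale2multiplier() is a stand-in for gpu_quantize_multiplier_16bit() (its real shift-selection policy is not shown in this patch) and the quantization parameters are made-up example values.

/* Hedged sketch, not the library implementation. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

static void scale2multiplier(double scale, uint16_t *m, int32_t *post_shift)
{
    int32_t shift = 0;
    /* Grow the shift until the multiplier fills most of the 16-bit range. */
    while (scale * (double)(1 << (shift + 1)) < 32768.0 && shift < 15)
    {
        shift++;
    }
    *m = (uint16_t)lround(scale * (double)(1 << shift));
    *post_shift = shift;
}

int main(void)
{
    double  in_scale = 0.5, out_scale = 0.25;   /* example quant params */
    int32_t in_zp = 128, out_zp = 128;
    uint16_t m0 = 0;
    int32_t post_shift = 0;

    scale2multiplier(in_scale / out_scale, &m0, &post_shift);

    /* multAndoutZP0[1] in the initializer: both zero points folded into one bias. */
    int32_t bias = (out_zp << post_shift) - in_zp * (int32_t)m0;

    int32_t in  = 140;
    int32_t out = (in * (int32_t)m0 + bias) >> post_shift;  /* mul, add, post-shift */
    printf("requantized %d -> %d (float reference %.1f)\n",
           in, out, (double)(in - in_zp) * in_scale / out_scale + out_zp);
    return 0;
}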
-DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer)
+DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
@@ -747,19 +635,20 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer)
{0, 0, 0}
};
- vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL };
+ vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
int32_t block_size = 1;
- int32_t height = 1;
+ int32_t update_width = 1;
int32_t index_num = 1;
int32_t width = 0, area = 0, vol = 0;
int32_t coord_dim = 0;
int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0;
- int32_t src0ZP = 0;
- float src0Scale = 1;
- int32_t src2ZP = 0;
- float src2Scale = 1;
- int32_t dstZP = 0;
- float dstScale = 1;
+ int32_t input1_zp = 0;
+ float input1_scale = 1.0f;
+ int32_t output_zp = 0;
+ float output_scale = 1.0f;
+ uint32_t pack_key = 0;
+
+ VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
@@ -767,73 +656,24 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer)
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
- attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
- CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &width);
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &width);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &area);
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &area);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &vol);
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &vol);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &coord_dim);
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- block_size = (int32_t)(attr[3]->shape->data[0]);
- height = (int32_t)(attr[3]->shape->data[1]);
- index_num = (int32_t)(attr[1]->shape->data[1]);
-
- if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
- {
- src0ZP = attr[0]->asymm.zero_point;
- src0Scale = attr[0]->asymm.scale;
- }
- else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
- {
- if (attr[0]->dfp.fl > 0)
- {
- src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
- }
- else
- {
- src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
- }
- }
-
- if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
- {
- src2ZP = attr[2]->asymm.zero_point;
- src2Scale = attr[2]->asymm.scale;
- }
- else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
- {
- if (attr[2]->dfp.fl > 0)
- {
- src2Scale = (1.0f / ((float) ((int64_t)1 << attr[2]->dfp.fl)));
- }
- else
- {
- src2Scale = ((float) ((int64_t)1 << -attr[2]->dfp.fl));
- }
- }
+ block_size = (int32_t)(attr[2]->shape->data[0]);
+ update_width = (int32_t)(attr[1]->shape->data[0]);
+ index_num = (int32_t)(attr[0]->shape->data[1]);
- if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
- {
- dstZP = attr[3]->asymm.zero_point;
- dstScale = attr[3]->asymm.scale;
- }
- else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP )
- {
- if (attr[3]->dfp.fl > 0)
- {
- dstScale = (float)((int64_t)1 << attr[3]->dfp.fl);
- }
- else
- {
- dstScale = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl));
- }
- dstScale = 1.0f / dstScale;
- }
+ input1_zp = attr[1]->asymm.zero_point;
+ input1_scale = attr[1]->asymm.scale;
+ output_zp = attr[2]->asymm.zero_point;
+ output_scale = 1.0f / attr[2]->asymm.scale;
if (coord_dim == 5)
{
@@ -865,35 +705,60 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer)
offsetZ = 0;
}
+ if (attr[1]->dtype == F16 || attr[1]->dtype == I16 || attr[1]->dtype == U16)
+ {
+ update_width = (update_width + 7) / 8;
+ }
+ else if (attr[1]->dtype == U8 || attr[1]->dtype == I8)
+ {
+ update_width = (update_width + 15) / 16;
+ }
+
+ if (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == U16)
+ {
+ block_size = (block_size + 7) / 8;
+ }
+ else if (attr[2]->dtype == U8 || attr[2]->dtype == I8)
+ {
+ block_size = (block_size + 15) / 16;
+ }
+
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = block_size;
- gpu_param.global_size[1] = height;
+ gpu_param.global_size[1] = index_num;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
- uint16_t M0 = 0;
- uint16_t M1 = 0;
- int32_t postShift0 = 0;
- int32_t postShift1 = 0;
- uint32_t multAndoutZP0[2] = {0};
- uint32_t multAndoutZP1[2] = {0};
- gpu_dp_inst_t uniAccumulateSum_2x8 = {{
- 0x55555555, // TCfg
- 0x44444444, // ASelt
- 0x33221100, 0x77665544, // ABin
- 0xaaaaaaaa, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000600, // AccumType, ConstantType, and PostShift
- 0x00010001, 0x00010001, 0x00010001, 0x00010001,
- 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{
+ status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width );
+ status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size );
+ status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX );
+ status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY );
+ status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ );
+ status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW );
+ status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
+ }
+#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \
+ (IN0_TYPE | ( OUT_TYPE << 16))
+
+ pack_key = _PACK_SELECT_KEY( attr[1]->dtype, attr[2]->dtype );
+
+ switch( pack_key )
+ {
+ case _PACK_SELECT_KEY( I8, I8 ):
+ case _PACK_SELECT_KEY( U8, U8 ):
+ {
+ uint16_t M1 = 0;
+ int32_t postShift1 = 0;
+ uint32_t multAndoutZP1[2] = {0};
+
+ gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
@@ -902,48 +767,38 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer)
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniU8MulAndPostShift_1_Lo_2x8 = {{
+ }, GPU_DP_TYPE_16 };
+ gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
- 0x13121110, 0x17161514, // ABin
+ 0x1b1a1918, 0x1f1e1d1c, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
+ }, GPU_DP_TYPE_16 };
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0);
- gpu_quantize_multiplier_16bit( (double)src2Scale / dstScale, &M1, &postShift1);
- multAndoutZP0[0] = (uint32_t)(M0);
- multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0);
- multAndoutZP1[0] = (uint32_t)(M1);
- multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src2ZP * M1);
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 );
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_1_Lo_2x8, postShift1 );
+ gpu_quantize_multiplier_16bit( (double)input1_scale * output_scale, &M1, &postShift1);
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 );
- if (attr[3]->quant != VSI_NN_KERNEL_QUANT_NONE)
- {
+ multAndoutZP1[0] = (uint32_t)(M1);
+ multAndoutZP1[1] = (uint32_t)((output_zp << postShift1) - input1_zp * M1);
+
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 );
+ gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 );
+
+ status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 );
+ "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift_1_Lo_2x8", &uniU8MulAndPostShift_1_Lo_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
- status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
+ "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
}
- status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num );
- status |= vsi_nn_kernel_gpu_add_param( node, "update_width", &block_size );
- status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW );
- status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx );
- CHECK_STATUS_FAIL_GOTO(status, OnError);
+ break;
+ default:
+ break;
}
+#undef _PACK_SELECT_KEY
OnError:
if (attr[0])
@@ -961,15 +816,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer)
vsi_nn_kernel_tensor_attr_release( &attr[2] );
attr[2] = NULL;
}
- if (attr[3])
- {
- vsi_nn_kernel_tensor_attr_release( &attr[3] );
- attr[3] = NULL;
- }
return status;
-} /* _scatter_nd_update_big_initializer() */
+} /* _scatter_nd_update_special_update_initializer() */
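In the special_update stage the offsetX/offsetY/offsetZ/offsetW values chosen from coord_dim above act as per-component strides that flatten one row of the indices tensor into an element offset of the ref/output tensor, with width, area and vol being the row, plane and volume sizes passed in as scalars. A hedged sketch of that flattening, using illustrative sizes only:

#include <stdint.h>
#include <stdio.h>

static int32_t flat_offset(const int32_t *idx, int32_t coord_dim,
                           int32_t width, int32_t area, int32_t vol)
{
    switch (coord_dim)
    {
        case 1:  return idx[0];
        case 2:  return idx[0] * width + idx[1];
        case 3:  return idx[0] * area  + idx[1] * width + idx[2];
        case 4:  return idx[0] * vol   + idx[1] * area  + idx[2] * width + idx[3];
        default: return -1; /* coord_dim == 5 reuses these strides and marks the
                               extra leading component via offset_idx above      */
    }
}

int main(void)
{
    int32_t idx[3] = { 1, 2, 3 };                    /* one row of the indices tensor */
    printf("%d\n", flat_offset(idx, 3, 4, 20, 0));   /* 1*20 + 2*4 + 3 = 31           */
    return 0;
}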
-DEF_KERNEL_INITIALIZER(_scatter_nd_update_pre_initializer)
+DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_copy_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
@@ -985,140 +835,50 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_pre_initializer)
{0, 0, 0}
};
- vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
+ vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t block_size = 1;
- int32_t update_width = 1;
- int32_t index_num = 1;
- int32_t width = 0, area = 0, vol = 0;
- int32_t coord_dim = 0;
- int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0;
- int32_t src0ZP = 0;
- float src0Scale = 1;
+ int32_t width = 0;
+ int32_t height = 0;
+
+ VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
- attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
- CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
- attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
- CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
-
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &width);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &area);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &vol);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
- block_size = (int32_t)(attr[2]->shape->data[0]);
- update_width = (int32_t)(attr[1]->shape->data[0]);
- index_num = (int32_t)(attr[0]->shape->data[1]);
-
- if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
- {
- src0ZP = attr[1]->asymm.zero_point;
- src0Scale = attr[1]->asymm.scale;
- }
- else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
- {
- if (attr[1]->dfp.fl > 0)
- {
- src0Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl)));
- }
- else
- {
- src0Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl));
- }
- }
+ block_size = (int32_t)(attr[0]->shape->data[0]);
+ height = (int32_t)(attr[0]->shape->data[1]);
+ width = (int32_t)(block_size * height);
- if (coord_dim == 5)
- {
- offset_idx = 1;
- }
- if (coord_dim == 4 || coord_dim == 5)
- {
- offsetX = vol;
- offsetY = area;
- offsetZ = width;
- offsetW = 1;
- }
- else if (coord_dim == 3)
- {
- offsetX = area;
- offsetY = width;
- offsetZ = 1;
- }
- else if (coord_dim == 2)
+ if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16)
{
- offsetX = width;
- offsetY = 1;
- offsetZ = 0;
+ width = (width + 7) / 8;
}
- else if (coord_dim == 1)
+ else if (attr[0]->dtype == U8 || attr[0]->dtype == I8)
{
- offsetX = 1;
- offsetY = 0;
- offsetZ = 0;
+ width = (width + 15) / 16;
}
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
- gpu_param.global_size[0] = block_size;
- gpu_param.global_size[1] = index_num;
+ gpu_param.global_size[0] = width;
+ gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
- {
- gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{
- 0x05050505, // TCfg
- 0x04040404, // ASelt
- 0x00010000, 0x00030002, // ABin
- 0x0a0a0a0a, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00000400, // AccumType, ConstantType, and PostShift
- 0xffff0001, 0x00000000, 0xffff0001, 0x00000000,
- 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
-
- status = vsi_nn_kernel_gpu_add_param( node,
- "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 );
- status |= vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width );
- status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW );
- status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx );
- status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &src0ZP );
- status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &src0Scale );
- CHECK_STATUS_FAIL_GOTO(status, OnError);
- }
-
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
- if (attr[1])
- {
- vsi_nn_kernel_tensor_attr_release( &attr[1] );
- attr[1] = NULL;
- }
- if (attr[2])
- {
- vsi_nn_kernel_tensor_attr_release( &attr[2] );
- attr[2] = NULL;
- }
return status;
-} /* _scatter_nd_update_pre_initializer() */
+} /* _scatter_nd_update_special_copy_initializer() */
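The work-size math in the special copy/ref stages rounds the element count with (w + 7) / 8 for 16-bit types and (w + 15) / 16 for 8-bit types, which suggests each work-item processes one 16-byte vector; that per-vector reading is an inference here, not something the patch states. A small sketch of the same rounding:

#include <stdint.h>
#include <stdio.h>

static int32_t global_work_items(int32_t elements, int32_t dtype_bytes)
{
    int32_t lanes = 16 / dtype_bytes;           /* 8 lanes for 16-bit, 16 for 8-bit */
    return (elements + lanes - 1) / lanes;      /* round up to whole vectors        */
}

int main(void)
{
    printf("%d %d\n",
           global_work_items(100, 2),   /* F16/I16/U16: (100 + 7) / 8   = 13 */
           global_work_items(100, 1));  /* U8/I8:       (100 + 15) / 16 = 7  */
    return 0;
}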
-DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer)
+DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
@@ -1127,132 +887,56 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
- 3,
+ 1,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
- vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
- int32_t block_size = 1;
- int32_t height = 1;
- int32_t width = 0, area = 0, vol = 0;
- int32_t coord_dim = 0;
- int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0;
- int32_t src0ZP = 0;
- float src0Scale = 1;
- float src2Scale = 1;
- int32_t dstZP = 0;
- float dstScale = 1;
+ vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
+ int32_t width = 0;
+ int32_t element_size = 1;
+ int32_t input_zp0 = 0;
+ float input_scale0 = 1;
+ int32_t output_zp = 0;
+ float output_scale = 1;
+ int32_t i = 0;
- attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); // ref
+ VSI_UNREFERENCED(param_size);
+
+ attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
- attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); // update
+ attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
- attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[5] ); // output
- CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
-
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &width);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &area);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &vol);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &coord_dim);
- CHECK_STATUS_FAIL_GOTO(status, OnError );
-
- block_size = (int32_t)(attr[2]->shape->data[0]);
- height = (int32_t)(attr[2]->shape->data[1]);
- if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
+ for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
{
- src0ZP = attr[0]->asymm.zero_point;
- src0Scale = attr[0]->asymm.scale;
- }
- else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
- {
- if (attr[0]->dfp.fl > 0)
- {
- src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
- }
- else
- {
- src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
- }
+ element_size *= (int32_t)attr[0]->shape->data[i];
}
+ width = element_size / 8;
+
+ input_zp0 = attr[0]->asymm.zero_point;
+ input_scale0 = attr[0]->asymm.scale;
+ output_zp = attr[1]->asymm.zero_point;
+ output_scale = attr[1]->asymm.scale;
- if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
+ if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
- src2Scale = attr[1]->asymm.scale;
+ input_scale0 = 1.0f;
}
- else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
+ if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
- if (attr[1]->dfp.fl > 0)
- {
- src2Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl)));
- }
- else
- {
- src2Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl));
- }
- }
-
- if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
- {
- dstZP = attr[2]->asymm.zero_point;
- dstScale = attr[2]->asymm.scale;
- }
- else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
- {
- if (attr[2]->dfp.fl > 0)
- {
- dstScale = (float)((int64_t)1 << attr[2]->dfp.fl);
- }
- else
- {
- dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
- }
- dstScale = 1.0f / dstScale;
- dstZP = 0;
- }
-
- if (coord_dim == 5)
- {
- offset_idx = 1;
- }
- if (coord_dim == 4 || coord_dim == 5)
- {
- offsetX = vol;
- offsetY = area;
- offsetZ = width;
- offsetW = 1;
- }
- else if (coord_dim == 3)
- {
- offsetX = area;
- offsetY = width;
- offsetZ = 1;
- }
- else if (coord_dim == 2)
- {
- offsetX = width;
- offsetY = 1;
- offsetZ = 0;
- }
- else if (coord_dim == 1)
- {
- offsetX = 1;
- offsetY = 0;
- offsetZ = 0;
+ output_scale = 1.0f;
}
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
- gpu_param.global_size[0] = block_size;
- gpu_param.global_size[1] = height;
+ gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
+ / gpu_param.global_scale[0], 4);
+ gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
@@ -1272,38 +956,15 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
- 0x33333333, // TCfg
- 0x11110000, // ASelt
- 0x03020100, 0x03020100, // ABin
- 0x00000000, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00002400, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
- float output_zp = (float)dstZP;
- float scaleInOut = src2Scale / dstScale;
- gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0);
+ gpu_quantize_multiplier_16bit( (double)input_scale0 / output_scale, &M0, &postShift0);
multAndoutZP0[0] = (uint32_t)(M0);
- multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0);
+ multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input_zp0 * M0);
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 );
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
- status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW );
- status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx );
- status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &src2Scale );
- status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp );
- status |= vsi_nn_kernel_gpu_add_param( node, "scaleInOut", &scaleInOut );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError);
}
@@ -1318,15 +979,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer)
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
- if (attr[2])
- {
- vsi_nn_kernel_tensor_attr_release( &attr[2] );
- attr[2] = NULL;
- }
return status;
-} /* _scatter_nd_update_post_initializer() */
+} /* _scatter_nd_update_reset_initializer() */
-DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer)
+DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
@@ -1335,168 +991,137 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
- 3,
+ 2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
- vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
+ vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
int32_t block_size = 1;
+ int32_t update_width = 1;
+ int32_t index_num = 1;
int32_t width = 0;
- int32_t height = 0;
- int32_t count_width = 0;
-
- attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
- CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
-
- block_size = (int32_t)(attr[0]->shape->data[0]);
- height = (int32_t)(attr[0]->shape->data[1]);
- width = (int32_t)(block_size * height);
- count_width = (int32_t)((height + 3) / 4);
-
- gpu_param.global_scale[0] = 1;
- gpu_param.global_scale[1] = 1;
- gpu_param.global_scale[2] = 1;
-
- gpu_param.global_size[0] = (width + 3) / 4;
- gpu_param.global_size[1] = 1;
- gpu_param.global_size[2] = 1;
-
- status = vsi_nn_kernel_gpu_config( node, &gpu_param );
- CHECK_STATUS_FAIL_GOTO(status, OnError);
-
- status = vsi_nn_kernel_gpu_add_param( node, "count_width", &count_width );
- CHECK_STATUS_FAIL_GOTO(status, OnError);
-
-OnError:
- if (attr[0])
- {
- vsi_nn_kernel_tensor_attr_release( &attr[0] );
- attr[0] = NULL;
- }
- return status;
-} /* _scatter_nd_update_reset_initializer() */
-
-DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer)
- (
- vsi_nn_kernel_node_t node,
- const vsi_nn_kernel_node_param_t * param,
- size_t param_size
- )
-{
- vsi_status status = VSI_FAILURE;
- gpu_param_t gpu_param = {
- 3,
- {0, 0, 0},
- {0, 0, 0},
- {0, 0, 0},
- {0, 0, 0}
- };
-
- vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
- int32_t block_size = 1;
- int32_t width = 0;
- int32_t height = 0;
-
- int32_t input0_zp = 0;
- float input0_scale = 1.0f;
- int32_t output_zp = 0;
- float output_scale = 1.0f;
+ int32_t coord_dim = 0;
+ int32_t strides[VSI_NN_MAX_DIM_NUM] = {0};
+ int32_t coord_strides[8] = {0};
+ int32_t *coord_strides1 = coord_strides + 4;
+ int32_t input2_zp = 0;
+ int32_t i = 0;
- uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
+ attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
+ CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
- block_size = (int32_t)(attr[0]->shape->data[0]);
- height = (int32_t)(attr[0]->shape->data[1]);
- width = (int32_t)(block_size * height);
- if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16)
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &strides[0]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &strides[1]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &strides[2]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &strides[3]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &strides[4]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &strides[5]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &strides[6]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &coord_dim);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+
+ block_size = (int32_t)(attr[2]->shape->data[0]);
+ update_width = (int32_t)(attr[1]->shape->data[0]);
+ index_num = (int32_t)(attr[0]->shape->data[1]);
+ width = block_size;
+ if (block_size % 4 == 0)
{
- width = (width + 7) / 8;
+ update_width = update_width / 4;
+ width = block_size / 4;
}
- else if (attr[0]->dtype == U8 || attr[0]->dtype == I8)
+
+ input2_zp = attr[1]->asymm.zero_point;
+
+ coord_strides[coord_dim - 1] = 1;
+ for (i = 0; i < coord_dim - 1; i++)
{
- width = (width + 15) / 16;
+ coord_strides[i] = strides[coord_dim - 2 - i];
}
- input0_zp = attr[0]->asymm.zero_point;
- input0_scale = attr[0]->asymm.scale;
- output_zp = attr[1]->asymm.zero_point;
- output_scale = 1.0f / attr[1]->asymm.scale;
-
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = width;
- gpu_param.global_size[1] = 1;
+ gpu_param.global_size[1] = index_num;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, OnError);
-#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \
- (IN0_TYPE | ( OUT_TYPE << 16))
-
- pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype );
-
- switch( pack_key )
{
- case _PACK_SELECT_KEY( I8, I8 ):
- case _PACK_SELECT_KEY( U8, U8 ):
- {
- uint16_t M0 = 0;
- int32_t postShift0 = 0;
- uint32_t multAndoutZP0[2] = {0};
-
- gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{
- 0xdddddddd, // TCfg
- 0x44444444, // ASelt
- 0x13121110, 0x17161514, // ABin
- 0x11111111, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00002600, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{
- 0xdddddddd, // TCfg
- 0x44444444, // ASelt
- 0x1b1a1918, 0x1f1e1d1c, // ABin
- 0x11111111, // BSelt
+ gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{
+ 0x05050505, // TCfg
+ 0x04040404, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
- 0x00002600, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
+ 0x00000400, // AccumType, ConstantType, and PostShift
+ 0xffff0001, 0x00000000, 0xffff0001, 0x00000000,
+ 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
- gpu_quantize_multiplier_16bit( (double)input0_scale * output_scale, &M0, &postShift0);
+ gpu_dp_inst_t uniConvertFp16ToFp32_4x4 = {{
+ 0x01010101, // TCfg
+ 0x00000000, // ASelt
+ 0x00010000, 0x00030002, // ABin
+ 0x02020202, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000100, // AccumType, ConstantType, and PostShift
+ 0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
+ 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
+ }, GPU_DP_TYPE_16 };
- multAndoutZP0[0] = (uint32_t)(M0);
- multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input0_zp * M0);
+ gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
+ 0x11111111, // TCfg
+ 0x01010101, // ASelt
+ 0x01050004, 0x03070206, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16};
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 );
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift0 );
+ status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width );
+ status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size );
+ status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride", &coord_strides );
+ status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride1", &coord_strides1 );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
- status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
+ if (attr[1]->dtype == U8 || attr[1]->dtype == I8 || attr[1]->dtype == I16)
+ {
+ status = vsi_nn_kernel_gpu_add_param( node,
+ "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &input2_zp );
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ }
+ else if (attr[1]->dtype == F16 || attr[1]->dtype == BF16)
+ {
+ status = vsi_nn_kernel_gpu_add_param( node,
+ "uniConvertFp16ToFp32_4x4", &uniConvertFp16ToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
+ "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
- break;
- default:
- break;
}
-#undef _PACK_SELECT_KEY
-
OnError:
if (attr[0])
{
@@ -1508,10 +1133,15 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer)
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
+ if (attr[2])
+ {
+ vsi_nn_kernel_tensor_attr_release( &attr[2] );
+ attr[2] = NULL;
+ }
return status;
-} /* _scatter_nd_update_ref_initializer() */
+} /* _scatter_nd_update_update_initializer() */
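The coord_stride table built in the update initializer reverses the strides[] scalars and appends a 1, so that each index component gets the stride of the dimension it addresses, outermost component first. The sketch below reproduces that loop and uses the table to compute one flat offset; the concrete stride values are illustrative, under the assumption that strides[] carries the cumulative element strides of the output, innermost dimension first.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t strides[7] = { 4, 20, 0, 0, 0, 0, 0 };  /* e.g. output rows of 4, planes of 20 */
    int32_t coord_dim  = 3;
    int32_t coord_strides[8] = { 0 };
    int32_t i = 0;

    coord_strides[coord_dim - 1] = 1;               /* innermost index component */
    for (i = 0; i < coord_dim - 1; i++)
    {
        coord_strides[i] = strides[coord_dim - 2 - i];
    }

    /* Offset of index tuple (1, 2, 3): dot product with the stride table. */
    int32_t idx[3] = { 1, 2, 3 };
    int32_t offset = 0;
    for (i = 0; i < coord_dim; i++)
    {
        offset += idx[i] * coord_strides[i];
    }
    printf("coord_strides = {%d, %d, %d}, offset = %d\n",
           coord_strides[0], coord_strides[1], coord_strides[2], offset);
    return 0;
}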
-DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer)
+DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
@@ -1531,164 +1161,127 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer)
int32_t block_size = 1;
int32_t update_width = 1;
int32_t index_num = 1;
- int32_t width = 0, area = 0, vol = 0;
+ int32_t width = 0;
int32_t coord_dim = 0;
- int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0;
- int32_t input1_zp = 0;
- float input1_scale = 1.0f;
- int32_t output_zp = 0;
+ int32_t strides[VSI_NN_MAX_DIM_NUM] = {0};
+ int32_t coord_strides[8] = {0};
+ int32_t *coord_strides1 = coord_strides + 4;
+ float output_zp = 0;
+ float input_scale = 1.0f;
float output_scale = 1.0f;
- uint32_t pack_key = 0;
+ float inout_scale = 1.0f;
+ int32_t i = 0;
+
+ VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
- attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
+ attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &width);
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &strides[0]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &area);
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &strides[1]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &vol);
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &strides[2]);
CHECK_STATUS_FAIL_GOTO(status, OnError );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim);
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &strides[3]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &strides[4]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &strides[5]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &strides[6]);
+ CHECK_STATUS_FAIL_GOTO(status, OnError );
+ status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &coord_dim);
CHECK_STATUS_FAIL_GOTO(status, OnError );
block_size = (int32_t)(attr[2]->shape->data[0]);
update_width = (int32_t)(attr[1]->shape->data[0]);
index_num = (int32_t)(attr[0]->shape->data[1]);
- input1_zp = attr[1]->asymm.zero_point;
- input1_scale = attr[1]->asymm.scale;
- output_zp = attr[2]->asymm.zero_point;
- output_scale = 1.0f / attr[2]->asymm.scale;
-
- if (coord_dim == 5)
- {
- offset_idx = 1;
- }
- if (coord_dim == 4 || coord_dim == 5)
- {
- offsetX = vol;
- offsetY = area;
- offsetZ = width;
- offsetW = 1;
- }
- else if (coord_dim == 3)
- {
- offsetX = area;
- offsetY = width;
- offsetZ = 1;
- }
- else if (coord_dim == 2)
+ input_scale = attr[1]->asymm.scale;
+ output_scale = attr[2]->asymm.scale;
+ output_zp = (float)attr[2]->asymm.zero_point;
+ if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
- offsetX = width;
- offsetY = 1;
- offsetZ = 0;
+ input_scale = 1.0f;
}
- else if (coord_dim == 1)
+ if (attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE)
{
- offsetX = 1;
- offsetY = 0;
- offsetZ = 0;
+ output_scale = 1.0f;
}
+ inout_scale = input_scale / output_scale;
- if (attr[1]->dtype == F16 || attr[1]->dtype == I16 || attr[1]->dtype == U16)
+ coord_strides[coord_dim - 1] = 1;
+ for (i = 0; i < coord_dim - 1; i++)
{
- update_width = (update_width + 7) / 8;
- }
- else if (attr[1]->dtype == U8 || attr[1]->dtype == I8)
- {
- update_width = (update_width + 15) / 16;
+ coord_strides[i] = strides[coord_dim - 2 - i];
}
- if (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == U16)
- {
- block_size = (block_size + 7) / 8;
- }
- else if (attr[2]->dtype == U8 || attr[2]->dtype == I8)
+ width = block_size;
+ if (block_size % 4 == 0)
{
- block_size = (block_size + 15) / 16;
+ width = block_size / 4;
}
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
- gpu_param.global_size[0] = block_size;
+ gpu_param.global_size[0] = width;
gpu_param.global_size[1] = index_num;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
- CHECK_STATUS_FAIL_GOTO(status, OnError);
-
- {
- status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width );
- status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ );
- status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW );
- status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx );
- CHECK_STATUS_FAIL_GOTO(status, OnError);
- }
-#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \
- (IN0_TYPE | ( OUT_TYPE << 16))
-
- pack_key = _PACK_SELECT_KEY( attr[1]->dtype, attr[2]->dtype );
-
- switch( pack_key )
- {
- case _PACK_SELECT_KEY( I8, I8 ):
- case _PACK_SELECT_KEY( U8, U8 ):
- {
- uint16_t M1 = 0;
- int32_t postShift1 = 0;
- uint32_t multAndoutZP1[2] = {0};
-
- gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{
- 0xdddddddd, // TCfg
- 0x44444444, // ASelt
- 0x13121110, 0x17161514, // ABin
- 0x11111111, // BSelt
- 0x00000000, 0x00000000, // BBin
- 0x00002600, // AccumType, ConstantType, and PostShift
- 0x00000000, 0x00000000, 0x00000000, 0x00000000,
- 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
- }, GPU_DP_TYPE_16 };
- gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{
- 0xdddddddd, // TCfg
- 0x44444444, // ASelt
- 0x1b1a1918, 0x1f1e1d1c, // ABin
- 0x11111111, // BSelt
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
+
+ {
+ gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
+ 0x33333333, // TCfg
+ 0x11110000, // ASelt
+ 0x03020100, 0x03020100, // ABin
+ 0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
- 0x00002600, // AccumType, ConstantType, and PostShift
+ 0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
- gpu_quantize_multiplier_16bit( (double)input1_scale * output_scale, &M1, &postShift1);
-
- multAndoutZP1[0] = (uint32_t)(M1);
- multAndoutZP1[1] = (uint32_t)((output_zp << postShift1) - input1_zp * M1);
+ gpu_dp_inst_t uniExtractOddData_2x8 = {{
+ 0x11111111, // TCfg
+ 0x11110000, // ASelt
+ 0x07050301, 0x07050301, // ABin
+ 0x22222222, // BSelt
+ 0x00000000, 0x00000000, // BBin
+ 0x00000600, // AccumType, ConstantType, and PostShift
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001,
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+ }, GPU_DP_TYPE_16};
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 );
- gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 );
+ status = vsi_nn_kernel_gpu_add_param( node, "output_stride", &width );
+ status |= vsi_nn_kernel_gpu_add_param( node, "ref_stride", &update_width );
+ status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride", &coord_strides );
+ status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride1", &coord_strides1 );
+ status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp );
+ status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inout_scale );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
- status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
- status |= vsi_nn_kernel_gpu_add_param( node,
- "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
- CHECK_STATUS_FAIL_GOTO(status, OnError );
+ if (attr[1]->dtype == U8 || attr[1]->dtype == I8 || attr[1]->dtype == I16)
+ {
+ status = vsi_nn_kernel_gpu_add_param( node,
+ "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
+ }
+ else if (attr[1]->dtype == BF16)
+ {
+ status = vsi_nn_kernel_gpu_add_param( node,
+ "uniExtractOddData_2x8", &uniExtractOddData_2x8 );
+ CHECK_STATUS_FAIL_GOTO(status, OnError);
}
- break;
- default:
- break;
}
-#undef _PACK_SELECT_KEY
OnError:
if (attr[0])
@@ -1707,7 +1300,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer)
attr[2] = NULL;
}
return status;
-} /* _scatter_nd_update_update_initializer() */
+} /* _scatter_nd_update_ref_initializer() */
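Unlike the 16-bit multiplier plus post-shift route, the ref stage hands the shader plain float uniforms: inout_scale = input_scale / output_scale and output_zp. How the CL/EVIS source consumes them is not visible in this patch; the sketch below only shows the requantization such uniforms would typically drive, as an assumption.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t requant_float(float value, float inout_scale, float output_zp)
{
    float v = value * inout_scale + output_zp;  /* rescale into the output space */
    if (v < 0.0f)   v = 0.0f;                   /* clamp to the U8 range          */
    if (v > 255.0f) v = 255.0f;
    return (uint8_t)lroundf(v);
}

int main(void)
{
    /* input scale 0.5, output scale 0.25, output zero point 128 */
    printf("%d\n", requant_float(12.0f, 0.5f / 0.25f, 128.0f));  /* -> 152 */
    return 0;
}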
DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer)
(
@@ -1718,7 +1311,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
- 3,
+ 1,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
@@ -1726,31 +1319,27 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer)
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
- int32_t block_size = 1;
- int32_t width = 0;
- int32_t height = 0;
+ int32_t width = 0;
+ int32_t element_size = 1;
+ int32_t i = 0;
+
+ VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
- block_size = (int32_t)(attr[0]->shape->data[0]);
- height = (int32_t)(attr[0]->shape->data[1]);
- width = (int32_t)(block_size * height);
-
- if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16)
- {
- width = (width + 7) / 8;
- }
- else if (attr[0]->dtype == U8 || attr[0]->dtype == I8)
+ for (i = 0; i < (int32_t)attr[0]->shape->size; i++)
{
- width = (width + 15) / 16;
+ element_size *= (int32_t)attr[0]->shape->data[i];
}
+ width = element_size / 8;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
- gpu_param.global_size[0] = width;
+ gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
+ / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = 1;
gpu_param.global_size[2] = 1;
@@ -1766,166 +1355,151 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer)
return status;
} /* _scatter_nd_update_copy_initializer() */
-/*
- * Query kernel
- */
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
- vsi_nn_kernel_t* kernel,
- int32_t coord_dim,
- int32_t isBig
+ vsi_nn_kernel_t* kernel_reset,
+ vsi_nn_kernel_t* kernel_update,
+ vsi_nn_kernel_t* kernel_ref,
+ vsi_nn_kernel_t* kernel_copy,
+ int32_t coord_flg,
+ int32_t opt_flg
)
{
- vsi_status status = VSI_FAILURE;
+ vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_dtype_e input0_dtype = U8;
- vsi_nn_kernel_dtype_e input2_dtype = U8;
+ vsi_nn_kernel_dtype_e input2_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = U8;
+ vsi_nn_kernel_dtype_e acc_dtype = I32;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
- key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0, isBig );
+ if (input2_dtype == F16)
+ {
+ acc_dtype = F32;
+ }
+
+ key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, 0, output_dtype, 0, 0, 0);
- for ( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ )
+ for ( i = 0; i < _cnt_of_array(scatter_nd_update_reset_map); i ++ )
{
- if ( scatter_nd_update_map[i].key == key )
+ if ( scatter_nd_update_reset_map[i].key == key )
{
break;
}
}
- if ( i < _cnt_of_array(scatter_nd_update_map) )
+
+ if ( i < _cnt_of_array(scatter_nd_update_reset_map) )
{
- snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_map[i].function_name );
- kernel->info.parameters = _scatter_nd_update_kernel_param_def;
- kernel->info.numParams = _cnt_of_array( _scatter_nd_update_kernel_param_def );
- if (isBig)
- {
- kernel->info.initialize = _scatter_nd_update_big_initializer;
- }
- else
- {
- kernel->info.initialize = _scatter_nd_update_initializer;
- }
+ snprintf( kernel_reset->info.name, VX_MAX_KERNEL_NAME, "%s",
+ scatter_nd_update_reset_map[i].function_name );
+ kernel_reset->info.parameters = _scatter_nd_update_reset_kernel_param_def;
+ kernel_reset->info.numParams = _SCATTER_ND_UPDATE_RESET_PARAM_NUM;
+ kernel_reset->info.initialize = _scatter_nd_update_reset_initializer;
- vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+ vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
- scatter_nd_update_map[i].source_name );
- vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
- scatter_nd_update_map[i].source_name );
- status = VSI_SUCCESS;
+ scatter_nd_update_reset_map[i].source_name );
+ vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ scatter_nd_update_reset_map[i].source_name );
+ }
+ else
+ {
+ status = VSI_FAILURE;
}
- return status;
-} /* _query_kernel() */
-
-static vsi_status _query_kernel_large
- (
- vsi_nn_tensor_t* const* const inputs,
- vsi_nn_tensor_t* const* const outputs,
- vsi_nn_kernel_t* kernel_reset,
- vsi_nn_kernel_t* kernel_pre,
- vsi_nn_kernel_t* kernel
- )
-{
- vsi_status status = VSI_SUCCESS;
- vsi_nn_kernel_dtype_e input0_dtype = U8;
- vsi_nn_kernel_dtype_e input2_dtype = F16;
- vsi_nn_kernel_dtype_e output_dtype = U8;
- uint32_t key = 0;
- int i = 0;
-
- input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
- input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
- output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
- key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, I32, I32, 1, 1 );
+ key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, coord_flg, opt_flg);
- for ( i = 0; i < _cnt_of_array(scatter_nd_update_pre_map); i ++ )
+ for ( i = 0; i < _cnt_of_array(scatter_nd_update_update_map); i ++ )
{
- if ( scatter_nd_update_pre_map[i].key == key )
+ if ( scatter_nd_update_update_map[i].key == key )
{
break;
}
}
-
- if ( i < _cnt_of_array(scatter_nd_update_pre_map) )
+ if ( i < _cnt_of_array(scatter_nd_update_update_map) )
{
- snprintf( kernel_pre->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_pre_map[i].function_name );
- kernel_pre->info.parameters = _scatter_nd_update_pre_kernel_param_def;
- kernel_pre->info.numParams = _SCATTER_ND_UPDATE_PRE_PARAM_NUM;
- kernel_pre->info.initialize = _scatter_nd_update_pre_initializer;
+ snprintf( kernel_update->info.name, VX_MAX_KERNEL_NAME, "%s",
+ scatter_nd_update_update_map[i].function_name );
+ kernel_update->info.parameters = _scatter_nd_update_update_kernel_param_def;
+ kernel_update->info.numParams = _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM;
+ kernel_update->info.initialize = _scatter_nd_update_update_initializer;
- vsi_nn_kernel_add_source( kernel_pre, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+ vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
- scatter_nd_update_pre_map[i].source_name );
- vsi_nn_kernel_add_source( kernel_pre, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
- scatter_nd_update_pre_map[i].source_name );
+ scatter_nd_update_update_map[i].source_name );
+ vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ scatter_nd_update_update_map[i].source_name );
}
else
{
- status = VSI_FAILURE;
+ status |= VSI_FAILURE;
}
+ key = HASH_SCATTER_ND_UPDATE_KEY( 0, acc_dtype, output_dtype, 2, coord_flg, opt_flg);
- key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0, 1 );
-
- for ( i = 0; i < _cnt_of_array(scatter_nd_update_post_map); i ++ )
+ for ( i = 0; i < _cnt_of_array(scatter_nd_update_ref_map); i ++ )
{
- if ( scatter_nd_update_post_map[i].key == key )
+ if ( scatter_nd_update_ref_map[i].key == key )
{
break;
}
}
- if ( i < _cnt_of_array(scatter_nd_update_post_map) )
+
+ if ( i < _cnt_of_array(scatter_nd_update_ref_map) )
{
- snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_post_map[i].function_name );
- kernel->info.parameters = _scatter_nd_update_post_kernel_param_def;
- kernel->info.numParams = _SCATTER_ND_UPDATE_POST_PARAM_NUM;
- kernel->info.initialize = _scatter_nd_update_post_initializer;
+ snprintf( kernel_ref->info.name, VX_MAX_KERNEL_NAME, "%s",
+ scatter_nd_update_ref_map[i].function_name );
+ kernel_ref->info.parameters = _scatter_nd_update_ref_kernel_param_def;
+ kernel_ref->info.numParams = _SCATTER_ND_UPDATE_REF_PARAM_NUM;
+ kernel_ref->info.initialize = _scatter_nd_update_ref_initializer;
- vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+ vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
- scatter_nd_update_post_map[i].source_name );
- vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
- scatter_nd_update_post_map[i].source_name );
+ scatter_nd_update_ref_map[i].source_name );
+ vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ scatter_nd_update_ref_map[i].source_name );
}
else
{
- status |= VSI_FAILURE;
+ status = VSI_FAILURE;
}
- key = HASH_SCATTER_ND_UPDATE_KEY( I32, I32, I32, 2, 1 );
+ key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 3, 0, 0);
- for ( i = 0; i < _cnt_of_array(scatter_nd_update_reset_map); i ++ )
+ for ( i = 0; i < _cnt_of_array(scatter_nd_update_copy_map); i ++ )
{
- if ( scatter_nd_update_reset_map[i].key == key )
+ if ( scatter_nd_update_copy_map[i].key == key )
{
break;
}
}
- if ( i < _cnt_of_array(scatter_nd_update_reset_map) )
+ if ( i < _cnt_of_array(scatter_nd_update_copy_map) )
{
- snprintf( kernel_reset->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_reset_map[i].function_name );
- kernel_reset->info.parameters = _scatter_nd_update_reset_kernel_param_def;
- kernel_reset->info.numParams = _SCATTER_ND_UPDATE_RESET_PARAM_NUM;
- kernel_reset->info.initialize = _scatter_nd_update_reset_initializer;
+ snprintf( kernel_copy->info.name, VX_MAX_KERNEL_NAME, "%s",
+ scatter_nd_update_copy_map[i].function_name );
+ kernel_copy->info.parameters = _scatter_nd_update_copy_kernel_param_def;
+ kernel_copy->info.numParams = _SCATTER_ND_UPDATE_COPY_PARAM_NUM;
+ kernel_copy->info.initialize = _scatter_nd_update_copy_initializer;
- vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+ vsi_nn_kernel_add_source( kernel_copy, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
- scatter_nd_update_reset_map[i].source_name );
- vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
- scatter_nd_update_reset_map[i].source_name );
+ scatter_nd_update_copy_map[i].source_name );
+ vsi_nn_kernel_add_source( kernel_copy, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
+ scatter_nd_update_copy_map[i].source_name );
}
else
{
status |= VSI_FAILURE;
}
+
return status;
-} /* _query_kernel_large() */
+} /* _query_kernel() */
static vsi_status _query_kernel_special
(
@@ -1941,34 +1515,35 @@ static vsi_status _query_kernel_special
vsi_nn_kernel_dtype_e input2_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
- key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 3, 1 );
+ key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 4, 1, 0);
- for ( i = 0; i < _cnt_of_array(scatter_nd_update_ref_map); i ++ )
+ for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_ref_map); i ++ )
{
- if ( scatter_nd_update_ref_map[i].key == key )
+ if ( scatter_nd_update_special_ref_map[i].key == key )
{
break;
}
}
- if ( i < _cnt_of_array(scatter_nd_update_ref_map) )
+ if ( i < _cnt_of_array(scatter_nd_update_special_ref_map) )
{
- snprintf( kernel_ref->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_ref_map[i].function_name );
- kernel_ref->info.parameters = _scatter_nd_update_ref_kernel_param_def;
- kernel_ref->info.numParams = _SCATTER_ND_UPDATE_REF_PARAM_NUM;
- kernel_ref->info.initialize = _scatter_nd_update_ref_initializer;
+ snprintf( kernel_ref->info.name, VX_MAX_KERNEL_NAME, "%s",
+ scatter_nd_update_special_ref_map[i].function_name );
+ kernel_ref->info.parameters = _scatter_nd_update_special_ref_kernel_param_def;
+ kernel_ref->info.numParams = _SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM;
+ kernel_ref->info.initialize = _scatter_nd_update_special_ref_initializer;
vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
- scatter_nd_update_ref_map[i].source_name );
+ scatter_nd_update_special_ref_map[i].source_name );
vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
- scatter_nd_update_ref_map[i].source_name );
+ scatter_nd_update_special_ref_map[i].source_name );
}
else
{
@@ -1976,54 +1551,56 @@ static vsi_status _query_kernel_special
}
- key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 4, 1 );
+ key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 5, 1, 0);
- for ( i = 0; i < _cnt_of_array(scatter_nd_update_update_map); i ++ )
+ for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_update_map); i ++ )
{
- if ( scatter_nd_update_update_map[i].key == key )
+ if ( scatter_nd_update_special_update_map[i].key == key )
{
break;
}
}
- if ( i < _cnt_of_array(scatter_nd_update_update_map) )
+ if ( i < _cnt_of_array(scatter_nd_update_special_update_map) )
{
- snprintf( kernel_update->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_update_map[i].function_name );
- kernel_update->info.parameters = _scatter_nd_update_update_kernel_param_def;
- kernel_update->info.numParams = _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM;
- kernel_update->info.initialize = _scatter_nd_update_update_initializer;
+ snprintf( kernel_update->info.name, VX_MAX_KERNEL_NAME, "%s",
+ scatter_nd_update_special_update_map[i].function_name );
+ kernel_update->info.parameters = _scatter_nd_update_special_update_kernel_param_def;
+ kernel_update->info.numParams = _SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM;
+ kernel_update->info.initialize = _scatter_nd_update_special_update_initializer;
vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
- scatter_nd_update_update_map[i].source_name );
+ scatter_nd_update_special_update_map[i].source_name );
vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
- scatter_nd_update_update_map[i].source_name );
+ scatter_nd_update_special_update_map[i].source_name );
}
else
{
status |= VSI_FAILURE;
}
- key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 5, 1 );
+ key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0);
- for ( i = 0; i < _cnt_of_array(scatter_nd_update_copy_map); i ++ )
+ for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ )
{
- if ( scatter_nd_update_copy_map[i].key == key )
+ if ( scatter_nd_update_special_copy_map[i].key == key )
{
break;
}
}
- if ( i < _cnt_of_array(scatter_nd_update_copy_map) )
+ if ( i < _cnt_of_array(scatter_nd_update_special_copy_map) )
{
- snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_copy_map[i].function_name );
- kernel->info.parameters = _scatter_nd_update_copy_kernel_param_def;
- kernel->info.numParams = _SCATTER_ND_UPDATE_COPY_PARAM_NUM;
- kernel->info.initialize = _scatter_nd_update_copy_initializer;
+ snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s",
+ scatter_nd_update_special_copy_map[i].function_name );
+ kernel->info.parameters = _scatter_nd_update_special_copy_kernel_param_def;
+ kernel->info.numParams = _SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM;
+ kernel->info.initialize = _scatter_nd_update_special_copy_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
- scatter_nd_update_copy_map[i].source_name );
+ scatter_nd_update_special_copy_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
- scatter_nd_update_copy_map[i].source_name );
+ scatter_nd_update_special_copy_map[i].source_name );
}
else
{
@@ -2044,41 +1621,37 @@ static vsi_nn_kernel_node_t _setup
)
{
vsi_status status = VSI_FAILURE;
- vsi_nn_kernel_node_param_t tmp_params[_SCATTER_ND_UPDATE_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
+ vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0};
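+ /* strides[] replaces the former width/area/vol scalars; it is filled by
+  * get_scatter_nd_update_tensor_reshape_size() below and forwarded to the
+  * update/ref kernels, presumably so index decoding works for any coord_dim. */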
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" );
- vsi_size_t *input_size = inputs[2]->attr.size;
- uint32_t dims_num = inputs[2]->attr.dim_num;
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
- vsi_size_t width = 0, area = 0, vol = 0;
int32_t big_flg = 0;
vsi_nn_kernel_dtype_e update_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type);
vsi_nn_kernel_dtype_e ref_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
vsi_nn_kernel_dtype_e output_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type);
int32_t type_flg = ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16) &&
(update_dtype == ref_dtype && update_dtype == output_dtype)) ? 1 : 0;
- int32_t special_flg = (block_size % 16 == 0 && type_flg) ? 1 : 0;
+ int32_t special_flg = (block_size % 16 == 0 && type_flg && coord_dim <= 4) ? 1 : 0;
+ int32_t coord_flg = 0;
+ int32_t opt_flg = (block_size % 4 == 0) ? 1 : 0;
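+ /* special_flg selects the fast path: 8/16-bit data with matching dtypes, block_size a
+  * multiple of 16 and coord_dim <= 4. coord_flg and opt_flg (block_size divisible by 4)
+  * are folded into the kernel hash key of the generic path below. */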
int32_t i = 0;
int32_t isRepeat = 0;
+ vsi_nn_tensor_t * tensors[4] = { NULL };
+ vsi_nn_kernel_t * ikernels[3] = { NULL };
- if (coord_dim > 4 && input_size[dims_num - 1] > 1)
- {
- return NULL;
- }
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
status = get_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0,
- NULL, NULL, NULL, &rs_idx_dim, &big_flg);
+ NULL, &rs_idx_dim, &big_flg);
status |= get_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0,
- NULL, NULL, NULL, &rs_in_dim, &big_flg);
+ NULL, &rs_in_dim, &big_flg);
status |= get_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim,
- &width, &area, &vol, &rs_out_dim, &big_flg);
- if (status != VSI_SUCCESS)
- {
- return NULL;
- }
+ strides, &rs_out_dim, &big_flg);
+ CHECK_STATUS_FAIL_GOTO( status, final );
check_scatter_nd_update_index_repeat(inputs, coord_dim, block_size, idx_num, &isRepeat);
@@ -2087,11 +1660,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_tensor_attr_t attr;
vsi_nn_kernel_node_t tmp_node = NULL;
vsi_nn_kernel_node_t ref_node = NULL;
- vsi_nn_kernel_node_param_t ref_params[_SCATTER_ND_UPDATE_REF_PARAM_NUM] = { NULL };
- vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_UPDATE_PARAM_NUM] = { NULL };
- vsi_nn_kernel_node_param_t cpy_params[_SCATTER_ND_UPDATE_COPY_PARAM_NUM] = { NULL };
- vsi_nn_kernel_t * ikernels[2] = { NULL };
- vsi_nn_tensor_t * tensors[3] = { NULL };
+ vsi_nn_kernel_node_param_t ref_params[_SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM] = { NULL };
+ vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM] = { NULL };
+ vsi_nn_kernel_node_param_t cpy_params[_SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM] = { NULL };
ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
ikernels[0]->unique_id = kernel->unique_id;
@@ -2127,7 +1698,8 @@ static vsi_nn_kernel_node_t _setup
ref_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
- status = vsi_nn_kernel_node_pass_param( ref_node, ref_params, _SCATTER_ND_UPDATE_REF_PARAM_NUM );
+ status = vsi_nn_kernel_node_pass_param( ref_node, ref_params,
+ _SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &ref_params[0] );
}
@@ -2143,11 +1715,12 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t;
- node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
- node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area );
- node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] );
+ node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
- status = vsi_nn_kernel_node_pass_param( tmp_node, node_params, _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM );
+ status = vsi_nn_kernel_node_pass_param( tmp_node, node_params,
+ _SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &node_params[0] );
vsi_nn_kernel_tensor_release( &node_params[1] );
@@ -2166,7 +1739,7 @@ static vsi_nn_kernel_node_t _setup
cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t;
cpy_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
- status = vsi_nn_kernel_node_pass_param( node, cpy_params, _SCATTER_ND_UPDATE_COPY_PARAM_NUM );
+ status = vsi_nn_kernel_node_pass_param( node, cpy_params, _SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &cpy_params[2] );
}
@@ -2195,106 +1768,159 @@ static vsi_nn_kernel_node_t _setup
if (ref_node) {vsi_nn_kernel_node_release( &ref_node );}
if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );}
}
- else if ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16))
+ else
{
vsi_nn_tensor_attr_t attr;
- vsi_nn_kernel_node_t tmp_node = NULL;
vsi_nn_kernel_node_t reset_node = NULL;
- vsi_nn_kernel_node_param_t pre_params[_SCATTER_ND_UPDATE_PRE_PARAM_NUM] = { NULL };
- vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_POST_PARAM_NUM] = { NULL };
+ vsi_nn_kernel_node_t update_node = NULL;
+ vsi_nn_kernel_node_t ref_node = NULL;
vsi_nn_kernel_node_param_t reset_params[_SCATTER_ND_UPDATE_RESET_PARAM_NUM] = { NULL };
- vsi_nn_kernel_t * ikernels[2] = { NULL };
- vsi_nn_tensor_t * tensors[3] = { NULL };
+ vsi_nn_kernel_node_param_t ref_params[_SCATTER_ND_UPDATE_REF_PARAM_NUM] = { NULL };
+ vsi_nn_kernel_node_param_t update_params[_SCATTER_ND_UPDATE_UPDATE_PARAM_NUM] = { NULL };
+ vsi_nn_kernel_node_param_t cpy_params[_SCATTER_ND_UPDATE_COPY_PARAM_NUM] = { NULL };
+ int32_t width = 1;
+ int32_t res = 0;
ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
ikernels[0]->unique_id = kernel->unique_id;
ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
ikernels[1]->unique_id = kernel->unique_id;
+ ikernels[2] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
+ ikernels[2]->unique_id = kernel->unique_id;
memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
- attr.dtype.vx_type = VSI_NN_TYPE_INT32;
- attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
+ attr.dtype = outputs[0]->attr.dtype;
attr.is_const = FALSE;
attr.vtl = TRUE;
for (i = 0; i < rs_out_dim; i++)
{
attr.size[i] = shapes[2][i];
+ width *= (int32_t)shapes[2][i];
}
attr.dim_num = rs_out_dim;
- tensors[0] = vsi_nn_CreateTensor( graph, &attr );
+ res = width % 8;
+ width = (width >> 3) << 3;
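+ /* width now holds the element count rounded down to a multiple of 8 and res the
+  * 0..7 leftover elements; both are passed to the reset and copy kernels, which
+  * presumably process 8 elements per work-item. */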
+
+ tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref'
+ attr.dtype = inputs[2]->attr.dtype;
+ attr.dtype.vx_type = VSI_NN_TYPE_INT32;
+ attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
+ if (update_dtype == F16)
+ {
+ attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
+ }
+ tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // temp_buf_int
attr.size[0] = 1;
- tensors[1] = vsi_nn_CreateTensor( graph, &attr );
attr.size[1] = 1;
- tensors[2] = vsi_nn_CreateTensor( graph, &attr );
+ tensors[2] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0
+ tensors[3] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer1
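+ /* Generic path builds a four-node chain: reset_node casts ref into tensors[0] (output
+  * dtype), update_node and ref_node consume the indices/updates together with the
+  * stride scalars, and the final node copies the accumulated result to the output.
+  * tensors[2]/tensors[3] are 1x1 buffers that appear to serve only as links between
+  * the nodes. */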
- status = _query_kernel_large( inputs, outputs, ikernels[0], ikernels[1], kernel);
+ status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], ikernels[2], kernel, coord_flg, opt_flg);
if ( VSI_SUCCESS == status)
{
- // reset count
+ // convert ref to output
reset_node = vsi_nn_kernel_create_node( graph, ikernels[0] );
if (reset_node)
{
uint32_t index = 0;
/* Pass parameters to node. */
- reset_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim );
+ reset_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
reset_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
reset_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
+ reset_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
+ reset_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
status = vsi_nn_kernel_node_pass_param( reset_node, reset_params, _SCATTER_ND_UPDATE_RESET_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &reset_params[0] );
+ vsi_nn_kernel_scalar_release( &reset_params[3] );
+ vsi_nn_kernel_scalar_release( &reset_params[4] );
}
- // pre-process
- tmp_node = vsi_nn_kernel_create_node( graph, ikernels[1] );
- if (tmp_node)
+ // update
+ update_node = vsi_nn_kernel_create_node( graph, ikernels[1] );
+ if (update_node)
+ {
+ uint32_t index = 0;
+ /* Pass parameters to node. */
+ update_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim );
+ update_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim );
+ update_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
+ update_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t;
+ update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] );
+ update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] );
+ update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] );
+ update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[3] );
+ update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[4] );
+ update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[5] );
+ update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[6] );
+ update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
+ status = vsi_nn_kernel_node_pass_param( update_node, update_params,
+ _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM );
+ CHECK_STATUS(status);
+ vsi_nn_kernel_tensor_release( &update_params[0] );
+ vsi_nn_kernel_tensor_release( &update_params[1] );
+ vsi_nn_kernel_scalar_release( &update_params[4] );
+ vsi_nn_kernel_scalar_release( &update_params[5] );
+ vsi_nn_kernel_scalar_release( &update_params[6] );
+ vsi_nn_kernel_scalar_release( &update_params[7] );
+ vsi_nn_kernel_scalar_release( &update_params[8] );
+ vsi_nn_kernel_scalar_release( &update_params[9] );
+ vsi_nn_kernel_scalar_release( &update_params[10] );
+ vsi_nn_kernel_scalar_release( &update_params[11] );
+ }
+
+ // ref
+ ref_node = vsi_nn_kernel_create_node( graph, ikernels[2] );
+ if (ref_node)
{
uint32_t index = 0;
/* Pass parameters to node. */
- pre_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim );
- pre_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim );
- pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
- pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
- pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t;
- pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
- pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area );
- pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol );
- pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
- status = vsi_nn_kernel_node_pass_param( tmp_node, pre_params, _SCATTER_ND_UPDATE_PRE_PARAM_NUM );
+ ref_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim );
+ ref_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim );
+ ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
+ ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
+ ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t;
+ ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[3]->t;
+ ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] );
+ ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] );
+ ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] );
+ ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[3] );
+ ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[4] );
+ ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[5] );
+ ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[6] );
+ ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
+ status = vsi_nn_kernel_node_pass_param( ref_node, ref_params, _SCATTER_ND_UPDATE_REF_PARAM_NUM );
CHECK_STATUS(status);
- vsi_nn_kernel_tensor_release( &pre_params[0] );
- vsi_nn_kernel_tensor_release( &pre_params[1] );
- vsi_nn_kernel_scalar_release( &pre_params[5] );
- vsi_nn_kernel_scalar_release( &pre_params[6] );
- vsi_nn_kernel_scalar_release( &pre_params[7] );
- vsi_nn_kernel_scalar_release( &pre_params[8] );
+ vsi_nn_kernel_tensor_release( &ref_params[0] );
+ vsi_nn_kernel_tensor_release( &ref_params[1] );
+ vsi_nn_kernel_scalar_release( &ref_params[6] );
+ vsi_nn_kernel_scalar_release( &ref_params[7] );
+ vsi_nn_kernel_scalar_release( &ref_params[8] );
+ vsi_nn_kernel_scalar_release( &ref_params[9] );
+ vsi_nn_kernel_scalar_release( &ref_params[10] );
+ vsi_nn_kernel_scalar_release( &ref_params[11] );
+ vsi_nn_kernel_scalar_release( &ref_params[12] );
+ vsi_nn_kernel_scalar_release( &ref_params[13] );
}
+ // copy to output
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 0;
/* Pass parameters to node. */
- node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
- node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
- node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t;
- node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t;
- node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim );
- node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
- node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
- node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area );
- node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol );
- node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
- status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ND_UPDATE_POST_PARAM_NUM );
+ cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t;
+ cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[3]->t;
+ cpy_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
+ cpy_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
+ cpy_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res );
+ status = vsi_nn_kernel_node_pass_param( node, cpy_params, _SCATTER_ND_UPDATE_COPY_PARAM_NUM );
CHECK_STATUS(status);
- vsi_nn_kernel_tensor_release( &node_params[0] );
- vsi_nn_kernel_tensor_release( &node_params[4] );
- vsi_nn_kernel_tensor_release( &node_params[5] );
- vsi_nn_kernel_scalar_release( &node_params[6] );
- vsi_nn_kernel_scalar_release( &node_params[7] );
- vsi_nn_kernel_scalar_release( &node_params[8] );
- vsi_nn_kernel_scalar_release( &node_params[9] );
+ vsi_nn_kernel_tensor_release( &cpy_params[2] );
+ vsi_nn_kernel_scalar_release( &cpy_params[3] );
+ vsi_nn_kernel_scalar_release( &cpy_params[4] );
}
}
@@ -2306,6 +1932,10 @@ static vsi_nn_kernel_node_t _setup
{
vsi_nn_kernel_release( &ikernels[1] );
}
+ if ( ikernels[2] )
+ {
+ vsi_nn_kernel_release( &ikernels[2] );
+ }
if ( tensors[0] )
{
vsi_nn_ReleaseTensor( &tensors[0] );
@@ -2318,41 +1948,33 @@ static vsi_nn_kernel_node_t _setup
{
vsi_nn_ReleaseTensor( &tensors[2] );
}
+ if ( tensors[3] )
+ {
+ vsi_nn_ReleaseTensor( &tensors[3] );
+ }
+ if (ref_node) {vsi_nn_kernel_node_release( &ref_node );}
if (reset_node) {vsi_nn_kernel_node_release( &reset_node );}
- if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );}
+ if (update_node) {vsi_nn_kernel_node_release( &update_node );}
}
- else
+
+final:
+ if (ikernels[0])
{
- status = _query_kernel( inputs, outputs, kernel, coord_dim, big_flg);
- if ( VSI_SUCCESS == status)
- {
- node = vsi_nn_kernel_create_node( graph, kernel );
- if ( node )
- {
- uint32_t index = 0;
- /* Pass parameters to node. */
- tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
- tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim );
- tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim );
- //tmp_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
- tmp_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
- tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
- tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area );
- tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol );
- tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
- status = vsi_nn_kernel_node_pass_param( node, tmp_params, _SCATTER_ND_UPDATE_PARAM_NUM );
- CHECK_STATUS(status);
- vsi_nn_kernel_tensor_release( &tmp_params[0] );
- vsi_nn_kernel_tensor_release( &tmp_params[1] );
- vsi_nn_kernel_tensor_release( &tmp_params[2] );
- vsi_nn_kernel_tensor_release( &tmp_params[3] );
- vsi_nn_kernel_scalar_release( &tmp_params[4] );
- vsi_nn_kernel_scalar_release( &tmp_params[5] );
- vsi_nn_kernel_scalar_release( &tmp_params[6] );
- vsi_nn_kernel_scalar_release( &tmp_params[7] );
- }
- }
+ vsi_nn_kernel_release(&ikernels[0]);
+ }
+ if (ikernels[1])
+ {
+ vsi_nn_kernel_release(&ikernels[1]);
+ }
+ if (ikernels[2])
+ {
+ vsi_nn_kernel_release(&ikernels[2]);
}
+ vsi_safe_release_tensor(tensors[0]);
+ vsi_safe_release_tensor(tensors[1]);
+ vsi_safe_release_tensor(tensors[2]);
+ vsi_safe_release_tensor(tensors[3]);
+
return node;
} /* _setup() */
diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c
index fae6ad78c..b918e2c08 100644
--- a/src/tim/vx/internal/src/kernel/evis/select_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c
@@ -34,6 +34,7 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
+#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
@@ -61,6 +62,10 @@ typedef enum _internal_img_dim_e
CVIVANTE_NAMESPACE("evis.select_"STR(COND_DTYPE)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
_SELECT_KERNEL_SOURCE}
+#define _INPUT_NUM (3)
+#define _OUTPUT_NUM (1)
+#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
+
typedef struct
{
uint32_t key;
@@ -138,7 +143,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
(( IN0_TYPE << 24) | ( IN1_TYPE << 16) | ( OUT_TYPE << 8))
#define MAX_MULTIPLIER_NUM (65535)
#define MAX_POST_SHIFT_BITS (31)
- vsi_status status = VX_SUCCESS;
+ vsi_status status = VSI_FAILURE;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
3,
@@ -166,6 +171,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
uint16_t in1_M0 = 0;
int32_t in1_postShift = 0;
uint32_t pack_key = 0;
+
+ VSI_UNREFERENCED(param_size);
input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0);
CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1);
@@ -444,15 +451,67 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_SELECT_PARAM_NUM] = {NULL};
vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL;
+ vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL };
+ vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
+ vsi_size_t* shapes_ptr[_IO_NUM];
+ vsi_size_t* shapes_in[_INPUT_NUM];
+ vsi_size_t rank_in[_INPUT_NUM];
+ uint32_t new_rank = 0;
+ uint32_t i = 0;
+ vsi_bool ret = FALSE;
+
+ VSI_UNREFERENCED(params);
+
+ for (i = 0; i < _IO_NUM; i++)
+ {
+ shapes_ptr[i] = shapes[i];
+ }
+
+ for (i = 0; i < _INPUT_NUM; i++)
+ {
+ shapes_in[i] = inputs[i]->attr.size;
+ rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num;
+ }
+
+ ret = vsi_nn_kernel_optimize_broadcast_shape(
+ (const vsi_size_t**)shapes_in, rank_in, _INPUT_NUM,
+ outputs[0]->attr.size, outputs[0]->attr.dim_num,
+ shapes_ptr, shapes[_INPUT_NUM], &new_rank);
+
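+ /* If a collapsed, broadcast-friendly layout was found, create reshaped views of the
+  * three inputs and the output; otherwise fall back to the original tensors. */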
+ if ( ret )
+ {
+ for (i = 0; i < _INPUT_NUM; i++)
+ {
+ reshape_tensors[i] = vsi_nn_reshape_tensor( graph,
+ inputs[i], shapes[i], new_rank );
+ }
+
+ for (i = 0; i < _OUTPUT_NUM; i++)
+ {
+ reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( graph,
+ outputs[i], shapes[i + _INPUT_NUM], new_rank );
+ }
+ }
+ else
+ {
+ for (i = 0; i < _INPUT_NUM; i++)
+ {
+ reshape_tensors[i] = inputs[i];
+ }
+ for (i = 0; i < _OUTPUT_NUM; i++)
+ {
+ reshape_tensors[i + _INPUT_NUM] = outputs[i];
+ }
+ }
- if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
- outputs[0]->attr.dim_num ) )
+ if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[3]->attr.size,
+ reshape_tensors[3]->attr.dim_num ) )
{
return NULL;
}
- image_2d = (outputs[0]->attr.dim_num == 2);
- status = _query_kernel( kernel, inputs, outputs, image_2d);
+ image_2d = (reshape_tensors[3]->attr.dim_num == 2);
+ status = _query_kernel( kernel, inputs, &reshape_tensors[3], image_2d);
if ( VSI_SUCCESS == status)
{
@@ -460,12 +519,22 @@ static vsi_nn_kernel_node_t _setup
if ( node )
{
/* Set inputs and outputs */
+
vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM,
- inputs, input_num, outputs, output_num );
+ &reshape_tensors[0], input_num, &reshape_tensors[3], output_num );
+
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _SELECT_PARAM_NUM );
}
}
+ if (ret)
+ {
+ for (i = 0; i < _IO_NUM; i++)
+ {
+ vsi_safe_release_tensor( reshape_tensors[i] );
+ }
+ }
+
return node;
} /* _setup() */
diff --git a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c
index 5d7e2d6cf..b2e22ed7c 100644
--- a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c
@@ -123,6 +123,8 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer)
int32_t output_zp = 0;
int32_t input_zp = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -252,7 +254,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -299,9 +301,11 @@ static int32_t _optimize_mask_shape
vsi_status status = VSI_SUCCESS;
vsi_size_t in_shape[VSI_NN_MAX_DIM_NUM] = {0};
vsi_size_t new_rank = 0;
- uint32_t i = 0;
+ vsi_size_t i = 0;
+
+ VSI_UNREFERENCED(outputs);
- for(i = 0; i < inputs[0]->attr.dim_num; i++)
+ for (i = 0; i < (vsi_size_t)inputs[0]->attr.dim_num; i++)
{
in_shape[i] = inputs[0]->attr.size[i];
}
@@ -313,7 +317,7 @@ static int32_t _optimize_mask_shape
}
opt_shape_out[0] = max_len;
- for(i = 0; i < (uint32_t)new_rank; i++)
+ for (i = 0; i < new_rank; i++)
{
opt_shape_out[i + 1] = opt_shape_in[i];
}
@@ -344,6 +348,9 @@ static vsi_nn_kernel_node_t _setup
int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" );
int32_t is2Dflg = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c b/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c
index bcfe0d01c..6fca37fce 100644
--- a/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c
@@ -95,7 +95,10 @@ DEF_KERNEL_INITIALIZER(_signal_frame_initializer)
vsi_nn_kernel_tensor_attr_t * attr = NULL;
vsi_size_array_t * out_shape = NULL;
+ VSI_UNREFERENCED(param_size);
+
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+ CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
out_shape = attr->shape;
gpu_param.global_scale[0] = 16;
diff --git a/src/tim/vx/internal/src/kernel/evis/slice_evis.c b/src/tim/vx/internal/src/kernel/evis/slice_evis.c
index 883947073..773d38b0d 100644
--- a/src/tim/vx/internal/src/kernel/evis/slice_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/slice_evis.c
@@ -162,6 +162,8 @@ DEF_KERNEL_INITIALIZER(_slice_initializer)
int32_t is_samefl = 0;
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
@@ -409,6 +411,8 @@ static vsi_nn_kernel_node_t _setup
vsi_size_t output_batch = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
vsi_bool is_same_quant = FALSE;
+ VSI_UNREFERENCED(params);
+
vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num,
shapes[0], &rank[0]);
vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num,
diff --git a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c
index 2b9d53e94..f95405aca 100644
--- a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c
@@ -125,6 +125,8 @@ DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer)
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -273,7 +275,9 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
- int i = 0;
+ size_t i = 0;
+
+ VSI_UNREFERENCED(params);
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@@ -323,6 +327,9 @@ static vsi_nn_kernel_node_t _setup
int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" );
int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 1 : 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
diff --git a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c
index 46595a170..f31de5495 100644
--- a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c
@@ -165,6 +165,8 @@ DEF_KERNEL_INITIALIZER(_get_matrix_initializer)
float output_h = 1.0f;
float scale[4] = {0};
+ VSI_UNREFERENCED(param_size);
+
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final );
@@ -256,6 +258,8 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer)
float output_scale = 1.0f;
float output_zp = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
@@ -309,7 +313,6 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer)
gpu_param.global_size[1] = out_shape->data[1];
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
- do
{
gpu_dp_inst_t uniConvertDatatoF32_0_4x4 = {{
0x01010101, // TCfg
@@ -369,7 +372,7 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer)
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
}
CHECK_STATUS_FAIL_GOTO(status, final );
- }while(0);
+ }
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
@@ -502,6 +505,9 @@ static vsi_nn_kernel_node_t _setup
float output_h = (float)outputs[0]->attr.size[1];
int32_t i = 0;
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+
if (align_corners && output_w > 1)
{
output_w = output_w - 1;
@@ -565,42 +571,46 @@ static vsi_nn_kernel_node_t _setup
// Get Matrix
node = vsi_nn_kernel_create_node( graph, ikernels[MATRIX_INDEX] );
- vsi_nn_kernel_node_pack_io( node_params, _GET_MATRIX_PARAM_NUM,
- &inputs[1], 1, &tensors[0], 1 );
- node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_1 );
- node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 );
- node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 );
- node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 );
- node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 );
- node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 );
- node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 );
- node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 );
- node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 );
- node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 );
- node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 );
- node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 );
- node_params[I_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &input_w );
- node_params[I_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &input_h );
- node_params[O_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &output_w );
- node_params[O_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &output_h );
- status = vsi_nn_kernel_node_pass_param( node, node_params, _GET_MATRIX_PARAM_NUM );
- vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] );
- vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] );
- vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] );
- vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] );
- vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] );
- vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] );
- vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] );
- vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] );
- vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] );
- vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] );
- vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] );
- vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] );
- vsi_nn_kernel_scalar_release( &node_params[I_WIDTH] );
- vsi_nn_kernel_scalar_release( &node_params[I_HEIGHT] );
- vsi_nn_kernel_scalar_release( &node_params[O_WIDTH] );
- vsi_nn_kernel_scalar_release( &node_params[O_HEIGHT] );
- vsi_nn_kernel_node_release( &node );
+
+ if (node)
+ {
+ vsi_nn_kernel_node_pack_io( node_params, _GET_MATRIX_PARAM_NUM,
+ &inputs[1], 1, &tensors[0], 1 );
+ node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_1 );
+ node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 );
+ node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 );
+ node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 );
+ node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 );
+ node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 );
+ node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 );
+ node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 );
+ node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 );
+ node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 );
+ node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 );
+ node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 );
+ node_params[I_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &input_w );
+ node_params[I_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &input_h );
+ node_params[O_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &output_w );
+ node_params[O_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &output_h );
+ status = vsi_nn_kernel_node_pass_param( node, node_params, _GET_MATRIX_PARAM_NUM );
+ vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] );
+ vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] );
+ vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] );
+ vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] );
+ vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] );
+ vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] );
+ vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] );
+ vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] );
+ vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] );
+ vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] );
+ vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] );
+ vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] );
+ vsi_nn_kernel_scalar_release( &node_params[I_WIDTH] );
+ vsi_nn_kernel_scalar_release( &node_params[I_HEIGHT] );
+ vsi_nn_kernel_scalar_release( &node_params[O_WIDTH] );
+ vsi_nn_kernel_scalar_release( &node_params[O_HEIGHT] );
+ vsi_nn_kernel_node_release( &node );
+ }
// Warp Affine
node = vsi_nn_kernel_create_node( graph, ikernels[WARP_AFFINE_INDEX] );
@@ -617,19 +627,26 @@ static vsi_nn_kernel_node_t _setup
border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point;
}
status = vsi_nn_kernel_node_set_border( node, &border );
- VSI_ASSERT( status == VSI_SUCCESS );
+ if ( VSI_SUCCESS != status )
+ {
+ goto final;
+ }
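+ /* Border configured successfully: pack the warp-affine I/O and pass the parameters;
+  * any failure from here jumps to the shared cleanup label. */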
+ vsi_nn_kernel_node_pack_io( warp_affine_node_params, _WARP_AFFINE_PARAM_NUM,
+ warp_affine_tensors, 2, outputs, 1 );
+ status = vsi_nn_kernel_node_pass_param( node, warp_affine_node_params, _WARP_AFFINE_PARAM_NUM );
+ if ( VSI_SUCCESS != status )
+ {
+ goto final;
+ }
}
- vsi_nn_kernel_node_pack_io( warp_affine_node_params, _WARP_AFFINE_PARAM_NUM,
- warp_affine_tensors, 2, outputs, 1 );
- status = vsi_nn_kernel_node_pass_param( node, warp_affine_node_params, _WARP_AFFINE_PARAM_NUM );
final:
- for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
+ for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
{
- if( ikernels[i] )
+ if ( ikernels[i] )
{
vsi_nn_kernel_release( &ikernels[i] );
}
- if( tensors[i] )
+ if ( tensors[i] )
{
vsi_nn_ReleaseTensor( &tensors[i] );
}
diff --git a/src/tim/vx/internal/src/kernel/evis/swish_evis.c b/src/tim/vx/internal/src/kernel/evis/swish_evis.c
index 724037575..befe6ac74 100644
--- a/src/tim/vx/internal/src/kernel/evis/swish_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/swish_evis.c
@@ -154,7 +154,7 @@ DEF_KERNEL_INITIALIZER(_swish_initializer)
size_t param_size
)
{
- vsi_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
3,
@@ -177,6 +177,8 @@ DEF_KERNEL_INITIALIZER(_swish_initializer)
vsi_size_array_t *out_shape = NULL;
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input);
CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -365,7 +367,7 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer)
size_t param_size
)
{
- vsi_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
3,
@@ -387,6 +389,8 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer)
vsi_size_array_t *out_shape = NULL;
uint32_t pack_key = 0;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input);
CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@@ -649,6 +653,9 @@ static vsi_nn_kernel_node_t _setup
int32_t swish_type = vsi_nn_kernel_param_get_int32( params, "type" );
float beta = 1.0f;
vsi_bool ret = FALSE;
+
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
#if (VX_ACTIVATION_EXT_SUPPORT)
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
{
diff --git a/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c b/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c
index 15854526a..4a57905ce 100644
--- a/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c
@@ -123,6 +123,8 @@ DEF_KERNEL_INITIALIZER(_tensorstackconcat_initializer)
vsi_size_array_t * in_shape = NULL;
// Add initializer
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@@ -225,6 +227,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
+ VSI_UNREFERENCED(params);
+
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
status = _query_kernel( kernel, inputs, outputs, image_2d );
if ( VSI_SUCCESS == status)
diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c
index 50e43cf81..f46941aff 100644
--- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c
@@ -272,6 +272,8 @@ DEF_KERNEL_INITIALIZER(_tile_initializer)
int32_t output_ZP = 0;
int32_t input_ZP = 0;
+ VSI_UNREFERENCED(param_size);
+
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -408,7 +410,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
- int32_t i = 0;
+ size_t i = 0;
int32_t dim0_size1 = inputs[0]->attr.size[0] == 1 ? 1 : 0;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
@@ -497,6 +499,11 @@ static vsi_nn_kernel_node_t _setup
uint32_t dim = inputs[0]->attr.dim_num;
vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 0 };
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+
for ( i = 0; i < dim; i++)
{
multiples[i] = outputs[0]->attr.size[i] / inputs[0]->attr.size[i];
@@ -515,10 +522,34 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
- reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
- inputs[0], shapes[0], new_rank );
- reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
- outputs[0], shapes[2], new_rank );
+ if ( new_rank == 4)
+ {
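+ /* Fold dims 2 and 3 into one so the tile kernels only see a rank-3 shape; bail out
+  * if the merged extent would exceed the GPU image width limit. */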
+ vsi_size_t newshapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
+ newshapes[0][0] = shapes[0][0];
+ newshapes[2][0] = shapes[2][0];
+ newshapes[0][1] = shapes[0][1];
+ newshapes[2][1] = shapes[2][1];
+ newshapes[0][2] = shapes[0][2] * shapes[0][3];
+ newshapes[2][2] = shapes[2][2] * shapes[2][3];
+
+ if (newshapes[0][2] >= GPU_TENSOR_MAX_WIDTH ||
+ newshapes[2][2] >= GPU_TENSOR_MAX_WIDTH)
+ {
+ return NULL;
+ }
+
+ reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
+ inputs[0], newshapes[0], 3 );
+ reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
+ outputs[0], newshapes[2], 3 );
+ }
+ else
+ {
+ reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
+ inputs[0], shapes[0], new_rank );
+ reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
+ outputs[0], shapes[2], new_rank );
+ }
}
else
{
@@ -532,7 +563,7 @@ static vsi_nn_kernel_node_t _setup
}
remainder = reshape_tensors[0]->attr.size[0] % 8;
- image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1);
+ image_2d = reshape_tensors[0]->attr.dim_num == 2;
status = _query_kernel( &reshape_tensors[0], &reshape_tensors[1], image_2d, remainder, kernel );
if( VSI_SUCCESS == status)
{
@@ -540,9 +571,9 @@ static vsi_nn_kernel_node_t _setup
if( node )
{
/* Pass parameters to node. */
- vsi_size_t depthIn = new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1;
- vsi_size_t depthOut = new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1;
- vsi_size_t batchIn = new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1;
+ vsi_size_t depthIn = new_rank > 2 ? shapes[0][2] : 1;
+ vsi_size_t depthOut = new_rank > 2 ? shapes[2][2] : 1;
+ vsi_size_t batchIn = new_rank > 3 ? shapes[0][3] : 1;
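+ /* depthIn/depthOut/batchIn are read from the pre-collapse shapes[] rather than from
+  * the reshaped tensors, since the rank-4 case above merges dims 2 and 3. */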
shapes[1][2] = shapes[1][2] == 0 ? 1 : shapes[1][2];
shapes[1][3] = shapes[1][3] == 0 ? 1 : shapes[1][3];
diff --git a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c
index 0ac1b6d28..fb78c4905 100644
--- a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c
@@ -162,6 +162,8 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer)
float factorOut = 1.0f;
vsi_bool image_2d = FALSE;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
axis_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
diff --git a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c
index 27a478b0e..6bc113f3c 100644
--- a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c
+++ b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c
@@ -152,6 +152,8 @@ DEF_KERNEL_INITIALIZER(_upsamplescale_initializer)
uint32_t pack_key = 0;
_internal_upscale_e flag = UP_ORG;
+ VSI_UNREFERENCED(param_size);
+
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@@ -344,7 +346,7 @@ static vsi_status _query_kernel
_internal_upscale_e flag = (stride == 2 && scale >= 0 ) ? UP_K2 : UP_ORG;
uint32_t key = 0;
- int i;
+ size_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c
index aa05c359d..83334269c 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c
@@ -136,6 +136,10 @@ static vsi_status VX_CALLBACK _kernel_validator
vx_meta_format metas[]
)
{
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(parameters);
+ VSI_UNREFERENCED(num);
+ VSI_UNREFERENCED(metas);
return VSI_SUCCESS;
} /* _kernel_validator() */
@@ -146,6 +150,9 @@ static vsi_status VX_CALLBACK _kernel_initializer
uint32_t paraNum
)
{
+ VSI_UNREFERENCED(nodObj);
+ VSI_UNREFERENCED(paramObj);
+ VSI_UNREFERENCED(paraNum);
return VSI_SUCCESS;
} /* _kernel_initializer() */
@@ -156,6 +163,9 @@ static vsi_status VX_CALLBACK _kernel_deinitializer
uint32_t paraNum
)
{
+ VSI_UNREFERENCED(nodObj);
+ VSI_UNREFERENCED(paraObj);
+ VSI_UNREFERENCED(paraNum);
return VSI_SUCCESS;
} /* _kernel_deinitializer() */
@@ -287,6 +297,9 @@ static const uint8_t* _load_internal_executable
vsi_nn_kernel_type_e type
)
{
+ VSI_UNREFERENCED(source_name);
+ VSI_UNREFERENCED(size);
+ VSI_UNREFERENCED(type);
#if VSI_USE_VXC_BINARY
switch( type )
{
@@ -518,8 +531,10 @@ static vx_program _create_program_from_executable
program_info.data = _load_internal_executable(
source_info->data[0], &program_info.size, kernel->type);
+ CHECK_PTR_FAIL_GOTO( program_info.data, "Create buffer fail.", final );
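+ /* The guard above bails out to "final" when the embedded executable cannot be
+  * loaded, so vxCreateProgramWithBinary is never called with a NULL buffer. */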
program = vxCreateProgramWithBinary( graph->ctx->c,
(const vx_uint8 *)program_info.data, program_info.size );
+final:
return program;
} /* _create_program_from_executable() */
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c
index ecbdccf06..26c918079 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c
@@ -113,6 +113,12 @@ static vsi_size_t eltwise_fill_dim
vsi_size_t divisor = 0;
vsi_size_t remainder = 0;
compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor );
+ if (divisor == 0)
+ {
+ VSILOGE( "divisor might be used in a division by zero." );
+ cost_size = (vsi_size_t)-1;
+ goto final;
+ }
remainder = size_output / divisor;
if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank )
{
@@ -152,6 +158,7 @@ static vsi_size_t eltwise_fill_dim
shape_output[rank + 1] = remainder;
}
}
+final:
return cost_size;
} /* eltwise_fill_dim() */
@@ -177,11 +184,11 @@ vsi_bool vsi_nn_kernel_optimize_eltwise_shape
eltwise_broadcast_state_e prv_state = ELTWISE_BROADCAST_STATE_EMPTY;
#define _swap_size(a, b, tmp) \
- do { \
+ { \
tmp = a; \
a = b; \
b = tmp; \
- } while(0)
+ }
for( i = 0; i < rank_output; i++ )
{
sx = i < rank_x ? shape_x[i] : 1;
@@ -352,6 +359,12 @@ static vsi_size_t broadcast_fill_dim
vsi_size_t divisor = 0;
vsi_size_t remainder = 0;
compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor );
+ if (divisor == 0)
+ {
+ VSILOGE( "divisor might be used in a division by zero." );
+ cost_size = (vsi_size_t)-1;
+ goto final;
+ }
remainder = size_output / divisor;
if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank )
{
@@ -386,6 +399,7 @@ static vsi_size_t broadcast_fill_dim
shape_output[rank + 1] = remainder;
}
}
+final:
return cost_size;
} /* broadcast_fill_dim() */
@@ -412,11 +426,11 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape
int32_t prv_state_mask = -1;
#define _swap_size(a, b, tmp) \
- do { \
+ { \
tmp = a; \
a = b; \
b = tmp; \
- } while(0)
+ }
if (input_num > MAX_INPUT_NUM)
{
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c
index f3a8f4fce..18919b4d5 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c
@@ -28,6 +28,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_math.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
+#include "kernel/vsi_nn_kernel_eltwise.h"
static vsi_bool compute_gpu_divisor
(
@@ -84,6 +85,12 @@ static vsi_size_t element_fill_dim
vsi_size_t divisor = 0;
vsi_size_t remainder = 0;
compute_gpu_divisor( size_x, max_rank, 1, &divisor );
+ if (divisor == 0)
+ {
+ VSILOGE( "divisor might be used in a division by zero." );
+ cost_size = (vsi_size_t)-1;
+ goto final;
+ }
remainder = size_x / divisor;
if ( remainder > max_rank || rank_x >= max_rank)
{
@@ -109,6 +116,7 @@ static vsi_size_t element_fill_dim
}
}
}
+final:
return cost_size;
} /* element_fill_dim() */
@@ -132,6 +140,9 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape
vsi_size_t outerSize = 1;
vsi_size_t axisSize = 1;
+ VSI_UNREFERENCED(shape_output);
+ VSI_UNREFERENCED(rank_output);
+
for (i = 0; i < axis_size; i++)
{
axisSize *= shape_x[axis[i]];
@@ -391,6 +402,12 @@ static vsi_size_t tile_fill_dim
vsi_size_t divisor = 0;
vsi_size_t remainder = 0;
compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor );
+ if (divisor == 0)
+ {
+ VSILOGE( "divisor might be used in a division by zero." );
+ cost_size = (vsi_size_t)-1;
+ goto final;
+ }
remainder = size_output / divisor;
if ( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank )
{
@@ -430,6 +447,7 @@ static vsi_size_t tile_fill_dim
shape_output[rank + 1] = remainder;
}
}
+final:
return cost_size;
} /* eltwise_fill_dim() */
@@ -442,35 +460,126 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
vsi_size_t* out_shape_output, vsi_size_t* out_rank_output
)
{
- vsi_bool ret = TRUE;
- vsi_bool append_dim = FALSE;
- vsi_size_t i = 0;
- vsi_size_t dims = 0;
+ vsi_bool ret = TRUE;
+ vsi_bool append_dim = FALSE;
+ vsi_size_t i = 0;
+ vsi_size_t j = 0;
+ vsi_size_t dims = 0;
vsi_size_t effective_size_x = 1;
vsi_size_t effective_size_y = 1;
vsi_size_t effective_size_z = 1;
vsi_size_t sx = 0;
vsi_size_t sy = 0;
vsi_size_t sz = 0;
+ int32_t idx_start = -1;
+ int32_t idx_end = 0;
tile_axis_state_e state = TILE_STATE_EMPTY;
tile_axis_state_e next_state = TILE_STATE_EMPTY;
+ vsi_size_t* temp_shape_x = NULL;
+ vsi_size_t* temp_shape_y = NULL;
+ vsi_size_t* temp_shape_output = NULL;
+ vsi_size_t temp_rank = 0;
#define _swap_size(a, b, tmp) \
- do { \
+ { \
tmp = a; \
a = b; \
b = tmp; \
- } while(0)
- for( i = 0; i < rank_output; i++ )
+ }
+
+ VSI_UNREFERENCED(rank_x);
+ VSI_UNREFERENCED(rank);
+
+ temp_shape_x = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t));
+ if (temp_shape_x == NULL)
{
- sx = shape_x[i];
- sy = multiples[i];
- sz = shape_output[i];
+ VSILOGE( "malloc temp_shape_x error." );
+ ret = FALSE;
+ goto final;
+ }
+
+ temp_shape_y = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t));
+ if (temp_shape_y == NULL)
+ {
+ VSILOGE( "malloc temp_shape_y error." );
+ ret = FALSE;
+ goto final;
+ }
+
+ temp_shape_output = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t));
+ if (temp_shape_output == NULL)
+ {
+ VSILOGE( "malloc temp_shape_output error." );
+ ret = FALSE;
+ goto final;
+ }
+ memcpy(temp_shape_x, shape_x, rank * sizeof(vsi_size_t));
+ memcpy(temp_shape_y, multiples, rank * sizeof(vsi_size_t));
+ memcpy(temp_shape_output, shape_output, rank * sizeof(vsi_size_t));
+
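+    /* Pre-pass: fold every run of consecutive input dims whose size is 1
+     * (pure broadcast axes) into a single axis via tile_fill_dim(), so the
+     * state machine below only ever sees one broadcast axis per run. */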
+ for (i = 0, temp_rank = 0; i < rank_output; i++)
+ {
+ if (i == rank_output - 1 && temp_shape_x[i] == 1)
+ {
+ if (idx_start >= 0)
+ {
+ sx = 1;
+ sy = temp_shape_y[idx_start];
+ sz = temp_shape_output[idx_start];
+ idx_end = (int32_t)i ;
+ for (j = (vsi_size_t)idx_start + 1; j <= (vsi_size_t)idx_end; j++)
+ {
+ sy *= temp_shape_y[j];
+ sz *= temp_shape_output[j];
+ }
+ temp_rank += tile_fill_dim( temp_shape_x, temp_shape_y, temp_shape_output,
+ temp_rank, VSI_NN_MAX_DIM_NUM, sx, sy, sz );
+ idx_start = -1;
+ }
+ else
+ {
+ temp_shape_x[temp_rank] = temp_shape_x[i];
+ temp_shape_y[temp_rank] = temp_shape_y[i];
+ temp_shape_output[temp_rank++] = temp_shape_output[i];
+ }
+ }
+ else if (temp_shape_x[i] != 1)
+ {
+ idx_end = (int32_t)i - 1;
+ if (idx_start >= 0)
+ {
+ sx = 1;
+ sy = temp_shape_y[idx_start];
+ sz = temp_shape_output[idx_start];
+ for (j = (vsi_size_t)idx_start + 1; j <= (vsi_size_t)idx_end; j++)
+ {
+ sy *= temp_shape_y[j];
+ sz *= temp_shape_output[j];
+ }
+ temp_rank += tile_fill_dim( temp_shape_x, temp_shape_y, temp_shape_output,
+ temp_rank, VSI_NN_MAX_DIM_NUM, sx, sy, sz );
+ idx_start = -1;
+ }
+ temp_shape_x[temp_rank] = temp_shape_x[i];
+ temp_shape_y[temp_rank] = temp_shape_y[i];
+ temp_shape_output[temp_rank++] = temp_shape_output[i];
+ }
+ else if (idx_start == -1)
+ {
+ idx_start = (int32_t)i;
+ }
+ }
+
+ for( i = 0; i < temp_rank; i++ )
+ {
+ sx = temp_shape_x[i];
+ sy = temp_shape_y[i];
+ sz = temp_shape_output[i];
/*
* Skip dim if the size is equal to 1
* Also skip if ( sx == 1 && sy == 1 )
*/
- if ( shape_output[i] == 1 )
+ if ( temp_shape_output[i] == 1 )
{
continue;
}
@@ -490,8 +599,8 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
VSI_ASSERT( FALSE );
}
- next_state = (i + 1) < rank_output ?
- (multiples[i + 1] == 1 ? TILE_STATE_NO_AXIS : TILE_STATE_AXIS_X) : TILE_STATE_EMPTY;
+ next_state = (i + 1) < temp_rank ?
+ (temp_shape_y[i + 1] == 1 ? TILE_STATE_NO_AXIS : TILE_STATE_AXIS_X) : TILE_STATE_EMPTY;
append_dim = FALSE;
#define _pack_state( cur_state, next_state ) (next_state << 16 | cur_state)
@@ -507,9 +616,13 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
* ...,x1,x2,...
* ...,y1,y2,...
*/
+ case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_EMPTY ):
+ effective_size_x = sx;
+ effective_size_y = sy;
+ effective_size_z = sz;
+ break;
case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_AXIS_X ):
case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_NO_AXIS ):
- case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_EMPTY ):
append_dim = TRUE;
break;
/*
@@ -548,7 +661,7 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
if ( ret )
{
/* Append the last dim */
- if ( i == rank_output )
+ if ( i == temp_rank )
{
sx = effective_size_x;
sy = effective_size_y;
@@ -573,6 +686,23 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape
*out_rank_output = (uint32_t)dims;
}
#undef _swap_size
+final:
+ if (temp_shape_x)
+ {
+ free( temp_shape_x);
+ temp_shape_x = NULL;
+ }
+ if (temp_shape_y)
+ {
+ free( temp_shape_y);
+ temp_shape_y = NULL;
+ }
+ if (temp_shape_output)
+ {
+ free( temp_shape_output);
+ temp_shape_output = NULL;
+ }
+
return ret;
} /* vsi_nn_kernel_optimize_eltwise_shape() */
@@ -612,7 +742,7 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape
return TRUE;
}
-static vsi_bool vsi_nn_kernel_optimize_element_shape_with_max_rank
+vsi_bool vsi_nn_kernel_optimize_element_shape_with_max_rank
(
const vsi_size_t* shape_x, const vsi_size_t rank_x,
vsi_size_t* out_shape_x, vsi_size_t* out_rank_x, vsi_size_t max_rank
@@ -755,3 +885,415 @@ vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape
return ret;
} /* vsi_nn_kernel_optimize_scatter_elements_shape() */
+
+
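+/* Collapse the batch/broadcast dims of a batched matmul.  A rough summary of
+ * the cases handled below (cross_flg is an output flag):
+ *   cross_flg == 0 : plain broadcast - dims beyond the first two are merged
+ *                    via vsi_nn_kernel_optimize_broadcast_shape().
+ *   cross_flg == 1 : "cross" case - a batch axis shared by both inputs is
+ *                    described by size_axis_inner_outer = {axis, inner, outer}
+ *                    and by three {axis, inner, outer} stride triples in
+ *                    strides_axis_inner_outer (presumably input0 / input1 /
+ *                    output, in that order).
+ *   cross_flg == 2 : "merge" case - the outer batch dims of each tensor are
+ *                    folded into a single dim.
+ * new_rank_out[0..2] returns the resulting ranks of x, y and the output. */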
+vsi_bool vsi_nn_kernel_optimize_matrixmul_broadcast_shape
+ (
+ const vsi_size_t * shape_x,
+ const vsi_size_t * shape_y,
+ const vsi_size_t * shape_output,
+ vsi_size_t dim_x,
+ vsi_size_t dim_y,
+ vsi_size_t dim_out,
+ vsi_size_t* out_shape_x,
+ vsi_size_t* out_shape_y,
+ vsi_size_t* out_shape_output,
+ uint32_t* new_rank_out,
+ uint32_t* cross_flg,
+ uint32_t* size_axis_inner_outer,
+ uint32_t* strides_axis_inner_outer
+ )
+{
+ vsi_bool ret = FALSE;
+ vsi_size_t rank_in[2] = {0, 0};
+ vsi_size_t rank_out = 0;
+ vsi_size_t shapes_in_broadcast_part[2][VSI_NN_MAX_DIM_NUM] = {{1}};
+ vsi_size_t* shapes_in_broadcast_part_ptr[2] = {NULL, NULL};
+ vsi_size_t shapes_out_broadcast_part[VSI_NN_MAX_DIM_NUM] = {1};
+ vsi_size_t out_shape_in[2][VSI_NN_MAX_DIM_NUM] = {{1}};
+ vsi_size_t* out_shape_in_ptr[2] = {NULL, NULL};
+    vsi_size_t out_shape_broadcast_output[VSI_NN_MAX_DIM_NUM] = {1};
+ uint32_t new_rank = 0;
+ uint32_t i = 0;
+ vsi_size_t outer0 = 1;
+ vsi_size_t outer1 = 1;
+ vsi_size_t outer2 = 1;
+ vsi_size_t axis_size = 0;
+ vsi_size_t inner_size = 1;
+ vsi_size_t outer_size = 1;
+ vsi_size_t axis_size0 = 1;
+ vsi_size_t axis_size1 = 1;
+ vsi_size_t axis_size2 = 1;
+ vsi_size_t inner_size0 = 0;
+ vsi_size_t inner_size1 = 0;
+ vsi_size_t inner_size2 = 0;
+ vsi_size_t outer_size0 = 0;
+ vsi_size_t outer_size1 = 0;
+ vsi_size_t outer_size2 = 0;
+ uint32_t ne_flg = 0;
+ uint32_t axis = 0;
+ uint32_t outer_flg = 0;
+ uint32_t outer_axis = 0;
+ uint32_t first_flg = 0;
+ cross_flg[0] = 0;
+
+ if (dim_x > 2 && dim_y > 2)
+ {
+ for (i = 2; i < dim_x; i++)
+ {
+ outer0 *= shape_x[i];
+ }
+ for (i = 2; i < dim_y; i++)
+ {
+ outer1 *= shape_y[i];
+ }
+ for (i = 2; i < dim_out; i++)
+ {
+ outer2 *= shape_output[i];
+ }
+
+ for (i = 2; i < vsi_nn_min(dim_x, dim_y); i++)
+ {
+ if (shape_x[i] != shape_y[i] && first_flg == 0)
+ {
+ if (shape_x[i] == 1)
+ {
+ ne_flg = 1;
+ inner_size = shape_y[i];
+ }
+ else
+ {
+ ne_flg = 2;
+ inner_size = shape_x[i];
+ }
+ first_flg = 1;
+ continue;
+ }
+ else if (ne_flg == 1 && shape_x[i] != shape_y[i] && shape_x[i] == 1 && first_flg == 1)
+ {
+ inner_size *= shape_y[i];
+ }
+ else if (ne_flg == 2 && shape_x[i] != shape_y[i] && shape_y[i] == 1 && first_flg == 1)
+ {
+ inner_size *= shape_x[i];
+ }
+ else if (ne_flg == 1 && shape_x[i] != shape_y[i] && shape_x[i] != 1 && first_flg == 1)
+ {
+ outer_flg = 1;
+ outer_axis = i;
+ break;
+ }
+ else if (ne_flg == 2 && shape_x[i] != shape_y[i] && shape_y[i] != 1 && first_flg == 1)
+ {
+ outer_flg = 2;
+ outer_axis = i;
+ break;
+ }
+ else if (i > 2 && shape_x[i] == shape_y[i] && shape_y[i] != 1 && first_flg == 1)
+ {
+ first_flg = 2;
+ }
+ else if (shape_x[i] != shape_y[i] && shape_x[i] != 1 && first_flg == 2)
+ {
+ outer_flg = 1;
+ outer_axis = i;
+ break;
+ }
+ else if (shape_x[i] != shape_y[i] && shape_y[i] != 1 && first_flg == 2)
+ {
+ outer_flg = 2;
+ outer_axis = i;
+ break;
+ }
+ else if (i == 2 && shape_x[i] == shape_y[i] && shape_y[i] != 1)
+ {
+ /*axis = 2;
+ axis_size = shape_x[i];*/
+ }
+ }
+
+ if (ne_flg > 0 && outer0 > 1 && outer1 > 1)
+ {
+ for (i = 2; i < vsi_nn_min(dim_x, dim_y); i++)
+ {
+ if (shape_x[i] == shape_y[i] && shape_x[i] != 1)
+ {
+ cross_flg[0] = 1;
+ axis = i;
+ axis_size = shape_x[i];
+ break;
+ }
+ }
+ }
+
+ if (cross_flg[0] == 1) // cross
+ {
+ if (outer_flg == 1)
+ {
+ for (i = outer_axis; i < dim_x; i++)
+ {
+ outer_size *= shape_x[i];
+ }
+ }
+ else if (outer_flg == 2)
+ {
+ for (i = outer_axis; i < dim_y; i++)
+ {
+ outer_size *= shape_y[i];
+ }
+ }
+ else
+ {
+ outer_size = 1;
+ }
+
+ axis_size0 = 1;
+ axis_size1 = 1;
+ axis_size2 = 1;
+ if (axis > 2 && ne_flg == 1)
+ {
+ axis_size1 = inner_size;
+ axis_size2 = inner_size;
+ }
+ else if (axis > 2 && ne_flg == 2)
+ {
+ axis_size0 = inner_size;
+ axis_size2 = inner_size;
+ }
+
+ inner_size0 = 0;
+ inner_size1 = 0;
+ inner_size2 = 1;
+ if (axis == 2 && ne_flg == 1)
+ {
+ inner_size1 = axis_size;
+ inner_size2 = axis_size;
+ }
+ else if (axis > 2 && ne_flg == 1)
+ {
+ inner_size1 = 1;
+ }
+ else if (axis == 2 && ne_flg == 2)
+ {
+ inner_size0 = axis_size;
+ inner_size2 = axis_size;
+ }
+ else if (axis > 2 && ne_flg == 2)
+ {
+ inner_size0 = 1;
+ }
+
+ outer_size0 = 0;
+ outer_size1 = 0;
+ outer_size2 = axis_size * inner_size;
+ if (outer_flg == 1)
+ {
+ outer_size0 = axis_size0 * axis_size;
+ }
+ else if (outer_flg == 2)
+ {
+ outer_size1 = axis_size1 * axis_size;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ out_shape_x[i] = shape_x[i];
+ out_shape_y[i] = shape_y[i];
+ out_shape_output[i] = shape_output[i];
+ }
+ out_shape_x[2] = outer0;
+ out_shape_x[3] = 1;
+ out_shape_y[2] = outer1;
+ out_shape_output[2] = outer2;
+ new_rank_out[0] = 4;
+ new_rank_out[1] = 3;
+ new_rank_out[2] = 3;
+
+ size_axis_inner_outer[0] = (uint32_t)axis_size;
+ size_axis_inner_outer[1] = (uint32_t)inner_size;
+ size_axis_inner_outer[2] = (uint32_t)outer_size;
+
+ strides_axis_inner_outer[0] = (uint32_t)axis_size0;
+ strides_axis_inner_outer[1] = (uint32_t)inner_size0;
+ strides_axis_inner_outer[2] = (uint32_t)outer_size0;
+
+ strides_axis_inner_outer[3] = (uint32_t)axis_size1;
+ strides_axis_inner_outer[4] = (uint32_t)inner_size1;
+ strides_axis_inner_outer[5] = (uint32_t)outer_size1;
+
+ strides_axis_inner_outer[6] = (uint32_t)axis_size2;
+ strides_axis_inner_outer[7] = (uint32_t)inner_size2;
+ strides_axis_inner_outer[8] = (uint32_t)outer_size2;
+
+ return TRUE;
+ }
+ else if (outer0 > 1 && outer1 > 1 && ne_flg > 0 && cross_flg[0] == 0)
+ {
+ cross_flg[0] = 2;
+ }
+ }
+
+ if (cross_flg[0] == 2) // merge
+ {
+ for (i = 0; i < 2; i++)
+ {
+ out_shape_x[i] = shape_x[i];
+ out_shape_y[i] = shape_y[i];
+ out_shape_output[i] = shape_output[i];
+ }
+ out_shape_output[2] = outer2;
+ new_rank_out[2] = 3;
+ if (ne_flg == 1)
+ {
+ out_shape_x[2] = outer0;
+ out_shape_x[3] = 1;
+ out_shape_y[2] = outer1;
+
+ new_rank_out[0] = 4;
+ new_rank_out[1] = 3;
+ }
+ else if (ne_flg == 2)
+ {
+ out_shape_x[2] = outer0;
+ out_shape_y[2] = outer1;
+ out_shape_y[3] = 1;
+
+ new_rank_out[0] = 3;
+ new_rank_out[1] = 4;
+ }
+
+ return TRUE;
+ }
+ else if (dim_x == 1 && dim_y > 1)
+ {
+ out_shape_x[0] = shape_x[0];
+ out_shape_x[1] = 1;
+
+ out_shape_y[0] = shape_y[0];
+ out_shape_y[1] = shape_y[1];
+
+ out_shape_output[0] = shape_output[0];
+ out_shape_output[1] = 1;
+
+ if (dim_y > 2)
+ {
+ shapes_in_broadcast_part[0][0] = 1;
+ rank_in[0] = 1;
+
+            for (i = 2; i < dim_y; i++)
+ {
+ shapes_in_broadcast_part[1][i - 2] = shape_y[i];
+ }
+ rank_in[1] = dim_y - 2;
+
+            for(i = 1; i < dim_out; i++)
+ {
+ shapes_out_broadcast_part[i - 1] = shape_output[i];
+ }
+ rank_out = dim_out - 1;
+ }
+ }
+ else if (dim_y == 1 && dim_x > 1)
+ {
+ out_shape_y[0] = 1;
+ out_shape_y[1] = shape_y[0];
+
+ out_shape_x[0] = shape_x[0];
+ out_shape_x[1] = shape_x[1];
+
+ out_shape_output[0] = 1;
+ out_shape_output[1] = shape_output[0];
+
+ if (dim_x > 2)
+ {
+ shapes_in_broadcast_part[1][0] = 1;
+ rank_in[1] = 1;
+
+            for (i = 2; i < dim_x; i++)
+ {
+ shapes_in_broadcast_part[0][i - 2] = shape_x[i];
+ }
+ rank_in[0] = dim_x - 2;
+
+            for(i = 1; i < dim_out; i++)
+ {
+ shapes_out_broadcast_part[i - 1] = shape_output[i];
+ }
+ rank_out = dim_out - 1;
+ }
+ }
+ else
+ {
+ out_shape_x[0] = shape_x[0];
+ out_shape_x[1] = shape_x[1];
+
+ out_shape_y[0] = shape_y[0];
+ out_shape_y[1] = shape_y[1];
+
+ out_shape_output[0] = shape_output[0];
+ out_shape_output[1] = shape_output[1];
+
+ for (i = 2; i < dim_x; i++)
+ {
+ shapes_in_broadcast_part[0][i - 2] = shape_x[i];
+ }
+ for (i = 2; i < dim_y; i++)
+ {
+ shapes_in_broadcast_part[1][i - 2] = shape_y[i];
+ }
+ for (i = 2; i < dim_out; i++)
+ {
+ shapes_out_broadcast_part[i - 2] = shape_output[i];
+ }
+ rank_in[0] = dim_x - 2;
+ rank_in[1] = dim_y - 2;
+ rank_out = dim_out - 2;
+
+ }
+
+ shapes_in_broadcast_part_ptr[0] = shapes_in_broadcast_part[0];
+ shapes_in_broadcast_part_ptr[1] = shapes_in_broadcast_part[1];
+ out_shape_in_ptr[0] = out_shape_in[0];
+ out_shape_in_ptr[1] = out_shape_in[1];
+
+ ret = vsi_nn_kernel_optimize_broadcast_shape(
+ (const vsi_size_t **)shapes_in_broadcast_part_ptr, rank_in, 2,
+ shapes_out_broadcast_part, rank_out,
+        (vsi_size_t **)out_shape_in_ptr, out_shape_broadcast_output, &new_rank);
+
+ if (ret)
+ {
+ int32_t j = 0;
+
+ new_rank_out[0] = new_rank + 2;
+ new_rank_out[1] = new_rank + 2;
+ new_rank_out[2] = new_rank + 2;
+
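+        /* Drop trailing size-1 dims from the reported ranks so each reshaped
+         * tensor is created with its minimal rank. */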
+ j = new_rank - 1;
+        while (j >= 0 && out_shape_in[0][j] == 1) {
+ new_rank_out[0]--;
+ j--;
+ }
+
+ j = new_rank - 1;
+        while (j >= 0 && out_shape_in[1][j] == 1) {
+ new_rank_out[1]--;
+ j--;
+ }
+
+ j = new_rank - 1;
+        while (j >= 0 && out_shape_broadcast_output[j] == 1) {
+ new_rank_out[2]--;
+ j--;
+ }
+
+ for (i = 0; i < new_rank; i++)
+ {
+ out_shape_x[i + 2] = out_shape_in[0][i];
+ out_shape_y[i + 2] = out_shape_in[1][i];
+            out_shape_output[i + 2] = out_shape_broadcast_output[i];
+ }
+ }
+
+ return ret;
+}
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c
index c5b640c55..426dacf16 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c
@@ -62,13 +62,13 @@ typedef struct
} _param_type;
#define CHECK_PARAM_NULL( ptr, rval, ... ) \
- do { \
+ { \
if( ptr == NULL ) { \
VSILOGE(__VA_ARGS__); \
VSI_ASSERT(FALSE); \
return rval; \
} \
- } while(0)
+ }
#define _PARAM_ADD_TEMPLATE(TYPE_NAME, TYPE, PARAM_DTYPE) \
vsi_bool vsi_nn_kernel_param_add_##TYPE_NAME \
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c
index 7b0c6ca67..6c6dda92c 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c
@@ -68,6 +68,12 @@ KERNEL_SELECTOR( depthwise_conv1d )
{ VSI_NN_KERNEL_TYPE_CL, 3 },
{ VSI_NN_KERNEL_TYPE_CPU, 2 },
};
+
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+
dilation = dilation == 0 ? 0 : dilation - 1;
real_kernel = (kernel - 1) * dilation + kernel;
@@ -101,6 +107,12 @@ static vsi_status _select
{ VSI_NN_KERNEL_TYPE_CL, 1 },
{ VSI_NN_KERNEL_TYPE_CPU, 0 },
};
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
return vsi_nn_kernel_pirority_set( selector, pirority, _cnt_of_array(pirority) );
} /* _select */
@@ -141,5 +153,8 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(atan)
REGISTER_VX_FIRST_KERNEL_SELECTOR(atanh)
REGISTER_VX_FIRST_KERNEL_SELECTOR(acosh)
REGISTER_VX_FIRST_KERNEL_SELECTOR(inverse_sigmoid)
+#if (VX_TENSOR_SELECT_VX_SUPPORT)
+REGISTER_VX_FIRST_KERNEL_SELECTOR(select)
+#endif
__END_DECLS
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c
index a1680edbf..55a61001a 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c
@@ -30,6 +30,7 @@
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
+#include "vsi_nn_error.h"
typedef enum
{
@@ -73,6 +74,11 @@ vsi_status vsi_nn_kernel_copy_tensor_veiw_patch
vx_trensor_addressing addr = NULL;
vx_size dim_sizes[VSI_NN_MAX_DIM_NUM], strides[VSI_NN_MAX_DIM_NUM];
addr = (vx_trensor_addressing)malloc(sizeof(vx_tensorpatch_addressing_t));
+ if ( NULL == addr )
+ {
+        VSILOGE("Malloc tensor addressing fail");
+ return status;
+ }
addr->num_of_dims = (vx_uint32)attr->shape->size;
for (i = 0; i < dim; i++)
@@ -138,6 +144,7 @@ vsi_status vsi_nn_kernel_copy_tensor_veiw_patch
}
}
#endif
+
return status;
} /* vsi_nn_kernel_copy_tensor_veiw_patch() */
@@ -153,6 +160,9 @@ vsi_status vsi_nn_kernel_copy_tensor_patch
vsi_size_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM];
vsi_status status = VSI_FAILURE;
uint32_t i;
+
+ VSI_UNREFERENCED(buffer_size);
+
if (NULL == tensor || NULL == user_ptr)
{
VSILOGE("Invalid parameter");
@@ -377,10 +387,12 @@ vsi_status vsi_nn_kernel_tensor_write_from_float
vsi_size_t sz = 0;
sz = vsi_nn_kernel_tensor_attr_get_size( attr );
internal_buffer0 = malloc( sz );
+ CHECK_PTR_FAIL_GOTO( internal_buffer0, "Create buffer fail.", final );
}
else
{
internal_buffer0 = malloc( bytes );
+ CHECK_PTR_FAIL_GOTO( internal_buffer0, "Create buffer fail.", final );
internal_buffer = internal_buffer0;
}
@@ -422,6 +434,7 @@ vsi_status vsi_nn_kernel_tensor_write_from_float
if ( attr->dtype == I4 || attr->dtype == U4 )
{
internal_buffer = malloc( bytes );
+ CHECK_PTR_FAIL_GOTO( internal_buffer, "Create buffer fail.", final );
status = vsi_nn_kernel_pack_4bit_data(attr, (uint8_t*)internal_buffer0, (uint8_t*)internal_buffer);
}
}
@@ -442,7 +455,7 @@ vsi_status vsi_nn_kernel_tensor_write_from_float
{
vsi_nn_kernel_tensor_attr_release( &internal_attr );
}
- if ( attr->dtype == I4 || attr->dtype == U4 )
+ if ( attr && (attr->dtype == I4 || attr->dtype == U4) )
{
vsi_nn_safe_free(internal_buffer0);
}
@@ -562,6 +575,8 @@ static void _convert_tensor_attr_to_vx_tensor_param
MAP_TYPE( p->data_format, F64, VSI_NN_TYPE_FLOAT64 );
MAP_TYPE( p->data_format, BF16, VSI_NN_TYPE_BFLOAT16 );
MAP_TYPE( p->data_format, BOOL8, VSI_NN_TYPE_BOOL8 );
+ MAP_TYPE( p->data_format, FP8_E4M3, VSI_NN_TYPE_FLOAT8_E4M3 );
+ MAP_TYPE( p->data_format, FP8_E5M2, VSI_NN_TYPE_FLOAT8_E5M2 );
default:
VSI_ASSERT( FALSE );
break;
@@ -577,6 +592,12 @@ static void _convert_tensor_attr_to_vx_tensor_param
MAP_TYPE( p->quant_format,
VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL,
VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC );
+ MAP_TYPE(p->quant_format,
+ VSI_NN_KERNEL_QUANT_FLOAT8,
+ VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8);
+ MAP_TYPE(p->quant_format,
+ VSI_NN_KERNEL_QUANT_FLOAT8_PERCHANNEL,
+ VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8);
default:
VSI_ASSERT( FALSE );
break;
@@ -615,11 +636,11 @@ vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_create
//convert attr->shape->data to correct data type
for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
{
- size_vxsize[i] = -1 == attr->shape->data[i] ? -1 : (vx_size)attr->shape->data[i];
+ size_vxsize[i] = (vsi_size_t)-1 == attr->shape->data[i] ? (vx_size)-1 : (vx_size)attr->shape->data[i];
}
for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
{
- size_u32[i] = -1 == attr->shape->data[i] ? -1 : (vx_uint32)attr->shape->data[i];
+ size_u32[i] = (vsi_size_t)-1 == attr->shape->data[i] ? (vx_uint32)-1 : (vx_uint32)attr->shape->data[i];
}
#ifdef VSI_40BIT_VA_SUPPORT
params.sizes = size_vxsize;
@@ -672,6 +693,8 @@ vsi_nn_tensor_t* vsi_nn_pad_tensor
vsi_nn_dtype_t dst_type;
vsi_nn_tensor_t *output = NULL;
+ VSI_UNREFERENCED(mode);
+
input_data_ptr = vsi_nn_ConvertTensorToFloat32Data(graph, input);
CHECK_PTR_FAIL_GOTO( input_data_ptr, "Create data ptr fail.", final );
@@ -764,6 +787,7 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias
uint32_t i, j;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
weight_data = vsi_nn_ConvertTensorToData(graph, weight);
+ CHECK_PTR_FAIL_GOTO( weight_data, "Create buffer fail.", final );
if (bias == NULL)
{
@@ -787,9 +811,11 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias
attr.dim_num = 2;
}
bias_data = (int32_t *)vsi_nn_ConvertTensorToData(graph, bias);
+        CHECK_PTR_FAIL_GOTO( bias_data, "Create buffer fail.", final );
}
new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t));
+ CHECK_PTR_FAIL_GOTO( new_bias_data_ptr, "Create buffer fail.", final );
memset((void *)new_bias_data_ptr, 0, sizeof(int32_t) * attr.size[0]);
if (input->attr.dtype.zero_point != 0)
@@ -815,6 +841,7 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias
new_bias = vsi_nn_CreateTensorFromData(graph, (uint8_t *)new_bias_data_ptr, &attr);
+final:
vsi_nn_safe_free( new_bias_data_ptr );
vsi_nn_safe_free( bias_data );
vsi_nn_safe_free( weight_data );
diff --git a/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c b/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c
index 6756e3a16..a40bd81ba 100644
--- a/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c
@@ -29,6 +29,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
+#include "vsi_nn_error.h"
#define REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
@@ -62,6 +63,11 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c )
vsi_nn_tensor_t * a_times_b = NULL;
vsi_nn_tensor_attr_t attr;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
scale = 1.0;
overflow_policy = VX_CONVERT_POLICY_SATURATE;
rounding_policy = VX_ROUND_POLICY_TO_ZERO;
@@ -70,7 +76,7 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c )
if(!scale_s)
{
VSILOGE("CreateScalar fail\n");
- goto OnError;
+ goto final;
}
memset(&attr, 0, sizeof(attr));
@@ -79,6 +85,7 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c )
attr.vtl = TRUE;
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;
a_times_b = vsi_nn_CreateTensor(graph, &attr);
+ CHECK_PTR_FAIL_GOTO( a_times_b, "Create tensor fail.", final );
node = vxTensorMultiplyNode( graph->g,
inputs[0]->t, inputs[1]->t,
@@ -89,7 +96,7 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c )
if( NULL == node )
{
VSILOGE("Call vxTensorMultiplyNode fail.(a_times_b_plus_c)");
- goto OnError;
+ goto final;
}
node = vxTensorAddNode( graph->g, a_times_b->t, inputs[2]->t,
@@ -97,10 +104,10 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c )
if( NULL == node )
{
VSILOGE("Call vxTensorAddNode fail.(a_times_b_plus_c)");
- goto OnError;
+ goto final;
}
-OnError:
+final:
if (scale_s) vxReleaseScalar(&scale_s);
if (a_times_b) vsi_nn_ReleaseTensor(&a_times_b);
diff --git a/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c
index 955c61d2c..5fd98c2a9 100644
--- a/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c
@@ -48,6 +48,10 @@ static vsi_nn_kernel_node_t _setup
vx_node node = NULL;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
node = vxBatchNormalizationLayer(
graph->g,
eps,
diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c
index 2f9be4903..d77719477 100644
--- a/src/tim/vx/internal/src/kernel/vx/convolutional.c
+++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c
@@ -293,6 +293,14 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d )
vx_tensor temp_tensors[3] = { NULL };
uint32_t i = 0;
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(kernel);
+
_build_vx_conv2d_param(
&vxparam,
1, vsi_nn_kernel_param_get_int32(params, "stride"),
@@ -310,7 +318,9 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d )
temp_tensors[0] = _expand_tensor_dim( inputs[0]->t,
(vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 );
CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final );
- if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
+ if (inputs[1]->attr.dtype.qnt_type !=
+ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC &&
+ inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8)
{
temp_tensors[1] = _expand_tensor_dim( inputs[1]->t,
(vsi_ssize_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 1 );
@@ -369,6 +379,14 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
uint32_t i = 0;
vsi_bool need_explicit_padding = FALSE;
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(kernel);
+
_build_vx_conv2d_param(
&vxparam,
1, vsi_nn_kernel_param_get_int32(params, "stride"),
@@ -387,7 +405,9 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
(vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 );
CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final );
- if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
+ if (inputs[1]->attr.dtype.qnt_type !=
+ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC &&
+ inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8)
{
vsi_size_t new_w_shape[VSI_NN_MAX_DIM_NUM] = { 0 };
uint32_t new_w_rank = 4;
@@ -486,6 +506,14 @@ REGISTER_CONV_OPENVX_KERNEL( conv2d )
vx_node node = NULL;
vx_nn_convolution_params_ext2_t vxparam;
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(kernel);
+
_build_vx_conv2d_param(
&vxparam,
vsi_nn_kernel_param_get_int32(params, "stride_h"),
@@ -518,6 +546,14 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv2d )
vx_node node = NULL;
vx_nn_convolution_params_ext2_t vxparam;
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(kernel);
+
_build_vx_conv2d_param(
&vxparam,
vsi_nn_kernel_param_get_int32(params, "stride_h"),
@@ -552,6 +588,14 @@ REGISTER_CONV_OPENVX_KERNEL( deconvolution1d )
vx_tensor temp_tensors[2] = { NULL };
int i;
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(kernel);
+
_build_vx_deconv2d_param(
&vxparam,
1, vsi_nn_kernel_param_get_int32(params, "stride"),
@@ -595,6 +639,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d )
vx_node node = NULL;
#if VX_CONV_3D_API_SUPPORT
vx_nn_convolution_3d_params_t vxparam;
+
memset(&vxparam, 0, sizeof(vxparam));
_build_vx_conv3d_param(
@@ -625,14 +670,23 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d )
outputs[0]->t
);
#endif
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(kernel);
return (vsi_nn_kernel_node_t)node;
} /* depthwise_conv2d*/
REGISTER_CONV_OPENVX_KERNEL( deconv3d )
{
vx_node node = NULL;
+
#if VX_DECONV_3D_API_SUPPORT
vx_nn_deconvolution_3d_params_t vxparam;
+
memset(&vxparam, 0, sizeof(vxparam));
_build_vx_deconv3d_param(
@@ -648,7 +702,7 @@ REGISTER_CONV_OPENVX_KERNEL( deconv3d )
vsi_nn_kernel_param_get_int32(params, "pad_right"),
vsi_nn_kernel_param_get_int32(params, "outpadding_w"),
vsi_nn_kernel_param_get_int32(params, "outpadding_h"),
- vsi_nn_kernel_param_get_int32(params, "outpadding_w"),
+ vsi_nn_kernel_param_get_int32(params, "outpadding_d"),
vsi_nn_kernel_param_get_int32(params, "group"),
vsi_nn_kernel_param_get_int32(params, "overflow_policy"),
vsi_nn_kernel_param_get_int32(params, "rounding_policy"),
@@ -662,7 +716,14 @@ REGISTER_CONV_OPENVX_KERNEL( deconv3d )
outputs[0]->t
);
#endif
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(kernel);
return (vsi_nn_kernel_node_t)node;
} /* deconv3d */
-#undef REGISTER_CONV_OPENVX_KERNEL
\ No newline at end of file
+#undef REGISTER_CONV_OPENVX_KERNEL
diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c
index 9e299da26..09514d316 100644
--- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c
@@ -106,6 +106,10 @@ static vsi_nn_kernel_node_t _setup
goto final;
}
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
final:
if (lut1)
{
@@ -120,6 +124,14 @@ static vsi_nn_kernel_node_t _setup
return (vsi_nn_kernel_node_t)node;
#else
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(input_num);
+ VSI_UNREFERENCED(outputs);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(lut_type);
return NULL;
#endif
} /* _setup() */
@@ -190,6 +202,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( abs )
vx_tensor input = NULL, input0 = NULL;
vx_tensor output = NULL, output0 = NULL;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
if (inputs[0]->attr.dim_num > 4)
{
input_size[0] = vsi_nn_GetElementNum(inputs[0]) /
@@ -231,6 +248,10 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( linear )
float a_v = vsi_nn_kernel_param_get_float32( params, "a_v" );
float b_v = vsi_nn_kernel_param_get_float32( params, "b_v" );
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
node = vxActivationLayer(
graph->g,
inputs[0]->t,
@@ -247,6 +268,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sigmoid )
{
vx_node node = NULL;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
node = vxActivationLayer(
graph->g,
inputs[0]->t,
@@ -265,6 +291,10 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( tanh )
float scale_a = vsi_nn_kernel_param_get_float32( params, "scale_a" );
float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b" );
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
node = vxActivationLayer(
graph->g,
inputs[0]->t,
@@ -281,6 +311,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu1 )
{
vx_node node = NULL;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
node = vxActivationLayer(
graph->g,
inputs[0]->t,
@@ -297,6 +332,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu6 )
{
vx_node node = NULL;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
node = vxActivationLayer(
graph->g,
inputs[0]->t,
@@ -313,6 +353,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( rsqrt )
{
vx_node node = NULL;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
node = vxActivationLayer(
graph->g,
inputs[0]->t,
@@ -329,6 +374,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sqrt )
{
vx_node node = NULL;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
node = vxActivationLayer(
graph->g,
inputs[0]->t,
@@ -345,6 +395,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( softrelu )
{
vx_node node = NULL;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
node = vxActivationLayer(
graph->g,
inputs[0]->t,
diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c
index 3c9947d40..d81a55563 100644
--- a/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c
@@ -57,6 +57,12 @@ REGISTER_ELTWISE_OPENVX_KERNEL( add )
{
vx_node node = vxTensorAddNode( graph->g, inputs[0]->t, inputs[1]->t,
VX_CONVERT_POLICY_SATURATE, outputs[0]->t );
+
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
return (vsi_nn_kernel_node_t)node;
} /* add() */
@@ -65,6 +71,11 @@ REGISTER_ELTWISE_OPENVX_KERNEL( sub )
vx_node node = vxTensorSubtractNode( graph->g, inputs[0]->t, inputs[1]->t,
VX_CONVERT_POLICY_SATURATE, outputs[0]->t );
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
return (vsi_nn_kernel_node_t)node;
} /* sub() */
@@ -75,6 +86,10 @@ REGISTER_ELTWISE_OPENVX_KERNEL( div )
vx_scalar scale_s = NULL;
vx_node node = NULL;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
scale = vsi_nn_kernel_param_get_float32(params, "scale");
overflow_policy = vsi_nn_kernel_param_get_int32(params, "overflow_policy");
rounding_policy = vsi_nn_kernel_param_get_int32(params, "rounding_policy");
@@ -105,6 +120,10 @@ REGISTER_ELTWISE_OPENVX_KERNEL( mul )
vx_scalar scale_s = NULL;
vx_node node = NULL;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
scale = vsi_nn_kernel_param_get_float32(params, "scale");
overflow_policy = vsi_nn_kernel_param_get_int32(params, "overflow_policy");
rounding_policy = vsi_nn_kernel_param_get_int32(params, "rounding_policy");
diff --git a/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c b/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c
index 5133dabc4..af68dd210 100644
--- a/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c
@@ -116,6 +116,10 @@ REGISTER_L2_NORMALIZE_OPENVX_KERNEL( l2_norm )
if (vx_output) vxReleaseTensor(&vx_output);
#endif
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
if( NULL == node )
{
VSILOGE("Call vxSoftmaxLayer2 fail.(softmax)");
diff --git a/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c b/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c
index 3f5bfa1f4..5279543dc 100644
--- a/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c
@@ -63,6 +63,10 @@ REGISTER_BATCH_GEMM_OPENVX_KERNEL( matrixmul )
vx_scalar trans_a = vxCreateScalar(graph->ctx->c, VX_TYPE_BOOL, &transposeA);
vx_scalar trans_b = vxCreateScalar(graph->ctx->c, VX_TYPE_BOOL, &transposeB);
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
node = vxBatchGemmNode(graph->g,
inputs[0]->t,
inputs[1]->t,
diff --git a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c
index a458e3800..c9a2c845c 100644
--- a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c
@@ -30,6 +30,7 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
+#include "vsi_nn_error.h"
#define REGISTER_PAD2_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
@@ -68,6 +69,10 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 )
vsi_bool release_intermediate_tensor = TRUE;
float const_val = vsi_nn_kernel_param_get_float32(params, "const_val");
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
 memset(&param, 0, sizeof(param));
memset(pad_front_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM);
memset(pad_back_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM);
@@ -90,6 +95,7 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 )
attr.is_const = FALSE;
convert_tensor = vsi_nn_CreateTensor(graph, &attr);
+ CHECK_PTR_FAIL_GOTO( convert_tensor, "Create tensor fail.", final );
node = vxTensorCopyNode(
graph->g,
@@ -105,6 +111,7 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 )
 node = vxTensorPadNode( graph->g, convert_tensor->t, outputs[0]->t, &param, sizeof(param) );
+final:
 vxReleaseScalar( &param.pad_const );
if (release_intermediate_tensor)
diff --git a/src/tim/vx/internal/src/kernel/vx/prelu_vx.c b/src/tim/vx/internal/src/kernel/vx/prelu_vx.c
index 4728ad651..ebf381256 100644
--- a/src/tim/vx/internal/src/kernel/vx/prelu_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/prelu_vx.c
@@ -95,6 +95,10 @@ REGISTER_PRELU_OPENVX_KERNEL( prelu )
vx_node node = NULL;
int32_t is_per_channel_alpha = 0;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha");
if (!is_per_channel_alpha)
diff --git a/src/tim/vx/internal/src/kernel/vx/resize_vx.c b/src/tim/vx/internal/src/kernel/vx/resize_vx.c
index 3b2b16778..fdea91a43 100644
--- a/src/tim/vx/internal/src/kernel/vx/resize_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/resize_vx.c
@@ -121,6 +121,9 @@ static vsi_nn_kernel_node_t _setup
sizeof(param),
outputs[0]->t );
#endif
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
if ( NULL == node )
{
VSILOGI("Call vxTensorScaleNode fail.(resize)");
diff --git a/src/tim/vx/internal/src/kernel/vx/select_vx.c b/src/tim/vx/internal/src/kernel/vx/select_vx.c
new file mode 100644
index 000000000..d50a99504
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/vx/select_vx.c
@@ -0,0 +1,86 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+#if (VX_TENSOR_SELECT_VX_SUPPORT)
+
+#define REGISTER_SELECT_OPENVX_KERNEL( kernel_name ) \
+ static vsi_nn_kernel_node_t _##kernel_name##setup \
+ ( \
+ vsi_nn_graph_t * graph, \
+ vsi_nn_tensor_t ** inputs, \
+ size_t input_num, \
+ vsi_nn_tensor_t ** outputs, \
+ size_t output_num,\
+ const vsi_nn_kernel_param_t * params, \
+ vsi_nn_kernel_t * kernel \
+ ); \
+ REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
+ static vsi_nn_kernel_node_t _##kernel_name##setup \
+ ( \
+ vsi_nn_graph_t * graph, \
+ vsi_nn_tensor_t ** inputs, \
+ size_t input_num, \
+ vsi_nn_tensor_t ** outputs, \
+ size_t output_num,\
+ const vsi_nn_kernel_param_t * params, \
+ vsi_nn_kernel_t * kernel \
+ )
+
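+/* Thin wrapper: forwards the select op to vxTensorSelectLayer.  input_list can
+ * hold up to three tensors (presumably condition, then-value, else-value). */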
+REGISTER_SELECT_OPENVX_KERNEL( select )
+{
+ vx_node node = NULL;
+ vx_tensor input_list[3] = {NULL};
+ uint32_t i = 0;
+ uint32_t input_count = (uint32_t)input_num;
+
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+
+ for ( i = 0; i < input_count; i++ )
+ {
+ input_list[i] = inputs[i]->t;
+ }
+
+ node = vxTensorSelectLayer(
+ graph->g,
+ input_list,
+ input_count,
+ outputs[0]->t
+ );
+
+ return (vsi_nn_kernel_node_t)node;
+} /* select() */
+
+#undef REGISTER_SELECT_OPENVX_KERNEL
+
+#endif
diff --git a/src/tim/vx/internal/src/kernel/vx/softmax_vx.c b/src/tim/vx/internal/src/kernel/vx/softmax_vx.c
index f097fbbb9..1d1d445e5 100644
--- a/src/tim/vx/internal/src/kernel/vx/softmax_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/softmax_vx.c
@@ -59,10 +59,12 @@ REGISTER_SOFTMAX_OPENVX_KERNEL( softmax )
vx_node node = NULL;
float beta = vsi_nn_kernel_param_get_float32(params, "beta");
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
+#if !VX_STREAM_PROCESSOR_SUPPORT
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
uint32_t rank_in = 0;
- int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
int32_t new_axis = 0;
+#endif
+ int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis");
size_t size = sizeof(vx_nn_softmax_params_t);
#ifdef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT
vx_nn_softmax_params_ext_t paramExt;
@@ -78,6 +80,17 @@ REGISTER_SOFTMAX_OPENVX_KERNEL( softmax )
base.beta = beta;
#endif
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
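+    /* On stream-processor builds, hand the original tensors straight to
+     * vxSoftmaxLayer2; otherwise run the shape optimization below and call
+     * the layer on the reshaped tensors. */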
+#if VX_STREAM_PROCESSOR_SUPPORT
+ node = vxSoftmaxLayer2( graph->g,
+ inputs[0]->t,
+ param,
+ size,
+ outputs[0]->t);
+#else
vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rank_in, &new_axis);
@@ -108,13 +121,14 @@ REGISTER_SOFTMAX_OPENVX_KERNEL( softmax )
param,
size,
reshape_tensors[1]->t);
+#endif
if( NULL == node )
{
VSILOGE("Call vxSoftmaxLayer2 fail.(softmax)");
}
- vsi_nn_ReleaseTensor( &reshape_tensors[0] );
- vsi_nn_ReleaseTensor( &reshape_tensors[1] );
+ vsi_safe_release_tensor( reshape_tensors[0] );
+ vsi_safe_release_tensor( reshape_tensors[1] );
return (vsi_nn_kernel_node_t)node;
} /* softmax() */
diff --git a/src/tim/vx/internal/src/kernel/vx/square_vx.c b/src/tim/vx/internal/src/kernel/vx/square_vx.c
index 5ae1499da..778557331 100644
--- a/src/tim/vx/internal/src/kernel/vx/square_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/square_vx.c
@@ -46,6 +46,11 @@ static vsi_nn_kernel_node_t _setup
{
vx_node node = NULL;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(params);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
node = vxActivationLayer(
graph->g,
inputs[0]->t,
diff --git a/src/tim/vx/internal/src/kernel/vx/swish_vx.c b/src/tim/vx/internal/src/kernel/vx/swish_vx.c
index 7557d9b11..9b458c62d 100644
--- a/src/tim/vx/internal/src/kernel/vx/swish_vx.c
+++ b/src/tim/vx/internal/src/kernel/vx/swish_vx.c
@@ -62,6 +62,10 @@ REGISTER_SWISH_OPENVX_KERNEL( swish )
vx_enum function = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SWISH;
float beta = 1.0f;
+ VSI_UNREFERENCED(kernel);
+ VSI_UNREFERENCED(output_num);
+ VSI_UNREFERENCED(input_num);
+
if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver)
{
swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type");
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl
index 49d04e2d4..755c809e3 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl
@@ -15,6 +15,8 @@ __kernel void gather_U8toU8(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+
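+    // Negative indices count back from the end of the axis (e.g. -1 -> axis_num - 1).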
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
uint4 data = read_imageui(input0, coord_in.zw);
@@ -40,6 +42,8 @@ __kernel void gather_F16toF16(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
float4 data = read_imagef(input0, coord_in.zw);
@@ -65,6 +69,8 @@ __kernel void gather_I32toI32(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
int4 data = read_imagei(input0, coord_in.zw);
@@ -90,6 +96,8 @@ __kernel void gather_F32toF32(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
float4 data = read_imagef(input0, coord_in.zw);
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl
index 15a466443..574dd6b3f 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl
@@ -15,6 +15,7 @@ __kernel void gather_array_U8toU8(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
Image img1 = create_image_from_image2d(input0, 1);
@@ -43,6 +44,7 @@ __kernel void gather_array_F16toF16(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
Image img1 = create_image_from_image2d(input0, 2);
@@ -71,6 +73,7 @@ __kernel void gather_array_I32toI32(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
Image img1 = create_image_from_image2d(input0, 4);
@@ -99,6 +102,7 @@ __kernel void gather_array_F32toF32(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
Image img1 = create_image_from_image2d(input0, 4);
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl
index 4ff6ec158..bfc88d0ed 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl
@@ -20,6 +20,7 @@ __kernel void gather_batch_U8toU8(
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.y = gidz * axis_num + indice.x;
uint4 data = read_imageui(input0, coord_in);
@@ -51,6 +52,7 @@ __kernel void gather_batch_F16toF16(
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.y = gidz * axis_num + indice.x;
float4 data = read_imagef(input0, coord_in);
@@ -82,6 +84,7 @@ __kernel void gather_batch_I32toI32(
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.y = gidz * axis_num + indice.x;
int4 data = read_imagei(input0, coord_in);
@@ -113,6 +116,7 @@ __kernel void gather_batch_F32toF32(
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.y = gidz * axis_num + indice.x;
float4 data = read_imagef(input0, coord_in);
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl
index 323f69417..58403f9a3 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl
@@ -1,3 +1,11 @@
+#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
+
+_viv_uniform uint width0;
+_viv_uniform uint height0;
+_viv_uniform uint width1;
+_viv_uniform uint height1;
+_viv_uniform uint width_out;
+_viv_uniform uint height_out;
#define GATHER_ELEMENTS_AXIS0_2D(name, data_type, read_func, write_func, conv_func) \
__kernel void gather_elements_axis0_##name##_I32to##name##_2D \
@@ -133,3 +141,159 @@ __kernel void gather_elements_axis2_##name##_I32to##name \
GATHER_ELEMENTS_AXIS2(F32, float4, read_imagef, write_imagef, convert_float4)
GATHER_ELEMENTS_AXIS2(I32, int4, read_imagei, write_imagei, convert_int4_rte)
GATHER_ELEMENTS_AXIS2(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)
+
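+/* The *_beyond_maxwidth_* kernels below read and write through raw pointers
+ * (create_tensor_from_image2d_array / create_image_from_image2d), computing
+ * flat offsets from the width/height uniforms declared at the top of this
+ * file, presumably so tensors wider than the image-read limit can still be
+ * addressed. */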
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \
+__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ float input_scale, \
+ float input_tail, \
+ int axis_size \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \
+ int* index_ptr = (int*)index_tensor.ptr; \
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \
+ \
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \
+ data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \
+ \
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \
+}
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F32, float, float*, 4)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I32, int, int*, 4)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1)
+
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \
+__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ float input_scale, \
+ float input_tail, \
+ int axis_size \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \
+ int* index_ptr = (int*)index_tensor.ptr; \
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \
+ \
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \
+ data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \
+ \
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \
+}
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F32, float, float*, 4)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I32, int, int*, 4)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1)
+
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \
+__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ float input_scale, \
+ float input_tail, \
+ int axis_size \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \
+ int* index_ptr = (int*)index_tensor.ptr; \
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \
+ \
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \
+ data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \
+ \
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \
+}
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F32, float, float*, 4)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I32, int, int*, 4)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1)
+
+
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \
+__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \
+ ( \
+ __read_only image2d_t input0, \
+ __read_only image2d_t input1, \
+ __write_only image2d_t output, \
+ float input_scale, \
+ float input_tail, \
+ int axis_size \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ Image index_img = create_image_from_image2d(input1, 4); \
+ int* index_ptr = (int*)index_img.ptr; \
+ int index = index_ptr[coord.x + coord.y * width1]; \
+ \
+ Image input_img = create_image_from_image2d(input0, stride); \
+ data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \
+ data_type data = input_ptr[index + coord.y * width0]; \
+ \
+ Image output_img = create_image_from_image2d(output, stride); \
+ data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \
+ output_ptr[coord.x + coord.y * width_out] = data; \
+}
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F32, float, float*, 4)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I32, int, int*, 4)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, uchar, uchar*, 1)
+
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \
+__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \
+ ( \
+ __read_only image2d_t input0, \
+ __read_only image2d_t input1, \
+ __write_only image2d_t output, \
+ float input_scale, \
+ float input_tail, \
+ int axis_size \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ Image index_img = create_image_from_image2d(input1, 4); \
+ int* index_ptr = (int*)index_img.ptr; \
+ int index = index_ptr[coord.x + coord.y * width1]; \
+ \
+ Image input_img = create_image_from_image2d(input0, stride); \
+ data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \
+ data_type data = input_ptr[coord.x + index * width0]; \
+ \
+ Image output_img = create_image_from_image2d(output, stride); \
+ data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \
+ output_ptr[coord.x + coord.y * width_out] = data; \
+}
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F32, float, float*, 4)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I32, int, int*, 4)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1)
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl
index 02e430922..1cf59759f 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl
@@ -1,124 +1,133 @@
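+/* Batched gather_nd: the batch index now lives on the z dimension, so the
+ * index tensor (input1) and the output become image2d_array_t and each
+ * work-item reads its index components at (0..coord_dim-1, gidy, gidz). */
+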
__kernel void gather_nd_batch_U8toU8_1D(
__read_only image2d_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch_num
+ int gidy = get_global_id(1); // index_num
+ int gidz = get_global_id(2); // batch_num
- int4 coord = (int4)(gidx, gidy, 0, 0);
- int4 indice = read_imagei(input1, coord.wy);
- coord.z = indice.x * block_size + gidx;
+ int4 coord = (int4)(gidx, gidy, gidz, 0);
+ int4 indice = read_imagei(input1, coord.wyzw);
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
- uint4 data = read_imageui(input0, coord.zy);
- write_imageui(output, coord.xy, data);
+ uint4 data = read_imageui(input0, coord0);
+ write_imageui(output, coord, data);
}
__kernel void gather_nd_batch_F16toF16_1D(
__read_only image2d_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch_num
+ int gidy = get_global_id(1); // index_num
+ int gidz = get_global_id(2); // batch_num
- int4 coord = (int4)(gidx, gidy, 0, 0);
- int4 indice = read_imagei(input1, coord.wy);
- coord.z = indice.x * block_size + gidx;
+ int4 coord = (int4)(gidx, gidy, gidz, 0);
+ int4 indice = read_imagei(input1, coord.wyzw);
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
- float4 data = read_imagef(input0, coord.zy);
- write_imagef(output, coord.xy, data);
+ float4 data = read_imagef(input0, coord0);
+ write_imagef(output, coord, data);
}
__kernel void gather_nd_batch_I8toI8_1D(
__read_only image2d_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch_num
+ int gidy = get_global_id(1); // index_num
+ int gidz = get_global_id(2); // batch_num
- int4 coord = (int4)(gidx, gidy, 0, 0);
- int4 indice = read_imagei(input1, coord.wy);
- coord.z = indice.x * block_size + gidx;
+ int4 coord = (int4)(gidx, gidy, gidz, 0);
+ int4 indice = read_imagei(input1, coord.wyzw);
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
- int4 data = read_imagei(input0, coord.zy);
- write_imagei(output, coord.xy, data);
+ int4 data = read_imagei(input0, coord0);
+ write_imagei(output, coord, data);
}
//2D
__kernel void gather_nd_batch_U8toU8_2D(
__read_only image2d_array_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch_num
+ int gidy = get_global_id(1); // index_num
+ int gidz = get_global_id(2); // batch_num
- int4 coord = (int4)(0, gidy, gidx, 1);
- int4 indice = read_imagei(input1, coord.xy);
- int4 indice1 = read_imagei(input1, coord.wy);
+ int4 coord = (int4)(1, gidy, gidz, 0);
+ int4 indice = read_imagei(input1, coord.wyzw);
+ int4 indice1 = read_imagei(input1, coord.xyzw);
indice.x = indice.x * block_size + gidx;
indice.y = indice1.x;
- indice.zw = coord.yx;
+ indice.zw = coord.zw;
uint4 data = read_imageui(input0, indice);
- write_imageui(output, coord.zy, data);
+ coord.x = gidx;
+ write_imageui(output, coord, data);
}
__kernel void gather_nd_batch_F16toF16_2D(
__read_only image2d_array_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch_num
+ int gidy = get_global_id(1); // index_num
+ int gidz = get_global_id(2); // batch_num
- int4 coord = (int4)(0, gidy, gidx, 1);
- int4 indice = read_imagei(input1, coord.xy);
- int4 indice1 = read_imagei(input1, coord.wy);
+ int4 coord = (int4)(1, gidy, gidz, 0);
+ int4 indice = read_imagei(input1, coord.wyzw);
+ int4 indice1 = read_imagei(input1, coord.xyzw);
indice.x = indice.x * block_size + gidx;
indice.y = indice1.x;
- indice.zw = coord.yx;
+ indice.zw = coord.zw;
float4 data = read_imagef(input0, indice);
- write_imagef(output, coord.zy, data);
+ coord.x = gidx;
+ write_imagef(output, coord, data);
}
__kernel void gather_nd_batch_I8toI8_2D(
__read_only image2d_array_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch_num
+ int gidy = get_global_id(1); // index_num
+ int gidz = get_global_id(2); // batch_num
- int4 coord = (int4)(0, gidy, gidx, 1);
- int4 indice = read_imagei(input1, coord.xy);
- int4 indice1 = read_imagei(input1, coord.wy);
+ int4 coord = (int4)(1, gidy, gidz, 0);
+ int4 indice = read_imagei(input1, coord.wyzw);
+ int4 indice1 = read_imagei(input1, coord.xyzw);
indice.x = indice.x * block_size + gidx;
indice.y = indice1.x;
- indice.zw = coord.yx;
+ indice.zw = coord.zw;
int4 data = read_imagei(input0, indice);
- write_imagei(output, coord.zy, data);
+ coord.x = gidx;
+ write_imagei(output, coord, data);
}
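
The reworked gather_nd batch kernels add an explicit batch axis: get_global_id(1) now walks the index rows and get_global_id(2) the batches, with the indices and output promoted to image arrays. A rough C model of the 1-D case, under assumed flat layouts and the illustrative name gather_nd_batch_1d_ref:

#include <stddef.h>

/* For each batch b and index row j, copy block_size elements starting at
 * indices[b][j] * block_size from that batch's input row. */
static void gather_nd_batch_1d_ref(const unsigned char *in0,  /* [batch][in_width]              */
                                   const int *indices,        /* [batch][index_num]             */
                                   unsigned char *out,        /* [batch][index_num][block_size] */
                                   size_t in_width, size_t index_num,
                                   size_t block_size, size_t batch)
{
    for (size_t b = 0; b < batch; ++b) {
        for (size_t j = 0; j < index_num; ++j) {
            size_t src = (size_t)indices[b * index_num + j] * block_size;
            for (size_t x = 0; x < block_size; ++x)
                out[(b * index_num + j) * block_size + x] = in0[b * in_width + src + x];
        }
    }
}
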
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_cross.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_cross.cl
new file mode 100644
index 000000000..e36f10353
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_cross.cl
@@ -0,0 +1,95 @@
+__kernel void gemm_F32F32toF32_merge(
+ __read_only image2d_array_t inputA,
+ __read_only image2d_array_t inputB,
+ __write_only image2d_array_t output,
+ int M,
+ int K,
+ int N,
+ int ac2zero,
+ int bc2zero,
+ float scale_a,
+ float zp_a,
+ float scale_b,
+ float zp_b,
+ float scale_out,
+ float zp_out,
+ int outer)
+{
+ for(int i = 0; i < outer; i++)
+ {
+ int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0);
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0);
+
+ float4 sum = (float4)(0);
+
+ for(; coord_a.x < K;)
+ {
+ float4 tempA0;
+ float4 tempB0;
+
+ tempA0 = read_imagef(inputA, coord_a);
+ tempB0 = read_imagef(inputB, coord_b);
+ coord_a.x++;
+ coord_b.y++;
+
+ sum = sum + tempA0 * tempB0;
+ }
+
+ coord_b.y = get_global_id(1);
+ coord_b.z = get_global_id(2) + i * get_global_size(2);
+ write_imagef(output, coord_b, sum);
+ }
+}
+
+#define GEMM_MERGE(name, dst_type, read_image_type, convert_type, write_image_type) \
+__kernel void gemm_##name##_merge( \
+ __read_only image2d_array_t inputA, \
+ __read_only image2d_array_t inputB, \
+ __write_only image2d_array_t output, \
+ int M, \
+ int K, \
+ int N, \
+ int ac2zero, \
+ int bc2zero, \
+ float scale_a, \
+ float zp_a, \
+ float scale_b, \
+ float zp_b, \
+ float scale_out, \
+ float zp_out, \
+ int outer) \
+{ \
+ for(int i = 0; i < outer; i++) \
+ { \
+ int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); \
+ float4 sum = (float4)(0); \
+ dst_type dst; \
+ \
+ for(; coord_a.x < K;) \
+ { \
+ float4 tempA0; \
+ float4 tempB0; \
+ \
+ tempA0 = convert_float4(read_image_type(inputA, coord_a)); \
+ tempB0 = convert_float4(read_image_type(inputB, coord_b)); \
+ tempA0.x = (tempA0.x - zp_a) * scale_a; \
+ tempB0.x = (tempB0.x - zp_b) * scale_b; \
+ \
+ coord_a.x++; \
+ coord_b.y++; \
+ \
+ sum = sum + tempA0 * tempB0; \
+ } \
+ sum.x = sum.x * scale_out + zp_out; \
+ dst = convert_type(sum); \
+ \
+ coord_b.y = get_global_id(1); \
+ coord_b.z = get_global_id(2) + i * get_global_size(2); \
+ write_image_type(output, coord_b, dst); \
+ } \
+}
+GEMM_MERGE(I8I8toI8,int4,read_imagei,convert_int4,write_imagei);
+GEMM_MERGE(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui);
+GEMM_MERGE(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);
+
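The *_merge kernels run `outer` back-to-back GEMMs over a single work-grid, broadcasting A (ac2zero) or B (bc2zero) across that loop and stacking the results along z (coord_b.z = gid2 + i * global_size(2)). Per element, and ignoring vectorization, the quantized math reduces to the sketch below; the row-major layouts and the name gemm_merge_element are assumptions for illustration, not part of the kernel:

/* Dequantize A and B with their (scale, zero-point) pairs, accumulate
 * over K, then requantize, as in the scalar lanes of gemm_*_merge. */
static float gemm_merge_element(const unsigned char *A, const unsigned char *B,
                                int K, int N,
                                float scale_a, float zp_a,
                                float scale_b, float zp_b,
                                float scale_out, float zp_out,
                                int m, int n)
{
    float sum = 0.0f;
    for (int k = 0; k < K; ++k) {
        float a = ((float)A[m * K + k] - zp_a) * scale_a;
        float b = ((float)B[k * N + n] - zp_b) * scale_b;
        sum += a * b;
    }
    return sum * scale_out + zp_out;   /* caller rounds/clamps to the output type */
}
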
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/nearest_grid_sample.cl b/src/tim/vx/internal/src/libnnext/ops/cl/nearest_grid_sample.cl
new file mode 100644
index 000000000..e427fe414
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/nearest_grid_sample.cl
@@ -0,0 +1,77 @@
+__kernel void nearest_grid_sample_F32_F32toF32(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ float half_input0_w,
+ float half_input0_h,
+ float add_float_value_w,
+ float add_float_value_h,
+ int depth
+ )
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));
+
+ float fx = read_imagef(input1, coord_in1).x;
+ coord_in1.x = coord_in1.x + 1;
+ float fy = read_imagef(input1, coord_in1).x;
+
+ fx = fx * half_input0_w + add_float_value_w;
+ fy = fy * half_input0_h + add_float_value_h;
+ int x_index = convert_int(fx);
+ int y_index = convert_int(fy);
+ int4 coord_in = (int4)(x_index, y_index, 0, 0);
+
+ float4 dst;
+
+ while (coord_in.z < depth){
+ dst = read_imagef(input0, coord_in);
+ write_imagef(output, coord_out, dst);
+ coord_in.z++;
+ coord_out.z++;
+ }
+}
+
+
+__kernel void nearest_grid_sample_U8_U8toU8(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ float half_input0_w,
+ float half_input0_h,
+ float add_float_value_w,
+ float add_float_value_h,
+ int depth,
+ float in0_scale,
+ float in0_tail,
+ float in1_scale,
+ float in1_tail,
+ float out_scale,
+ float out_tail
+ )
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));
+
+ float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;
+ coord_in1.x = coord_in1.x + 1;
+ float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;
+
+ fx = fx * half_input0_w + add_float_value_w;
+ fy = fy * half_input0_h + add_float_value_h;
+ int x_index = convert_int(fx);
+ int y_index = convert_int(fy);
+ int4 coord_in = (int4)(x_index, y_index, 0, 0);
+
+ float4 val;
+ uint4 dst;
+
+ while (coord_in.z < depth){
+ val = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
+ dst = convert_uint4_rte(val * out_scale + out_tail);
+ write_imageui(output, coord_out, dst);
+ coord_in.z++;
+ coord_out.z++;
+ }
+
+}
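
Both grid-sample kernels read an (x, y) pair from the grid, map it into input pixel space with the precomputed half-size and offset terms (which are expected to fold in the +0.5 rounding and any align-corners adjustment done on the host), truncate to integers, and then copy that column through every depth slice. The index math, as a hedged C outline:

/* Map one normalized grid coordinate to integer input indices, the way
 * nearest_grid_sample_* does before its depth loop. half_w/half_h and the
 * add_* offsets are assumed to be precomputed on the host. */
static void nearest_grid_sample_index(float gx, float gy,
                                      float half_w, float half_h,
                                      float add_w, float add_h,
                                      int *x_index, int *y_index)
{
    float fx = gx * half_w + add_w;
    float fy = gy * half_h + add_h;
    *x_index = (int)fx;   /* convert_int in OpenCL truncates toward zero */
    *y_index = (int)fy;
}
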
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_bilinear.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_bilinear.cl
new file mode 100644
index 000000000..f835db5e5
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_bilinear.cl
@@ -0,0 +1,161 @@
+#pragma OPENCL EXTENSION CL_VIV_asm : enable
+
+#define RESIZE_3D(in_name, out_name, read_image_type, dst_type, convert_type, write_image_type) \
+__kernel void resize_3d_bilinear_##in_name##to##out_name( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ float scale_x, \
+ float scale_y, \
+ float scale_z, \
+ float half_pixel_value, \
+ uint in_width, \
+ uint in_height, \
+ uint in_depth, \
+ float in_scale, \
+ float in_tail, \
+ float out_scale, \
+ float out_tail \
+ ) \
+{ \
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
+ float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; \
+ float left_x_f = fmax(floor(in_x), 0); \
+ float x_lerp = in_x - left_x_f; \
+ int left_x_idx = convert_int(left_x_f); \
+ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; \
+ float top_y_f = fmax(floor(in_y), 0); \
+ float y_lerp = in_y - top_y_f; \
+ int top_y_idx = convert_int(top_y_f); \
+ float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value; \
+ float front_z_f = fmax(floor(in_z), 0); \
+ float z_lerp = in_z - front_z_f; \
+ int front_z_idx = convert_int(front_z_f); \
+ int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0); \
+ float4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111; \
+ dst_type dst; \
+ \
+ int dx, dy, dz; \
+ dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0); \
+ dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0); \
+ dz = in_z < 0 ? 0 : (front_z_idx < in_depth - 1 ? 1 : 0); \
+ \
+ data_000 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \
+ coord_in.y = coord_in.y + dy; \
+ data_010 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \
+ coord_in.x = coord_in.x + dx; \
+ data_110 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \
+ coord_in.y = coord_in.y - dy; \
+ data_100 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \
+ coord_in.z = coord_in.z + dz; \
+ data_101 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \
+ coord_in.y = coord_in.y + dy; \
+ data_111 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \
+ coord_in.x = coord_in.x - dx; \
+ data_011 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \
+ coord_in.y = coord_in.y - dy; \
+ data_001 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \
+ \
+ data_000 = data_000 + (data_100 - data_000) * x_lerp; \
+ data_010 = data_010 + (data_110 - data_010) * x_lerp; \
+ data_000 = data_000 + (data_010 - data_000) * y_lerp; \
+ \
+ data_001 = data_001 + (data_101 - data_001) * x_lerp; \
+ data_011 = data_011 + (data_111 - data_011) * x_lerp; \
+ data_001 = data_001 + (data_011 - data_001) * y_lerp; \
+ data_000 = data_000 + (data_001 - data_000) * z_lerp; \
+ \
+ dst = convert_type(data_000 * out_scale + out_tail); \
+ \
+ write_image_type(output, coord_out, dst); \
+}
+RESIZE_3D(F32, F32, read_imagef, float4, convert_float4, write_imagef)
+RESIZE_3D(F32, U8, read_imagef, uint4, convert_uint4, write_imageui)
+RESIZE_3D(U8, F32, read_imageui, float4, convert_float4, write_imagef)
+RESIZE_3D(U8, U8, read_imageui, uint4, convert_uint4, write_imageui)
+RESIZE_3D(I8, I8, read_imagei, int4, convert_int4, write_imagei)
+
+__kernel void resize_3d_bilinear_BF16toBF16(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float scale_x,
+ float scale_y,
+ float scale_z,
+ float half_pixel_value,
+ uint in_width,
+ uint in_height,
+ uint in_depth,
+ float in_scale,
+ float in_tail,
+ float out_scale,
+ float out_tail
+ )
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
+ float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;
+ float left_x_f = fmax(floor(in_x), 0);
+ float x_lerp = in_x - left_x_f;
+ int left_x_idx = convert_int(left_x_f);
+ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value;
+ float top_y_f = fmax(floor(in_y), 0);
+ float y_lerp = in_y - top_y_f;
+ int top_y_idx = convert_int(top_y_f);
+ float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value;
+ float front_z_f = fmax(floor(in_z), 0);
+ float z_lerp = in_z - front_z_f;
+ int front_z_idx = convert_int(front_z_f);
+ int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0);
+ uint4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111;
+ float4 data_000_f, data_100_f, data_010_f, data_110_f, data_001_f, data_011_f, data_101_f, data_111_f;
+ uint4 dst;
+
+ int dx, dy, dz;
+ dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0);
+ dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0);
+ dz = in_z < 0 ? 0 : (front_z_idx < in_depth - 1 ? 1 : 0);
+
+ data_000 = read_imageui(input, coord_in);
+ data_000 = data_000 << 16;
+ coord_in.y = coord_in.y + dy;
+ data_010 = read_imageui(input, coord_in);
+ data_010 = data_010 << 16;
+ coord_in.x = coord_in.x + dx;
+ data_110 = read_imageui(input, coord_in);
+ data_110 = data_110 << 16;
+ coord_in.y = coord_in.y - dy;
+ data_100 = read_imageui(input, coord_in);
+ data_100 = data_100 << 16;
+ coord_in.z = coord_in.z + dz;
+ data_101 = read_imageui(input, coord_in);
+ data_101 = data_101 << 16;
+ coord_in.y = coord_in.y + dy;
+ data_111 = read_imageui(input, coord_in);
+ data_111 = data_111 << 16;
+ coord_in.x = coord_in.x - dx;
+ data_011 = read_imageui(input, coord_in);
+ data_011 = data_011 << 16;
+ coord_in.y = coord_in.y - dy;
+ data_001 = read_imageui(input, coord_in);
+ data_001 = data_001 << 16;
+
+ _viv_asm(COPY, data_000_f, data_000, 16);
+ _viv_asm(COPY, data_010_f, data_010, 16);
+ _viv_asm(COPY, data_110_f, data_110, 16);
+ _viv_asm(COPY, data_100_f, data_100, 16);
+ _viv_asm(COPY, data_101_f, data_101, 16);
+ _viv_asm(COPY, data_111_f, data_111, 16);
+ _viv_asm(COPY, data_011_f, data_011, 16);
+ _viv_asm(COPY, data_001_f, data_001, 16);
+
+ data_000_f = data_000_f + (data_100_f - data_000_f) * x_lerp;
+ data_010_f = data_010_f + (data_110_f - data_010_f) * x_lerp;
+ data_000_f = data_000_f + (data_010_f - data_000_f) * y_lerp;
+
+ data_001_f = data_001_f + (data_101_f - data_001_f) * x_lerp;
+ data_011_f = data_011_f + (data_111_f - data_011_f) * x_lerp;
+ data_001_f = data_001_f + (data_011_f - data_001_f) * y_lerp;
+ data_000_f = data_000_f + (data_001_f - data_000_f) * z_lerp;
+
+ _viv_asm(COPY, dst, data_000_f, 16);
+ dst = dst >> 16;
+ write_imageui(output, coord_out, dst);
+}
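
RESIZE_3D maps each output coordinate back with in = (out + half_pixel_value) * scale - half_pixel_value, clamps the low neighbour at zero, fetches the eight surrounding voxels, and blends them. The blend itself is ordinary trilinear interpolation; a small C sketch of just that step (lerp/trilerp are illustrative names):

static float lerp(float a, float b, float t) { return a + (b - a) * t; }

/* Bilinear in x/y on the front and back z slices, then a final lerp in z,
 * in the same order as the macro's arithmetic. */
static float trilerp(float d000, float d100, float d010, float d110,
                     float d001, float d101, float d011, float d111,
                     float x_lerp, float y_lerp, float z_lerp)
{
    float front = lerp(lerp(d000, d100, x_lerp), lerp(d010, d110, x_lerp), y_lerp);
    float back  = lerp(lerp(d001, d101, x_lerp), lerp(d011, d111, x_lerp), y_lerp);
    return lerp(front, back, z_lerp);
}

The BF16 variant performs the same arithmetic after widening each 16-bit value into the high half of a float (a << 16 plus a bit-copy) and narrows the result back with >> 16.
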
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_nearest.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_nearest.cl
new file mode 100644
index 000000000..220acd351
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_nearest.cl
@@ -0,0 +1,119 @@
+
+#define NEAREST_INDEX_PROCESS() \
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
+ float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x + round_value; \
+ int in_x_idx = convert_int(in_x); \
+ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y + round_value; \
+ int in_y_idx = convert_int(in_y); \
+ float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z + round_value; \
+ int in_z_idx = convert_int(in_z); \
+
+__kernel void resize_3d_nearest_F32toF32(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float scale_x,
+ float scale_y,
+ float scale_z,
+ float half_pixel_value,
+ float round_value,
+ float output_scale,
+ float output_tail)
+{
+ NEAREST_INDEX_PROCESS()
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);
+ float4 dst;
+ dst = read_imagef(input, coord_in);
+ write_imagef(output, coord_out, dst);
+}
+
+
+__kernel void resize_3d_nearest_U8toU8(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float scale_x,
+ float scale_y,
+ float scale_z,
+ float half_pixel_value,
+ float round_value,
+ float output_scale,
+ float output_tail)
+{
+ NEAREST_INDEX_PROCESS()
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);
+ uint4 dst;
+ dst = convert_uint4(convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail);
+ write_imageui(output, coord_out, dst);
+}
+
+__kernel void resize_3d_nearest_U8toF32(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float scale_x,
+ float scale_y,
+ float scale_z,
+ float half_pixel_value,
+ float round_value,
+ float output_scale,
+ float output_tail)
+{
+ NEAREST_INDEX_PROCESS()
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);
+ float4 dst;
+ dst = convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail;
+ write_imagef(output, coord_out, dst);
+}
+
+__kernel void resize_3d_nearest_F32toU8(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float scale_x,
+ float scale_y,
+ float scale_z,
+ float half_pixel_value,
+ float round_value,
+ float output_scale,
+ float output_tail)
+{
+ NEAREST_INDEX_PROCESS()
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);
+ uint4 dst;
+ dst = convert_uint4(read_imagef(input, coord_in) * output_scale + output_tail);
+ write_imageui(output, coord_out, dst);
+}
+
+__kernel void resize_3d_nearest_I8toI8(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float scale_x,
+ float scale_y,
+ float scale_z,
+ float half_pixel_value,
+ float round_value,
+ float output_scale,
+ float output_tail)
+{
+ NEAREST_INDEX_PROCESS()
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);
+ int4 dst;
+ dst = convert_int4(convert_float4(read_imagei(input, coord_in)) * output_scale);
+ write_imagei(output, coord_out, dst);
+}
+
+__kernel void resize_3d_nearest_BF16toBF16(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float scale_x,
+ float scale_y,
+ float scale_z,
+ float half_pixel_value,
+ float round_value,
+ float output_scale,
+ float output_tail)
+{
+ NEAREST_INDEX_PROCESS()
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);
+ uint4 dst;
+ dst = read_imageui(input, coord_in);
+ write_imageui(output, coord_out, dst);
+}
+
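All nearest variants share NEAREST_INDEX_PROCESS: the source index per axis is (out + half_pixel_value) * scale + round_value, truncated to int, so round_value is expected to carry whatever rounding offset the host chose. As a one-axis C sketch (resize_nearest_index is an illustrative name):

static int resize_nearest_index(int out_coord, float scale,
                                float half_pixel_value, float round_value)
{
    /* convert_int truncates, so round_value supplies the rounding bias */
    return (int)(((float)out_coord + half_pixel_value) * scale + round_value);
}
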
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl b/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl
index 117d6d25e..87a9df7d2 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl
@@ -1,5 +1,5 @@
-#define TILE_3D(name0, name1, data_type, read_image_func, write_image_func) \
+#define TILE_3D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \
__kernel void tile_##name0##to##name1 \
( \
__read_only image2d_array_t input, \
@@ -10,7 +10,9 @@ __kernel void tile_##name0##to##name1 \
int multiples_0, \
int multiples_1, \
int multiples_2, \
- int multiples_3 \
+ int multiples_3, \
+ float inoutscale, \
+ float inouttail \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
@@ -18,7 +20,9 @@ __kernel void tile_##name0##to##name1 \
int width = get_image_width(input); \
int height = get_image_height(input); \
\
- data_type src; \
+ src_type src; \
+ dst_type dst; \
+ \
read_image_func(src, input, coord); \
\
int batch_id = (short)coord.z / (short)depthIn; \
@@ -40,17 +44,19 @@ __kernel void tile_##name0##to##name1 \
for (int x = 0; x < multiples_0; x++) \
{ \
coord_out.x = coord.x + x * width; \
- write_image_func(output, coord_out.xyzw, src); \
+ dst = conv_type(convert_float4(src) * inoutscale + inouttail); \
+ write_image_func(output, coord_out.xyzw, dst); \
} \
} \
} \
} \
}
-TILE_3D(I32, I32, int4, READ_IMAGEI_2DARRAY, write_imagei)
-TILE_3D(U32, U32, uint4, READ_IMAGEUI_2DARRAY, write_imageui)
-TILE_3D(F32, F32, float4, READ_IMAGEF_2DARRAY, write_imagef)
+TILE_3D(I32, I32, int4, int4, convert_int4_rte, READ_IMAGEI_2DARRAY, write_imagei)
+TILE_3D(U32, U32, uint4, uint4, convert_uint4_rte, READ_IMAGEUI_2DARRAY, write_imageui)
+TILE_3D(F32, F32, float4, float4,convert_float4_rte,READ_IMAGEF_2DARRAY, write_imagef)
+TILE_3D(F32, U32, float4, uint4, convert_uint4_rte, READ_IMAGEF_2DARRAY, write_imageui)
-#define TILE_2D(name0, name1, data_type, read_image_func, write_image_func) \
+#define TILE_2D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \
__kernel void tile_##name0##to##name1##_2D \
( \
__read_only image2d_t input, \
@@ -61,7 +67,9 @@ __kernel void tile_##name0##to##name1##_2D \
int multiples_0, \
int multiples_1, \
int multiples_2, \
- int multiples_3 \
+ int multiples_3, \
+ float inoutscale, \
+ float inouttail \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
@@ -70,22 +78,25 @@ __kernel void tile_##name0##to##name1##_2D \
int output_width = get_image_width(output); \
int output_height = get_image_height(output); \
\
- data_type src = read_image_func(input, coord); \
+ src_type src = read_image_func(input, coord); \
+ dst_type dst; \
\
do \
{ \
do \
{ \
- write_image_func(output, coord, src); \
+ dst = conv_type(convert_float4(src) * inoutscale + inouttail); \
+ write_image_func(output, coord, dst); \
coord.x += width; \
} while (coord.x < output_width); \
coord.x = get_global_id(0); \
coord.y += height; \
} while (coord.y < output_height); \
}
-TILE_2D(I32, I32, int4, read_imagei, write_imagei)
-TILE_2D(U32, U32, uint4, read_imageui, write_imageui)
-TILE_2D(F32, F32, float4, read_imagef, write_imagef)
+TILE_2D(I32, I32, int4, int4, convert_int4_rte, read_imagei, write_imagei)
+TILE_2D(U32, U32, uint4, uint4, convert_uint4_rte, read_imageui, write_imageui)
+TILE_2D(F32, F32, float4, float4,convert_float4_rte,read_imagef, write_imagef)
+TILE_2D(F32, U32, float4, uint4, convert_uint4_rte, read_imagef, write_imageui)
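
The tile change threads a single (inoutscale, inouttail) pair through every copy, which is what enables the new F32-to-U32 variants. A compact C model of the 2-D path under assumed row-major buffers (tile_2d_ref is an illustrative name; lrintf stands in for the kernel's convert_*_rte rounding):

#include <math.h>

static void tile_2d_ref(const float *in, int width, int height,
                        unsigned int *out, int out_width, int out_height,
                        float inoutscale, float inouttail)
{
    for (int y = 0; y < out_height; ++y)
        for (int x = 0; x < out_width; ++x) {
            /* every tiled copy is requantized on write */
            float v = in[(x % width) + (y % height) * width];
            out[x + y * out_width] = (unsigned int)lrintf(v * inoutscale + inouttail);
        }
}
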
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis0.vx
new file mode 100644
index 000000000..a20f024a3
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis0.vx
@@ -0,0 +1,191 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
+
+_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;
+_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;
+_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;
+_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;
+_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;
+_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;
+_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;
+_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;
+_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;
+
+_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
+
+_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4;
+_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4;
+_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8;
+_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8;
+
+_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4;
+_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4;
+_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8;
+_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4;
+_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4;
+
+
+_viv_uniform int width;
+_viv_uniform int input_zp;
+_viv_uniform float in_out_scale;
+_viv_uniform float output_zp;
+
+__kernel void cumsum_ex_rev_F16toF16_axis0(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ int axis, int exclusive, int rev
+ )
+{
+ int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
+ int4 coord_out = coord;
+
+ vxc_short8 src, dst;
+ vxc_half8 data, tmpsum, sum;
+ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
+ if(exclusive == 0 && rev)
+ {
+ for(coord.x = width - 8; coord.x >= 0; coord.x -= 8)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ _viv_asm(COPY, data, src, 16);
+
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);
+ VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
+ uniSumHorzRevF16toF16C_2x8);
+ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+ else if(exclusive && rev == 0)
+ {
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ for(; coord.x < width - 8;)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ coord_out.x = coord.x + 1;
+ coord.x += 8;
+ _viv_asm(COPY, data, src, 16);
+
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);
+ VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);
+ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+ else if(exclusive && rev)
+ {
+ coord.x = width - 8;
+ coord_out.x = width - 1;
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ for(; coord.x > 0;)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ coord_out.x = coord.x - 1;
+ coord.x -= 8;
+ _viv_asm(COPY, data, src, 16);
+
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);
+ VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
+ uniSumHorzRevF16toF16C_2x8);
+ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+}
+
+#define CUMSUM_QINT_EX_REV_AXIS0(in_name, out_name, src_type, dst_type) \
+__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ int axis, int exclusive, int rev \
+ ) \
+{ \
+ int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \
+ int4 coord_out = coord; \
+ \
+ src_type src; \
+ dst_type dst; \
+ vxc_short8 rowSum; \
+ int4 sum0 = (int4)(0), sum1 = (int4)(0); \
+ short zp = (short)input_zp; \
+ \
+ if(exclusive == 0 && rev) \
+ { \
+ for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \
+ VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \
+ VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \
+ VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniAccSumHorzRevI16toI32A_4x4); \
+ VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniAccSumHorzRevI16toI32B_4x4); \
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \
+ VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+ else if(exclusive && rev == 0) \
+ { \
+ for(coord.x = -1; coord.x < width - 8;) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ coord_out.x = coord.x + 1; \
+ coord.x += 8; \
+ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \
+ VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \
+ VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \
+ VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniAccSumHorzI16toI32A_4x4); \
+ VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniAccSumHorzI16toI32B_4x4); \
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+ else if(exclusive && rev) \
+ { \
+ for(coord.x = width - 7; coord.x > 0;) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ coord_out.x = coord.x - 1; \
+ coord.x -= 8; \
+ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \
+ VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \
+ VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \
+ VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniAccSumHorzRevI16toI32A_4x4); \
+ VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniAccSumHorzRevI16toI32B_4x4); \
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \
+ VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+}
+CUMSUM_QINT_EX_REV_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)
+CUMSUM_QINT_EX_REV_AXIS0(I8, I8, vxc_char16, vxc_char16)
+CUMSUM_QINT_EX_REV_AXIS0(I16, I16, vxc_short8, vxc_short8)
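
Semantically these axis-0 kernels compute a per-row cumulative sum with optional exclusive and reverse modes: exclusive writes the running sum before adding the current element (so the first output is just the zero point), and reverse walks the row from the far end. In the quantized paths the sum of (x - input_zp) is kept in integers and each partial sum is requantized as sum * in_out_scale + output_zp. A scalar C model that ignores the vectorized horizontal-sum uniforms (cumsum_row_ref is an illustrative name):

#include <math.h>

static void cumsum_row_ref(const unsigned char *in, unsigned char *out, int width,
                           int exclusive, int rev,
                           int input_zp, float in_out_scale, float output_zp)
{
    int sum = 0;
    for (int i = 0; i < width; ++i) {
        int x = rev ? width - 1 - i : i;            /* reverse walks backwards    */
        if (!exclusive)
            sum += in[x] - input_zp;
        out[x] = (unsigned char)lrintf((float)sum * in_out_scale + output_zp);
        if (exclusive)
            sum += in[x] - input_zp;                /* exclusive: add after write */
    }
}
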
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis1.vx
new file mode 100644
index 000000000..631964c5f
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis1.vx
@@ -0,0 +1,255 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
+
+_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
+
+_viv_uniform int height;
+_viv_uniform float in_out_scale;
+_viv_uniform float in_out_zp_scale;
+_viv_uniform float output_zp;
+
+__kernel void cumsum_ex_rev_F16toF16_axis1(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ int axis, int exclusive, int rev)
+{
+ int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);
+
+ vxc_short8 src, dst;
+ vxc_half8 data, sum;
+ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
+ if(exclusive == 0 && rev)
+ {
+ for(coord.y = height - 1; coord.y >= 0; coord.y--)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ _viv_asm(COPY, data, src, 16);
+
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+ else if(exclusive && rev == 0)
+ {
+ dst ^= dst;
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ for(; coord.y < height - 1;)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ coord.y++;
+ _viv_asm(COPY, data, src, 16);
+
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+ else if(exclusive && rev)
+ {
+ dst ^= dst;
+ coord.y = height - 1;
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+
+ for(; coord.y > 0;)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ coord.y--;
+ _viv_asm(COPY, data, src, 16);
+
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+}
+
+#define CUMSUM_8BITS_EX_REV_AXIS1(in_name, out_name, src_type, dst_type) \
+__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ int axis, int exclusive, int rev) \
+{ \
+ int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \
+ \
+ src_type src; \
+ dst_type dst; \
+ int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
+ \
+ if(exclusive == 0 && rev) \
+ { \
+ for(coord.y = height - 1; coord.y >= 0; coord.y--) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
+ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+ else if(exclusive && rev == 0) \
+ { \
+ int tmpAlpha0 = convert_int_rte(output_zp); \
+ int4 tmpVal; \
+ tmpVal.x = tmpAlpha0; \
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ for(; coord.y < height - 1;) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ coord.y++; \
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
+ float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8);\
+ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8);\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+ else if(exclusive && rev) \
+ { \
+ coord.y = height - 1; \
+ int tmpAlpha0 = convert_int_rte(output_zp); \
+ int4 tmpVal; \
+ tmpVal.x = tmpAlpha0; \
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ for(; coord.y > 0;) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
+ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \
+ coord.y--; \
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8);\
+ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8);\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+}
+CUMSUM_8BITS_EX_REV_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)
+CUMSUM_8BITS_EX_REV_AXIS1(I8, I8, vxc_char16, vxc_char16)
+
+__kernel void cumsum_ex_rev_I16toI16_axis1(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ int axis, int exclusive, int rev)
+{
+ int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);
+
+ vxc_short8 src, dst;
+ int4 sum0 = (int4)(0), sum1 = (int4)(0);
+ if(exclusive == 0 && rev)
+ {
+ for(coord.y = height - 1; coord.y >= 0; coord.y--)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
+ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
+ uniConvertInt32toUint8_2x8);
+
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+ else if(exclusive && rev == 0)
+ {
+ int tmpAlpha0 = convert_int_rte(output_zp);
+ int4 tmpVal;
+ tmpVal.x = tmpAlpha0;
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+
+ for(; coord.y < height - 1;)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ coord.y++;
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
+ float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp;
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
+ uniConvertInt32toUint8_2x8);
+
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+ else if(exclusive && rev)
+ {
+ coord.y = height - 1;
+ int tmpAlpha0 = convert_int_rte(output_zp);
+ int4 tmpVal;
+ tmpVal.x = tmpAlpha0;
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+
+ for(; coord.y > 0;)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
+ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;
+ coord.y--;
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),
+ uniConvertInt32toUint8_2x8);
+
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis2.vx
new file mode 100644
index 000000000..e8a8d2790
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis2.vx
@@ -0,0 +1,252 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
+
+_viv_uniform VXC_512Bits uniSetZeroF16_2x8;
+
+_viv_uniform int channel;
+_viv_uniform float in_out_scale;
+_viv_uniform float in_out_zp_scale;
+_viv_uniform float output_zp;
+
+__kernel void cumsum_ex_rev_F16toF16_axis2(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ int axis, int exclusive, int rev)
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+
+ vxc_short8 src, dst;
+ vxc_half8 data, sum;
+ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);
+ if(rev && exclusive == 0)
+ {
+ for(coord.z = channel - 1; coord.z >= 0; coord.z--)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ _viv_asm(COPY, data, src, 16);
+
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+ else if(rev == 0 && exclusive)
+ {
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ for(; coord.z < channel - 1;)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ coord.z++;
+ _viv_asm(COPY, data, src, 16);
+
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+ else if(rev && exclusive)
+ {
+ _viv_asm(COPY, dst, sum, 16);
+ coord.z = channel - 1;
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ for(; coord.z > 0;)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ coord.z--;
+ _viv_asm(COPY, data, src, 16);
+
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);
+ _viv_asm(COPY, dst, sum, 16);
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+}
+
+#define CUMSUM_8BITS_EX_REV_AXIS2(in_name, out_name, src_type, dst_type) \
+__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ int axis, int exclusive, int rev) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
+ \
+ src_type src; \
+ dst_type dst; \
+ int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \
+ \
+ if(rev && exclusive == 0) \
+ { \
+ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
+ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8);\
+ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \
+ uniConvertInt32toUint8_2x8);\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+ else if(exclusive && rev == 0) \
+ { \
+ int tmpAlpha0 = convert_int_rte(output_zp); \
+ int4 tmpVal; \
+ tmpVal.x = tmpAlpha0; \
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ for(; coord.z < channel - 1;) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ coord.z++; \
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
+ float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+ else if(rev && exclusive) \
+ { \
+ coord.z = channel - 1; \
+ int tmpAlpha0 = convert_int_rte(output_zp); \
+ int4 tmpVal; \
+ tmpVal.x = tmpAlpha0; \
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ for(; coord.z > 0;) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \
+ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \
+ coord.z--; \
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+            VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+}
+CUMSUM_8BITS_EX_REV_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)
+CUMSUM_8BITS_EX_REV_AXIS2(I8, I8, vxc_char16, vxc_char16)
+
+__kernel void cumsum_ex_rev_I16toI16_axis2(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ int axis, int exclusive, int rev)
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+
+ vxc_short8 src, dst;
+ int4 sum0 = (int4)(0), sum1 = (int4)(0);
+ if(exclusive == 0 && rev)
+ {
+ for(coord.z = channel - 1; coord.z >= 0; coord.z--)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
+ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
+ uniConvertInt32toUint8_2x8);
+
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+ else if(exclusive && rev == 0)
+ {
+ int tmpAlpha0 = convert_int_rte(output_zp);
+ int4 tmpVal;
+ tmpVal.x = tmpAlpha0;
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ for(; coord.z < channel - 1;)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ coord.z++;
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
+ float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp;
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
+ uniConvertInt32toUint8_2x8);
+
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+ else if(exclusive && rev)
+ {
+ coord.z = channel - 1;
+ int tmpAlpha0 = convert_int_rte(output_zp);
+ int4 tmpVal;
+ tmpVal.x = tmpAlpha0;
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ for(; coord.z > 0;)
+ {
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);
+ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;
+ coord.z--;
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),
+ uniConvertInt32toUint8_2x8);
+
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ }
+ }
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx
index b9f4e1754..60159d98a 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx
@@ -176,3 +176,135 @@ __kernel void cumsum_F16to##out_name##_axis0_2D( \
CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16)
CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8)
CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16)
+
+#define CUMSUM_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type) \
+__kernel void cumsum_ex_rev_F16to##out_name##_axis2( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ int axis, int exclusive, int rev \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \
+ \
+ vxc_short8 src; \
+ dst_type dst; \
+ vxc_half8 data, sum; \
+ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
+ vxc_ushort8 ms0; \
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \
+ if(exclusive == 0 && rev) \
+ { \
+ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, data, src, 16); \
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+ else if(exclusive && rev == 0) \
+ { \
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ for(; coord.z < channel - 1;) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ coord.z++; \
+ _viv_asm(COPY, data, src, 16); \
+ \
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+ else if(exclusive && rev) \
+ { \
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ coord.z = channel - 1; \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ for(; coord.z > 0;) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ coord.z--; \
+ _viv_asm(COPY, data, src, 16); \
+ \
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+}
+CUMSUM_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16)
+CUMSUM_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8)
+CUMSUM_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16)
+
+#define CUMSUM_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type) \
+__kernel void cumsum_ex_rev_F16to##out_name##_axis1( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ int axis, int exclusive, int rev \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \
+ \
+ vxc_short8 src; \
+ dst_type dst; \
+ vxc_half8 data, sum; \
+ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \
+ vxc_ushort8 ms0; \
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \
+ if(exclusive == 0 && rev) \
+ { \
+ for(coord.y = height - 1; coord.y >= 0; coord.y--) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, data, src, 16); \
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+ else if(exclusive && rev == 0) \
+ { \
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ for(; coord.y < height - 1;) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ coord.y++; \
+ _viv_asm(COPY, data, src, 16); \
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+ else if(exclusive && rev) \
+ { \
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ coord.y = height - 1; \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ for(; coord.y > 0;) \
+ { \
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ coord.y--; \
+ _viv_asm(COPY, data, src, 16); \
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+}
+CUMSUM_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16)
+CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8)
+CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16)
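The three branches in the F16-to-quantized variants above cover the reverse inclusive, exclusive forward, and exclusive reverse scans along the chosen axis. A scalar reference of those modes, assuming a 1-D float buffer (illustration only; the kernels process whole image rows with VXC vector ops and requantize the result):

#include <stddef.h>

static void cumsum_1d_ref(const float *in, float *out, size_t n,
                          int exclusive, int rev)
{
    float acc = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        size_t idx = rev ? n - 1 - i : i;   /* walk backwards when rev != 0 */
        if (exclusive) {
            out[idx] = acc;                 /* element excluded from its own prefix sum */
            acc += in[idx];
        } else {
            acc += in[idx];
            out[idx] = acc;
        }
    }
}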
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb.vx
new file mode 100644
index 000000000..2088285dd
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb.vx
@@ -0,0 +1,316 @@
+#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
+
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float4 matrix0;
+_viv_uniform float2 matrix1;
+__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_2D
+(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float _m0,
+ float _m1,
+ float _m2,
+ float _m3,
+ float _m4,
+ float _m5
+)
+{
+ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
+
+ float4 coord_f = convert_float4(coord_in);
+
+ coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
+
+ coord_in.x = floor(coord_f.x) * 3;
+ coord_in.y = floor(coord_f.y);
+ coord_in.z = floor(coord_f.z) * 3;
+ coord_in.w = floor(coord_f.w);
+
+ vxc_uchar16 dst;
+ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ coord_in.x = coord_in.x + 1;
+ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
+ coord_in.x = coord_in.x + 1;
+ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
+
+ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
+ coord_in.z = coord_in.z + 1;
+ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
+ coord_in.z = coord_in.z + 1;
+ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
+
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+}
+
+__kernel void custom_warp_affine_bilinear_U8toU8_rgb_2D
+(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float _m0,
+ float _m1,
+ float _m2,
+ float _m3,
+ float _m4,
+ float _m5
+)
+{
+ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
+
+ float4 coord_f = convert_float4(coord_in);
+
+ coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
+
+ coord_in.x = floor(coord_f.x) * 3;
+ coord_in.y = floor(coord_f.y);
+ coord_in.z = floor(coord_f.z) * 3;
+ coord_in.w = floor(coord_f.w);
+
+ vxc_uchar16 src0, src1, src_0, src_1, dst;
+ VXC_ReadImage(src_0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src_1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ src0.x = src_0.s0;
+ src0.y = src_0.s3;
+ src1.x = src_1.s0;
+ src1.y = src_1.s3;
+
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ src0.x = src_0.s1;
+ src0.y = src_0.s4;
+ src1.x = src_1.s1;
+ src1.y = src_1.s4;
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ src0.x = src_0.s2;
+ src0.y = src_0.s5;
+ src1.x = src_1.s2;
+ src1.y = src_1.s5;
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ VXC_ReadImage(src_0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src_1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ src0.x = src_0.s0;
+ src0.y = src_0.s3;
+ src1.x = src_1.s0;
+ src1.y = src_1.s3;
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ src0.x = src_0.s1;
+ src0.y = src_0.s4;
+ src1.x = src_1.s1;
+ src1.y = src_1.s4;
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ src0.x = src_0.s2;
+ src0.y = src_0.s5;
+ src1.x = src_1.s2;
+ src1.y = src_1.s5;
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+}
+
+__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb
+(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float _m0,
+ float _m1,
+ float _m2,
+ float _m3,
+ float _m4,
+ float _m5
+)
+{
+ int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
+
+ float4 coord_f = convert_float4(coord_in);
+
+ coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
+
+ coord_in.x = floor(coord_f.x) * 3;
+ coord_in.y = floor(coord_f.y);
+ coord_in.z = floor(coord_f.z) * 3;
+ coord_in.w = floor(coord_f.w);
+
+ int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
+ int8 input_desc;
+ _viv_asm(COPY, input_desc, input, sizeof(input_desc));
+ int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
+ _viv_asm(MOV, coord_input.w, baseAddr);
+
+ vxc_uchar16 dst;
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ coord_input.x = coord_input.x + 1;
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
+ coord_input.x = coord_input.x + 1;
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
+ coord_input.xy = coord_in.zw;
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
+ coord_input.x = coord_input.x + 1;
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
+ coord_input.x = coord_input.x + 1;
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
+
+    VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+}
+
+__kernel void custom_warp_affine_bilinear_U8toU8_rgb
+(
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ float _m0,
+ float _m1,
+ float _m2,
+ float _m3,
+ float _m4,
+ float _m5
+)
+{
+ int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
+
+ float4 coord_f = convert_float4(coord_in);
+
+ coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
+
+ coord_in.x = floor(coord_f.x) * 3;
+ coord_in.y = floor(coord_f.y);
+ coord_in.z = floor(coord_f.z) * 3;
+ coord_in.w = floor(coord_f.w);
+
+ int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
+ int8 input_desc;
+ _viv_asm(COPY, input_desc, input, sizeof(input_desc));
+ int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
+ _viv_asm(MOV, coord_input.w, baseAddr);
+
+ vxc_uchar16 src0, src1, src_0, src_1, dst;
+ VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ src0.x = src_0.s0;
+ src0.y = src_0.s3;
+ src1.x = src_1.s0;
+ src1.y = src_1.s3;
+
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ src0.x = src_0.s1;
+ src0.y = src_0.s4;
+ src1.x = src_1.s1;
+ src1.y = src_1.s4;
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ src0.x = src_0.s2;
+ src0.y = src_0.s5;
+ src1.x = src_1.s2;
+ src1.y = src_1.s5;
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ coord_input.xy = coord_in.zw;
+ VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ src0.x = src_0.s0;
+ src0.y = src_0.s3;
+ src1.x = src_1.s0;
+ src1.y = src_1.s3;
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ src0.x = src_0.s1;
+ src0.y = src_0.s4;
+ src1.x = src_1.s1;
+ src1.y = src_1.s4;
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ src0.x = src_0.s2;
+ src0.y = src_0.s5;
+ src1.x = src_1.s2;
+ src1.y = src_1.s5;
+#if (VX_VERSION==1)
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
+#else
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
+ src1.s0 = src0.s1;
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
+#endif
+
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));
+}
\ No newline at end of file
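The new RGB kernels map every output pixel through the affine transform packed into matrix0/matrix1 (presumably {m0, m1, m2, m3} and {m4, m5} from the kernel's float arguments) and then fetch the three interleaved channels at the mapped location. A scalar sketch of the nearest-neighbor path, with that packing assumed:

#include <math.h>

/* Illustrative scalar model of the coordinate mapping; out-of-range samples
 * return 0 here, whereas the kernel relies on the image border behaviour. */
static void warp_affine_rgb_nearest_ref(const unsigned char *src, int src_w, int src_h,
                                        unsigned char *dst, int dst_w, int dst_h,
                                        const float m[6])
{
    for (int y = 0; y < dst_h; ++y) {
        for (int x = 0; x < dst_w; ++x) {
            /* source coordinate: xs = m0*x + m2*y + m4, ys = m1*x + m3*y + m5 */
            int xs = (int)floorf(m[0] * x + m[2] * y + m[4]);
            int ys = (int)floorf(m[1] * x + m[3] * y + m[5]);
            for (int c = 0; c < 3; ++c) {      /* interleaved R,G,B bytes */
                unsigned char v = 0;
                if (xs >= 0 && xs < src_w && ys >= 0 && ys < src_h)
                    v = src[(ys * src_w + xs) * 3 + c];
                dst[(y * dst_w + x) * 3 + c] = v;
            }
        }
    }
}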
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx
index 3a1661e85..73171a8b0 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx
@@ -18,6 +18,7 @@ __kernel void gather_I8toI8(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
vxc_char16 src;
@@ -42,6 +43,7 @@ __kernel void gather_U8toU8(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
vxc_uchar16 src;
@@ -66,8 +68,8 @@ __kernel void gather_I16toI16(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
-
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -92,6 +94,7 @@ __kernel void gather_F16toF16(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -112,6 +115,7 @@ __kernel void gather_I8toI8_axis0(
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
+ indices = indices >= 0 ? indices : indices + axis_num;
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_char16 src, dst;
@@ -138,6 +142,7 @@ __kernel void gather_U8toU8_axis0(
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
+ indices = indices >= 0 ? indices : indices + axis_num;
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_uchar16 src, dst;
@@ -164,6 +169,7 @@ __kernel void gather_I16toI16_axis0(
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
+ indices = indices >= 0 ? indices : indices + axis_num;
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_short8 src, dst;
@@ -190,6 +196,7 @@ __kernel void gather_F16toF16_axis0(
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
+ indices = indices >= 0 ? indices : indices + axis_num;
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_short8 src, dst;
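The single added line in each gather kernel wraps negative indices the way Python-style indexing does, so -1 addresses the last element of the gathered axis; the same normalization is repeated in the array, batch and mixed-type variants further below. In scalar form:

/* axis_num is the size of the gathered axis; -1 maps to axis_num - 1. */
static inline int wrap_index(int idx, int axis_num)
{
    return idx >= 0 ? idx : idx + axis_num;
}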
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx
index 9ed287631..9c21fd131 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx
@@ -18,6 +18,7 @@ __kernel void gather_I8toI8_array(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
Image img1 = create_image_from_image2d(input0, 1);
@@ -46,6 +47,7 @@ __kernel void gather_U8toU8_array(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
Image img1 = create_image_from_image2d(input0, 1);
@@ -74,8 +76,8 @@ __kernel void gather_I16toI16_array(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
-
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
Image img1 = create_image_from_image2d(input0, 2);
@@ -105,6 +107,7 @@ __kernel void gather_F16toF16_array(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
Image img1 = create_image_from_image2d(input0, 2);
@@ -142,6 +145,7 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.xy); \
__global data_type* data_ptr = (__global data_type*)input_ptr; \
__global write_type* out_ptr = (__global write_type*)output_ptr; \
+ indices = indices >= 0 ? indices : indices + axis_num; \
src.s0 = data_ptr[indices.x]; \
src.s1 = data_ptr[indices.y]; \
src.s2 = data_ptr[indices.z]; \
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx
index 8d09d50d4..47f1db609 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx
@@ -24,6 +24,7 @@ __kernel void gather_batch_I8toI8(
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.y = gidz * axis_num + indice.x;
vxc_char16 src;
@@ -54,6 +55,7 @@ __kernel void gather_batch_U8toU8(
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.y = gidz * axis_num + indice.x;
vxc_uchar16 src;
@@ -84,6 +86,7 @@ __kernel void gather_batch_I16toI16(
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.y = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -114,6 +117,7 @@ __kernel void gather_batch_F16toF16(
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.y = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -135,6 +139,7 @@ __kernel void gather_batch_I8toI8_axis0(
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
+ indices = indices >= 0 ? indices : indices + axis_num;
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_char16 src, dst;
@@ -163,6 +168,7 @@ __kernel void gather_batch_U8toU8_axis0(
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
+ indices = indices >= 0 ? indices : indices + axis_num;
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_uchar16 src, dst;
@@ -191,6 +197,7 @@ __kernel void gather_batch_I16toI16_axis0(
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
+ indices = indices >= 0 ? indices : indices + axis_num;
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
@@ -219,6 +226,7 @@ __kernel void gather_batch_F16toF16_axis0(
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
+ indices = indices >= 0 ? indices : indices + axis_num;
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx
index 39a8a990d..9f962c410 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx
@@ -1,6 +1,12 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int axis_size;
+_viv_uniform uint width0;
+_viv_uniform uint height0;
+_viv_uniform uint width1;
+_viv_uniform uint height1;
+_viv_uniform uint width_out;
+_viv_uniform uint height_out;
#define GATHER_ELEMENTS_AXIS0_2D(name, data_type) \
__kernel void gather_elements_axis0_##name##_I32to##name##_2D \
@@ -151,3 +157,141 @@ GATHER_ELEMENTS_AXIS2(F16, vxc_short4)
GATHER_ELEMENTS_AXIS2(I16, vxc_short4)
GATHER_ELEMENTS_AXIS2(I8, vxc_char4)
GATHER_ELEMENTS_AXIS2(U8, vxc_uchar4)
+
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \
+__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ int axis \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \
+ int* index_ptr = (int*)index_tensor.ptr; \
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \
+ \
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \
+ data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \
+ \
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \
+}
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1)
+
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \
+__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ int axis \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \
+ int* index_ptr = (int*)index_tensor.ptr; \
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \
+ \
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \
+ data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \
+ \
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \
+}
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1)
+
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \
+__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __write_only image2d_array_t output, \
+ int axis \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \
+ int* index_ptr = (int*)index_tensor.ptr; \
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \
+ \
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \
+ data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \
+ \
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \
+}
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1)
+
+
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \
+__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \
+ ( \
+ __read_only image2d_t input0, \
+ __read_only image2d_t input1, \
+ __write_only image2d_t output, \
+ int axis \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ Image index_img = create_image_from_image2d(input1, 4); \
+ int* index_ptr = (int*)index_img.ptr; \
+ int index = index_ptr[coord.x + coord.y * width1]; \
+ \
+ Image input_img = create_image_from_image2d(input0, stride); \
+ data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \
+ data_type data = input_ptr[index + coord.y * width0]; \
+ \
+ Image output_img = create_image_from_image2d(output, stride); \
+ data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \
+ output_ptr[coord.x + coord.y * width_out] = data; \
+}
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, uchar, uchar*, 1)
+
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \
+__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \
+ ( \
+ __read_only image2d_t input0, \
+ __read_only image2d_t input1, \
+ __write_only image2d_t output, \
+ int axis \
+ ) \
+{ \
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ Image index_img = create_image_from_image2d(input1, 4); \
+ int* index_ptr = (int*)index_img.ptr; \
+ int index = index_ptr[coord.x + coord.y * width1]; \
+ \
+ Image input_img = create_image_from_image2d(input0, stride); \
+ data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \
+ data_type data = input_ptr[coord.x + index * width0]; \
+ \
+ Image output_img = create_image_from_image2d(output, stride); \
+ data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \
+ output_ptr[coord.x + coord.y * width_out] = data; \
+}
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1)
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1)
+
+
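The beyond_maxwidth variants drop VXC image reads in favour of flat pointer arithmetic over the tensor storage, which keeps them usable when a dimension exceeds the hardware image-width limit; the width*/height* uniforms describe the data tensor, the index tensor and the output. A scalar model of the axis-0 case (sketch only):

/* Flat addressing with data laid out as x + y*width + z*width*height. */
static short gather_elements_axis0_ref(const short *data, const int *index,
                                       int x, int y, int z,
                                       int width0, int height0,
                                       int width1, int height1)
{
    int idx = index[x + y * width1 + z * width1 * height1]; /* gathered coordinate along axis 0 */
    return data[idx + y * width0 + z * width0 * height0];
}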
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx
index e9b8fd14e..87825fd13 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx
@@ -24,6 +24,7 @@ __kernel void gather_##src0_type_name##toF16( \
\
int4 coord_in = (int4)(gidy, 0, gidx, 0); \
int4 indice = read_imagei(input1, coord_in.xy); \
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \
coord_in.w = gidz * axis_num + indice.x; \
\
read_type src; \
@@ -60,6 +61,7 @@ __kernel void gather_F16to##src1_type_name( \
int4 coord_in = (int4)(gidy, 0, gidx, 0); \
\
int4 indice = read_imagei(input1, coord_in.xy); \
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \
coord_in.w = gidz * axis_num + indice.x; \
\
vxc_short8 src; \
@@ -92,6 +94,7 @@ __kernel void gather_I16toF16(
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.w = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -122,6 +125,7 @@ __kernel void gather_##src0_type_name##toF16_axis0( \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
int4 indices = read_imagei(input1, coord.xx); \
+ indices = indices >= 0 ? indices : indices + axis_num; \
int2 coord_in = (int2)(indices.x, get_global_id(1)); \
\
read_type src; \
@@ -153,6 +157,7 @@ __kernel void gather_F16to##src1_type_name##_axis0( \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
int4 indices = read_imagei(input1, coord.xx); \
+ indices = indices >= 0 ? indices : indices + axis_num; \
int2 coord_in = (int2)(indices.x, get_global_id(1)); \
\
vxc_short8 src; \
@@ -184,6 +189,7 @@ __kernel void gather_I16toF16_axis0(
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 indices = read_imagei(input1, coord.xx);
+ indices = indices >= 0 ? indices : indices + axis_num;
int2 coord_in = (int2)(indices.x, get_global_id(1));
vxc_short8 src;
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx
index 0e94445ca..988c81183 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx
@@ -33,6 +33,7 @@ __kernel void gather_batch_##src0_type_name##toF16( \
{ \
int4 indice = read_imagei(input1, coord_idx); \
coord_idx.y++; \
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \
coord_in.y = gidz * axis_num + indice.x; \
\
read_type src; \
@@ -78,6 +79,7 @@ __kernel void gather_batch_F16to##src1_type_name( \
{ \
int4 indice = read_imagei(input1, coord_idx); \
coord_idx.y++; \
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \
coord_in.y = gidz * axis_num + indice.x; \
\
vxc_short8 src; \
@@ -120,6 +122,7 @@ __kernel void gather_batch_I16toF16(
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;
coord_in.y = gidz * axis_num + indice.x;
vxc_short8 src;
@@ -145,6 +148,7 @@ __kernel void gather_batch_##src0_type_name##toF16_axis0( \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 indices = read_imagei(input1, coord.xz); \
+ indices = indices >= 0 ? indices : indices + axis_num; \
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \
\
read_type src; \
@@ -179,6 +183,7 @@ __kernel void gather_batch_F16to##src1_type_name##_axis0( \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 indices = read_imagei(input1, coord.xz); \
+ indices = indices >= 0 ? indices : indices + axis_num; \
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \
\
vxc_short8 src; \
@@ -213,6 +218,7 @@ __kernel void gather_batch_I16toF16_axis0(
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
+ indices = indices >= 0 ? indices : indices + axis_num;
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx
index c479a3b58..e467f252e 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx
@@ -2,93 +2,96 @@
__kernel void gather_nd_batch_I8toI8_1D(
__read_only image2d_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch
+ int gidy = get_global_id(1); // index num
+ int gidz = get_global_id(2); // batch num
- int4 coord = (int4)(gidx, gidy, 0, 0);
- Image img = create_image_from_image2d(input1, 4);
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);
+ int4 coord = (int4)(gidx, gidy, gidz, 0);
+ Tensor img = create_tensor_from_image2d_array(input1, 4);
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
-
- coord.z = indice.x * block_size + gidx;
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
vxc_char16 src;
- VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_batch_U8toU8_1D(
__read_only image2d_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch num
+ int gidy = get_global_id(1); // index num
+ int gidz = get_global_id(2); // batch num
- int4 coord = (int4)(gidx, gidy, 0, 0);
- Image img = create_image_from_image2d(input1, 4);
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);
+ int4 coord = (int4)(gidx, gidy, gidz, 0);
+ Tensor img = create_tensor_from_image2d_array(input1, 4);
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
- coord.z = indice.x * block_size + gidx;
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
vxc_uchar16 src;
- VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_batch_I16toI16_1D(
__read_only image2d_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch num
+ int gidy = get_global_id(1); // index num
+ int gidz = get_global_id(2); // batch num
- int4 coord = (int4)(gidx, gidy, 0, 0);
- Image img = create_image_from_image2d(input1, 4);
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);
+ int4 coord = (int4)(gidx, gidy, gidz, 0);
+ Tensor img = create_tensor_from_image2d_array(input1, 4);
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
- coord.z = indice.x * block_size + gidx;
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
vxc_short8 src;
- VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_batch_F16toF16_1D(
__read_only image2d_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch num
+ int gidy = get_global_id(1); // index num
+ int gidz = get_global_id(2); // batch num
- int4 coord = (int4)(gidx, gidy, 0, 0);
- Image img = create_image_from_image2d(input1, 4);
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);
+ int4 coord = (int4)(gidx, gidy, gidz, 0);
+ Tensor img = create_tensor_from_image2d_array(input1, 4);
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
- coord.z = indice.x * block_size + gidx;
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);
vxc_short8 src;
- VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
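With this change the batched 1-D gather_nd kernels read one coordinate per (index, batch) pair from a 3-D index tensor and pick the params row selected by the batch id. A scalar reference, assuming params is stored as [batch][params_width] and indices as [batch][index_num] (layout names are illustrative, not taken from the kernels):

static void gather_nd_batch_1d_ref(const short *params, int params_width,
                                   const int *indices, short *out,
                                   int block_size, int index_num, int batch)
{
    for (int b = 0; b < batch; ++b)
        for (int i = 0; i < index_num; ++i) {
            int idx = indices[b * index_num + i];          /* coord_dim == 1 */
            for (int x = 0; x < block_size; ++x)
                out[(b * index_num + i) * block_size + x] =
                    params[b * params_width + idx * block_size + x];
        }
}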
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx
index acc6c4cfc..58c2af349 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx
@@ -2,18 +2,19 @@
__kernel void gather_nd_batch_I8toI8_2D(
__read_only image2d_array_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch num
+ int gidy = get_global_id(1); // index num
+ int gidz = get_global_id(2); // batch num
- int4 coord = (int4)(gidx, 0, gidy, 0);
- Image img = create_image_from_image2d(input1, 4);
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);
+ int4 coord = (int4)(gidx, gidy, gidz, 0);
+ Tensor img = create_tensor_from_image2d_array(input1, 4);
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
@@ -22,23 +23,24 @@ __kernel void gather_nd_batch_I8toI8_2D(
vxc_char16 src;
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_U8toU8_2D(
__read_only image2d_array_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch num
+ int gidy = get_global_id(1); // index num
+ int gidz = get_global_id(2); // batch num
- int4 coord = (int4)(gidx, 0, gidy, 0);
- Image img = create_image_from_image2d(input1, 4);
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);
+ int4 coord = (int4)(gidx, gidy, gidz, 0);
+ Tensor img = create_tensor_from_image2d_array(input1, 4);
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
@@ -46,23 +48,24 @@ __kernel void gather_nd_U8toU8_2D(
vxc_uchar16 src;
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_I16toI16_2D(
__read_only image2d_array_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch num
+ int gidy = get_global_id(1); // index num
+ int gidz = get_global_id(2); // batch num
- int4 coord = (int4)(gidx, 0, gidy, 0);
- Image img = create_image_from_image2d(input1, 4);
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);
+ int4 coord = (int4)(gidx, gidy, gidz, 0);
+ Tensor img = create_tensor_from_image2d_array(input1, 4);
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
@@ -70,23 +73,24 @@ __kernel void gather_nd_I16toI16_2D(
vxc_short8 src;
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_nd_F16toF16_2D(
__read_only image2d_array_t input0,
- __read_only image2d_t input1,
- __write_only image2d_t output,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
int block_size,
int coord_dim
)
{
int gidx = get_global_id(0); // block_size
- int gidy = get_global_id(1); // batch num
+ int gidy = get_global_id(1); // index num
+ int gidz = get_global_id(2); // batch num
- int4 coord = (int4)(gidx, 0, gidy, 0);
- Image img = create_image_from_image2d(input1, 4);
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);
+ int4 coord = (int4)(gidx, gidy, gidz, 0);
+ Tensor img = create_tensor_from_image2d_array(input1, 4);
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);
int4 indice = ((int4 *)indice_ptr)[0];
indice.x = indice.x * block_size + gidx;
@@ -94,5 +98,5 @@ __kernel void gather_nd_F16toF16_2D(
vxc_short8 src;
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx
index c1b970d43..5dfbc3ad7 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx
@@ -184,12 +184,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_array_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
+ int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \
+ int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \
src_type src0; \
dst_type dst; \
vxc_short8 src1; \
@@ -235,7 +236,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_array_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
int gidz = get_global_id(1); \
int2 coord = (int2)(get_global_id(0), gidz); \
@@ -285,12 +286,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
+ int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \
+ int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \
src_type src0; \
dst_type dst; \
float scale_vari, bias_val; \
@@ -331,7 +333,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
int gidz = get_global_id(1); \
int2 coord = (int2)(get_global_id(0), gidz); \
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx
index 3562ae557..8b45e178f 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx
@@ -17,12 +17,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_array_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
+ int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \
+ int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \
src_type src0; \
vxc_short8 src1, outval; \
vxc_half8 scale_h, dst; \
@@ -75,7 +76,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_array_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
int gidz = get_global_id(1); \
int2 coord = (int2)(get_global_id(0), gidz); \
@@ -132,12 +133,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
+ int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \
+ int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \
src_type src0; \
vxc_short8 outval; \
vxc_half8 dst; \
@@ -186,7 +188,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
int gidz = get_global_id(1); \
int2 coord = (int2)(get_global_id(0), gidz); \
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx
index b62b67faf..33edef844 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx
@@ -138,12 +138,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_array_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
+ int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \
+    int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \
vxc_short8 src0; \
vxc_short8 src1; \
vxc_half8 scale_h; \
@@ -195,7 +196,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_array_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
int gidz = get_global_id(1); \
int2 coord = (int2)(get_global_id(0), gidz); \
@@ -250,12 +251,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
+ int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \
+ int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \
vxc_short8 src0; \
src_type in_h; \
float scale_vari, bias_val; \
@@ -302,7 +304,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_t scale, \
__read_only image2d_t meanVari, \
__write_only image2d_array_t output, \
- float eps, int is2D, float rSpaceOrg, int pStride) \
+ float eps, int is2D, float rSpaceOrg, float pStride) \
{ \
int gidz = get_global_id(1); \
int2 coord = (int2)(get_global_id(0), gidz); \
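Across all three group-normalization files the per-row parameter stride pStride changes from int to float, and the x coordinate of the scale/bias lookup is now built from two separately truncated products instead of one, so a fractional stride no longer leaks into the gidx term. The index computation in isolation, as a sketch:

/* Matches convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride);
 * both conversions truncate toward zero. */
static int param_coord_x(int gidx, int gidy, float rSpaceOrg, float pStride)
{
    return (int)(gidx * rSpaceOrg) + (int)(gidy * pStride);
}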
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx
index 77fdcc99a..8086f28c9 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx
@@ -29,8 +29,8 @@ _viv_uniform VXC_512Bits uniConvertF16_0_4x4;
_viv_uniform VXC_512Bits uniConvertF16_1_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
-#define GRUCELL_F16_F16TOF16(act_name, act_func) \
-__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \
+#define GRUCELL_F16_F16TOF16(act_name, act_func, rec_act_name, rec_act_func) \
+__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name##_##rec_act_name( \
__read_only image2d_t hstate_in, \
__read_only image2d_t input_z_conv, \
__read_only image2d_t input_r_conv, \
@@ -62,15 +62,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \
\
float4 r; \
VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \
- r = act_func(r); \
+ r = rec_act_func(r); \
float4 h0, h1; \
VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \
VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \
float4 h = h0 + r * h1; \
float4 z; \
VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \
- z = act_func(z); \
- h = tanh_func(h); \
+ z = rec_act_func(z); \
+ h = act_func(h); \
float4 h_tm; \
VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \
float4 result = (1 - z) * h + z * h_tm; \
@@ -83,14 +83,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \
VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
-GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)
+GRUCELL_F16_F16TOF16(TANH, tanh_func, SIGMOID, sigmoid_func)
+GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func)
_viv_uniform float hstate_in_scale;
_viv_uniform float hstate_in_tail;
_viv_uniform float output_scale;
_viv_uniform float output_zp;
-#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \
-__kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name( \
+#define GRUCELL_QNT_F16TO_QNT(name, act_func, rec_act_func, src0_type, dst_type) \
+__kernel void grucell_reset_after_activation_##name( \
__read_only image2d_t hstate_in, \
__read_only image2d_t input_z_conv, \
__read_only image2d_t input_r_conv, \
@@ -122,15 +123,15 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name
\
float4 r; \
VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \
- r = act_func(r); \
+ r = rec_act_func(r); \
float4 h0, h1; \
VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \
VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \
float4 h = h0 + r * h1; \
float4 z; \
VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \
- z = act_func(z); \
- h = tanh_func(h); \
+ z = rec_act_func(z); \
+ h = act_func(h); \
float4 h_tm; \
VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \
h_tm = h_tm * hstate_in_scale + hstate_in_tail; \
@@ -143,6 +144,9 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name
VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
-GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)
-GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)
-GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)
+GRUCELL_QNT_F16TO_QNT(U8_F16toU8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_uchar8, vxc_uchar8)
+GRUCELL_QNT_F16TO_QNT(I8_F16toI8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_char8, vxc_char8)
+GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_short8, vxc_short8)
+GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8)
+GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8)
+GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8)
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross.vx
new file mode 100644
index 000000000..b4dc43c24
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross.vx
@@ -0,0 +1,208 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float output_ZP;
+_viv_uniform float mulKIn0In1Zp;
+_viv_uniform float inOutScale;
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
+
+_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;
+_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;
+
+_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;
+
+#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \
+__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \
+ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \
+ int axis_size, int inner_size, int outer_size, int axis_size0, \
+ int inner_size0, int outer_size0, int axis_size1, int inner_size1, \
+ int outer_size1, int axis_size2, int inner_size2, int outer_size2) \
+{ \
+ read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \
+ vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \
+ int gidz = get_global_id(2); \
+ for(int j = 0; j < outer_size; j++) \
+ { \
+ for(int i = 0; i < inner_size; i++) \
+ { \
+ vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \
+ int4 coord_a = (int4)(0, get_global_id(1), gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \
+ int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \
+ int8 inputA_desc, inputB_desc, output_desc; \
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \
+ _viv_asm(MOV, coord_a.w, baseAddr_a); \
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \
+ _viv_asm(MOV, coord_b.w, baseAddr_b); \
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \
+ { \
+ vxc_float4 tempA0, tempA1, tempA2, tempA3; \
+ vxc_float4 tempB0, tempB1, tempB2, tempB3; \
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \
+ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \
+ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \
+ coord_a.x += 4; coord_b.y += 4; \
+ VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8toFp32Block4_4x4); \
+ VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8toFp32Block4_4x4); \
+ VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8toFp32Block4_4x4); \
+ VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8toFp32Block4_4x4); \
+ VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8MulZptoFp32_8x4); \
+ VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8MulZptoFp32_8x4); \
+ VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8MulZptoFp32_8x4); \
+ VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8MulZptoFp32_8x4); \
+ sum0 += tempA0 + tempB0; \
+ sum1 += tempA1 + tempB1; \
+ sum2 += tempA2 + tempB2; \
+ sum3 += tempA3 + tempB3; \
+ } \
+ vxc_int4 tmpOut0, tmpOut1; \
+ coord_b.y = get_global_id(1); \
+ coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \
+ _viv_asm(MOV, coord_b.w, baseAddr); \
+ tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \
+ tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \
+ tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+}
+GEMM_QINT_TO_QINT_CROSS(U8, vxc_uchar16)
+GEMM_QINT_TO_QINT_CROSS(I8, vxc_char16)
+
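+/* FP16 batched matmul with the same cross-batch indexing as above: four rows of A and a
+ * packed 4x4 block of B (srcB.hi / srcB.lo) are combined with the dual-source VXC_DP4x4_b
+ * dot product, accumulated in float32, then converted back to half for the store. */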
+__kernel void gemm_F16F16toF16_cross(image2d_array_t inputA,
+ image2d_array_t inputB, image2d_array_t output,
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N,
+ int axis_size, int inner_size, int outer_size, int axis_size0,
+ int inner_size0, int outer_size0, int axis_size1, int inner_size1,
+ int outer_size1, int axis_size2, int inner_size2, int outer_size2)
+{
+ uint gidy = get_global_id(1);
+ uint gidz = get_global_id(2);
+ for(int j = 0; j < outer_size; j++)
+ {
+ for(int i = 0; i < inner_size; i++)
+ {
+ int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0);
+ int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0);
+
+ half4 valC;
+ vxc_short8 srcA0, srcA1, srcA2, srcA3, outC;
+ vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3;
+ vxc_short16 srcB;
+ vxc_half16 tmpB;
+ vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);
+ vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);
+
+ int8 inputA_desc, inputB_desc, output_desc;
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;
+ _viv_asm(MOV, coord_a.w, baseAddr_a);
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;
+ _viv_asm(MOV, coord_b.w, baseAddr_b);
+
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)
+ {
+ vxc_float4 tempA0, tempA1, tempA2, tempA3;
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+ coord_a.x += 4; coord_b.y += 4;
+ _viv_asm(COPY, tmpA0, srcA0, 16);
+ _viv_asm(COPY, tmpA1, srcA1, 16);
+ _viv_asm(COPY, tmpA2, srcA2, 16);
+ _viv_asm(COPY, tmpA3, srcA3, 16);
+ _viv_asm(COPY, tmpB.hi, srcB.hi, 16);
+ _viv_asm(COPY, tmpB.lo, srcB.lo, 16);
+ VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmU8F16toF32Lo_4x4b);
+ VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmU8F16toF32Lo_4x4b);
+ VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmU8F16toF32Lo_4x4b);
+ VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmU8F16toF32Lo_4x4b);
+ sum0 += (tempA0);
+ sum1 += (tempA1);
+ sum2 += (tempA2);
+ sum3 += (tempA3);
+ }
+ coord_b.y = gidy;
+ coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2;
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc));
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;
+ _viv_asm(MOV, coord_b.w, baseAddr);
+ _viv_asm(CONV, valC, sum0);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ coord_b.y++;
+ _viv_asm(CONV, valC, sum1);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ coord_b.y++;
+ _viv_asm(CONV, valC, sum2);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ coord_b.y++;
+ _viv_asm(CONV, valC, sum3);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ }
+ }
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross_i16.vx
new file mode 100644
index 000000000..241118079
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross_i16.vx
@@ -0,0 +1,214 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform int input0_ZP;
+_viv_uniform int input1_ZP;
+_viv_uniform float output_ZP;
+_viv_uniform float outputScale;
+_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;
+_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4;
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
+_viv_uniform int ac2zero;
+_viv_uniform int bc2zero;
+
+_viv_uniform int outer;
+
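+/* I16 "merge" batched matmul: each 4-element load is converted to float32 with the
+ * zero-point subtracted (uniConvertUint8SubZpToFp32 uniforms), the 4x4 tile is accumulated
+ * with scalar-times-vector FMAs, and the result is requantized with outputScale / output_ZP.
+ * ac2zero / bc2zero choose whether the A or B slice index follows the outer loop i or the
+ * global batch id. */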
+#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \
+__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \
+ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \
+{ \
+ uint gidy = get_global_id(1); \
+ short in0_zp, in1_zp; \
+ _viv_asm(COPY, in0_zp, input0_ZP, 4); \
+ _viv_asm(COPY, in1_zp, input1_ZP, 4); \
+ for(int i = 0; i < outer; i++) \
+ { \
+ read_type srcA, srcB, outC; \
+ int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0); \
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); \
+ vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \
+ vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \
+ \
+ int8 inputA_desc, inputB_desc, output_desc; \
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \
+ _viv_asm(MOV, coord_a.w, baseAddr_a); \
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \
+ _viv_asm(MOV, coord_b.w, baseAddr_b); \
+ \
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \
+ { \
+ vxc_float4 tempA0, tempA1, tempA2, tempA3; \
+ vxc_float4 tempB0, tempB1, tempB2, tempB3; \
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32_4x4); \
+ VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32B_4x4); \
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32_4x4); \
+ VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32B_4x4); \
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32_4x4); \
+ VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32B_4x4); \
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ coord_a.x += 4; \
+ coord_b.y += 4; \
+ VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32_4x4); \
+ VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32B_4x4); \
+ sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \
+ sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \
+ sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \
+ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \
+ } \
+ vxc_int4 tmpOut0, tmpOut1; \
+ coord_b.y = gidy; \
+ coord_b.z = get_global_id(2) + i * get_global_size(2); \
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \
+ _viv_asm(MOV, coord_b.w, baseAddr); \
+ tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \
+ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \
+ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ } \
+}
+GEMM_QINT_TO_QINT_MERGE(I16, vxc_short8)
+
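+/* Same I16 dequantize / accumulate / requantize scheme as the merge kernel above, but the
+ * batch slice of A, B and the output is derived from axis/inner/outer strides (cross batch
+ * layout) instead of the ac2zero / bc2zero flags. */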
+#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \
+__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \
+ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \
+ int axis_size, int inner_size, int outer_size, int axis_size0, \
+ int inner_size0, int outer_size0, int axis_size1, int inner_size1, \
+ int outer_size1, int axis_size2, int inner_size2, int outer_size2) \
+{ \
+ uint gidy = get_global_id(1); \
+ uint gidz = get_global_id(2); \
+ short in0_zp, in1_zp; \
+ _viv_asm(COPY, in0_zp, input0_ZP, 4); \
+ _viv_asm(COPY, in1_zp, input1_ZP, 4); \
+ for(int j = 0; j < outer_size; j++) \
+ { \
+ for(int i = 0; i < inner_size; i++) \
+ { \
+ read_type srcA, srcB, outC; \
+ int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \
+ int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \
+ vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \
+ vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \
+ \
+ int8 inputA_desc, inputB_desc, output_desc; \
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \
+ _viv_asm(MOV, coord_a.w, baseAddr_a); \
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \
+ _viv_asm(MOV, coord_b.w, baseAddr_b); \
+ \
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \
+ { \
+ vxc_float4 tempA0, tempA1, tempA2, tempA3; \
+ vxc_float4 tempB0, tempB1, tempB2, tempB3; \
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32_4x4); \
+ VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32B_4x4); \
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32_4x4); \
+ VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32B_4x4); \
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32_4x4); \
+ VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32B_4x4); \
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ coord_a.x += 4; \
+ coord_b.y += 4; \
+ VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32_4x4); \
+ VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniConvertUint8SubZpToFp32B_4x4); \
+ sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \
+ sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \
+ sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \
+ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \
+ } \
+ vxc_int4 tmpOut0, tmpOut1; \
+ coord_b.y = gidy; \
+ coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \
+ _viv_asm(MOV, coord_b.w, baseAddr); \
+ tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \
+ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \
+ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ } \
+ } \
+}
+GEMM_QINT_TO_QINT_CROSS(I16, vxc_short8)
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_merge.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_merge.vx
new file mode 100644
index 000000000..9f33be797
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_merge.vx
@@ -0,0 +1,294 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float output_ZP;
+_viv_uniform float mulKIn0In1Zp;
+_viv_uniform float inOutScale;
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
+_viv_uniform int ac2zero;
+_viv_uniform int bc2zero;
+_viv_uniform int outer;
+
+_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;
+_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;
+
+_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Lo_4x4;
+_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Hi_4x4;
+_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Lo_4x4;
+_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Hi_4x4;
+
+_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;
+
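+/* Quantized (U8/I8) "merge" batched matmul: per outer iteration i, ac2zero / bc2zero choose
+ * whether the A or B slice index follows i or the global batch id, and the result is written
+ * to output slice get_global_id(2) + i * get_global_size(2). The accumulation matches the
+ * cross variant: sum starts from mulKIn0In1Zp, zero-point correction terms come from
+ * uniGemmU8U8MulZptoFp32_8x4, and the tile is requantized with inOutScale / output_ZP. */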
+#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \
+__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \
+ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \
+{ \
+ read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \
+ vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \
+ for(int i = 0; i < outer; i++) \
+ { \
+ vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \
+ int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); \
+ int8 inputA_desc, inputB_desc, output_desc; \
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \
+ _viv_asm(MOV, coord_a.w, baseAddr_a); \
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \
+ _viv_asm(MOV, coord_b.w, baseAddr_b); \
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \
+ { \
+ vxc_float4 tempA0, tempA1, tempA2, tempA3; \
+ vxc_float4 tempB0, tempB1, tempB2, tempB3; \
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \
+ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \
+ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \
+ coord_a.x += 4; coord_b.y += 4; \
+ VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8toFp32Block4_4x4); \
+ VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8toFp32Block4_4x4); \
+ VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8toFp32Block4_4x4); \
+ VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8toFp32Block4_4x4); \
+ VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8MulZptoFp32_8x4); \
+ VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8MulZptoFp32_8x4); \
+ VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8MulZptoFp32_8x4); \
+ VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGemmU8U8MulZptoFp32_8x4); \
+ sum0 += tempA0 + tempB0; \
+ sum1 += tempA1 + tempB1; \
+ sum2 += tempA2 + tempB2; \
+ sum3 += tempA3 + tempB3; \
+ } \
+ vxc_int4 tmpOut0, tmpOut1; \
+ coord_b.y = get_global_id(1); \
+ coord_b.z = get_global_id(2) + i * get_global_size(2); \
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \
+ _viv_asm(MOV, coord_b.w, baseAddr); \
+ tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \
+ tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \
+ tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_b.y++; \
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ } \
+}
+GEMM_QINT_TO_QINT_MERGE(U8, vxc_uchar16)
+GEMM_QINT_TO_QINT_MERGE(I8, vxc_char16)
+
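+/* FP16 merge matmul. The VX_VERSION==2 path below packs a 4x4 block of B into a 16-lane
+ * register and uses the dual-source VXC_DP4x4_b dot product. */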
+#if (VX_VERSION==2)
+__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA,
+ image2d_array_t inputB, image2d_array_t output,
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)
+{
+ uint gidy = get_global_id(1);
+ for(int i = 0; i < outer; i++)
+ {
+ int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0);
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0);
+
+ half4 valC;
+ vxc_short8 srcA0, srcA1, srcA2, srcA3, outC;
+ vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3;
+ vxc_short16 srcB;
+ vxc_half16 tmpB;
+ vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);
+ vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);
+
+ int8 inputA_desc, inputB_desc, output_desc;
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;
+ _viv_asm(MOV, coord_a.w, baseAddr_a);
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;
+ _viv_asm(MOV, coord_b.w, baseAddr_b);
+
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)
+ {
+ vxc_float4 tempA0, tempA1, tempA2, tempA3;
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+ coord_a.x += 4; coord_b.y += 4;
+ _viv_asm(COPY, tmpA0, srcA0, 16);
+ _viv_asm(COPY, tmpA1, srcA1, 16);
+ _viv_asm(COPY, tmpA2, srcA2, 16);
+ _viv_asm(COPY, tmpA3, srcA3, 16);
+ _viv_asm(COPY, tmpB.hi, srcB.hi, 16);
+ _viv_asm(COPY, tmpB.lo, srcB.lo, 16);
+ VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmU8F16toF32Lo_4x4b);
+ VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmU8F16toF32Lo_4x4b);
+ VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmU8F16toF32Lo_4x4b);
+ VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmU8F16toF32Lo_4x4b);
+ sum0 += (tempA0);
+ sum1 += (tempA1);
+ sum2 += (tempA2);
+ sum3 += (tempA3);
+ }
+ coord_b.y = gidy;
+ coord_b.z = get_global_id(2) + i * get_global_size(2);
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc));
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;
+ _viv_asm(MOV, coord_b.w, baseAddr);
+ _viv_asm(CONV, valC, sum0);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ coord_b.y++;
+ _viv_asm(CONV, valC, sum1);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ coord_b.y++;
+ _viv_asm(CONV, valC, sum2);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ coord_b.y++;
+ _viv_asm(CONV, valC, sum3);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ }
+}
+#else
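+/* Fallback when VX_VERSION != 2: B is loaded two rows per register and the row-pair FP16
+ * uniforms (uniGemmFp16toFp32Row0/1 Lo/Hi) accumulate the same 4-row x 4-column tile. */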
+__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA,
+ image2d_array_t inputB, image2d_array_t output,
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)
+{
+ uint gidy = get_global_id(1);
+ for(int i = 0; i < outer; i++)
+ {
+ int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0);
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0);
+
+ half4 valC;
+ vxc_short8 srcA0, srcB0, srcA1, srcB1, outC;
+ vxc_half8 tmpA0, tmpB0, tmpA1, tmpB1;
+ vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);
+ vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);
+
+ int8 inputA_desc, inputB_desc, output_desc;
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;
+ _viv_asm(MOV, coord_a.w, baseAddr_a);
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;
+ _viv_asm(MOV, coord_b.w, baseAddr_b);
+
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)
+ {
+ vxc_float4 tempA0, tempA1, tempA2, tempA3;
+ vxc_float4 tempB0, tempB1, tempB2, tempB3;
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+ coord_a.x += 4; coord_b.y += 4;
+ _viv_asm(COPY, tmpA0, srcA0, 16);
+ _viv_asm(COPY, tmpB0, srcB0, 16);
+ _viv_asm(COPY, tmpA1, srcA1, 16);
+ _viv_asm(COPY, tmpB1, srcB1, 16);
+
+ VXC_DP4x4(tempA0, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmFp16toFp32Row0Lo_4x4);
+ VXC_DP4x4(tempB0, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmFp16toFp32Row0Hi_4x4);
+ VXC_DP4x4(tempA1, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmFp16toFp32Row1Lo_4x4);
+ VXC_DP4x4(tempB1, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmFp16toFp32Row1Hi_4x4);
+ VXC_DP4x4(tempA2, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmFp16toFp32Row0Lo_4x4);
+ VXC_DP4x4(tempB2, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmFp16toFp32Row0Hi_4x4);
+ VXC_DP4x4(tempA3, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmFp16toFp32Row1Lo_4x4);
+ VXC_DP4x4(tempB3, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniGemmFp16toFp32Row1Hi_4x4);
+ sum0 += (tempA0 + tempB0);
+ sum1 += (tempA1 + tempB1);
+ sum2 += (tempA2 + tempB2);
+ sum3 += (tempA3 + tempB3);
+ }
+ coord_b.y = gidy;
+ coord_b.z = get_global_id(2) + i * get_global_size(2);
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc));
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;
+ _viv_asm(MOV, coord_b.w, baseAddr);
+ _viv_asm(CONV, valC, sum0);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ coord_b.y++;
+ _viv_asm(CONV, valC, sum1);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ coord_b.y++;
+ _viv_asm(CONV, valC, sum2);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ coord_b.y++;
+ _viv_asm(CONV, valC, sum3);
+ _viv_asm(COPY, outC, valC, 16);
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+ }
+}
+#endif
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_BF16_to_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_BF16_to_BF16.vx
new file mode 100644
index 000000000..03b2c33d4
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_BF16_to_BF16.vx
@@ -0,0 +1,99 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float2 half_input0_wh;
+_viv_uniform float2 add_float_value;
+_viv_uniform int depth;
+
+_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8;
+_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8;
+
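+/* Nearest-neighbour grid sample, BF16 in / BF16 out. Normalized grid coordinates are mapped
+ * to input pixel space (fxy * half_input0_wh + add_float_value, where the add term is
+ * expected to carry the rounding offset), truncated to integer indices, and the four
+ * gathered pixels are copied unchanged through every depth slice. */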
+#define GRID_SAMPLE_BF16_PROCESS() \
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
+ int4 x_idx = convert_int4(in_x); \
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
+ int4 y_idx = convert_int4(in_y); \
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
+ int8 input_desc; \
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
+ int baseAddr = input_desc.s0; \
+ _viv_asm(MOV, coord_in.w, baseAddr); \
+ vxc_short8 src; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ int8 output_desc; \
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
+ _viv_asm(MOV, coord_out.w, baseAddr); \
+ int loop = depth - 1; \
+ while (coord_in.z < loop) \
+ { \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
+ coord_in.x = x_idx.x; \
+ coord_in.y = y_idx.x; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+

+__kernel void nearest_grid_sample_BF16_BF16toBF16(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+
+ coord_in1.xz = coord_in1.xz * 2;
+
+ vxc_short8 read_val;
+ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
+ VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+
+
+ float4 fxy0;
+ float4 fxy1;
+
+ vxc_short8 read_src;
+ VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8);
+ _viv_asm(COPY, fxy0, read_src, 16);
+ VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8);
+ _viv_asm(COPY, fxy1, read_src, 16);
+
+ GRID_SAMPLE_BF16_PROCESS();
+
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_F16.vx
new file mode 100644
index 000000000..ec90f1daa
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_F16.vx
@@ -0,0 +1,148 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float2 half_input0_wh;
+_viv_uniform float2 add_float_value;
+_viv_uniform int depth;
+_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;
+_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;
+_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
+
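+/* Same nearest-neighbour gather as the BF16 variant: coordinates are mapped into pixel
+ * space, truncated to indices, and the four selected FP16 values are streamed through all
+ * depth slices without conversion. */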
+#define GRID_SAMPLE_F16_PROCESS() \
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
+ int4 x_idx = convert_int4(in_x); \
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
+ int4 y_idx = convert_int4(in_y); \
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
+ int8 input_desc; \
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
+ int baseAddr = input_desc.s0; \
+ _viv_asm(MOV, coord_in.w, baseAddr); \
+ vxc_short8 src; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ int8 output_desc; \
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
+ _viv_asm(MOV, coord_out.w, baseAddr); \
+ int loop = depth - 1; \
+ while (coord_in.z < loop) \
+ { \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
+ coord_in.x = x_idx.x; \
+ coord_in.y = y_idx.x; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+
+__kernel void nearest_grid_sample_F16_F32toF16(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+
+ coord_in1.xz = coord_in1.xz * 2;
+ coord_in1.z = coord_in1.z + 4;
+
+ float4 fxy0 = read_imagef(input1, coord_in1.xy);
+ float4 fxy1 = read_imagef(input1, coord_in1.zw);
+
+ GRID_SAMPLE_F16_PROCESS();
+
+}
+
+_viv_uniform int input1_ZP;
+_viv_uniform float input1Scale;
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
+
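+/* Grid coordinates supplied as U8 are dequantized first: (coord - input1_ZP) * input1Scale. */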
+__kernel void nearest_grid_sample_F16_U8toF16(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+ coord_in1.xz = coord_in1.xz * 2;
+ vxc_uchar16 read_coord;
+ VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+ float4 fxy0;
+ float4 fxy1;
+ unsigned char input1ZP;
+ _viv_asm(COPY, input1ZP, input1_ZP, 4);
+ VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
+ VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
+ fxy0 = fxy0 * input1Scale;
+ fxy1 = fxy1 * input1Scale;
+
+ GRID_SAMPLE_F16_PROCESS();
+
+}
+
+
+_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
+_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
+
+__kernel void nearest_grid_sample_F16_F16toF16(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+
+ coord_in1.xz = coord_in1.xz * 2;
+
+ vxc_short8 read_val;
+ vxc_half8 read_coord;
+
+ VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+
+ _viv_asm(COPY, read_coord, read_val, 16);
+
+ float4 fxy0;
+ float4 fxy1;
+
+ VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
+ VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
+
+ GRID_SAMPLE_F16_PROCESS();
+
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_U8.vx
new file mode 100644
index 000000000..6a43dddd0
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_U8.vx
@@ -0,0 +1,171 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float2 half_input0_wh;
+_viv_uniform float2 add_float_value;
+_viv_uniform int depth;
+
+_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
+_viv_uniform float uint8Scale;
+_viv_uniform float output_ZP;
+
+_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
+_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
+
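+/* Nearest-neighbour gather with FP16 input and U8 output: the four gathered half values are
+ * converted to float, scaled by uint8Scale and offset by output_ZP, then packed to 8 bits
+ * through uniExtact8Bit_2x8 for each depth slice. */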
+#define GRID_SAMPLE_F16_to_U8_PROCESS() \
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
+ int4 x_idx = convert_int4(in_x); \
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
+ int4 y_idx = convert_int4(in_y); \
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
+ int8 input_desc; \
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
+ int baseAddr = input_desc.s0; \
+ _viv_asm(MOV, coord_in.w, baseAddr); \
+ vxc_short8 s0; \
+ vxc_uchar16 result; \
+ vxc_half8 src; \
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src, s0, 16); \
+ int8 output_desc; \
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
+ _viv_asm(MOV, coord_out.w, baseAddr); \
+ int loop = depth - 1; \
+ float4 dst4; \
+ int4 dst; \
+ while (coord_in.z < loop) \
+ { \
+ VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \
+ dst4 = dst4 * uint8Scale + output_ZP; \
+ dst = convert_int4_rte(dst4); \
+ VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
+ result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
+ coord_in.x = x_idx.x; \
+ coord_in.y = y_idx.x; \
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ _viv_asm(COPY, src, s0, 16); \
+ } \
+ VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \
+ dst4 = dst4 * uint8Scale + output_ZP; \
+ dst = convert_int4_rte(dst4); \
+ VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+
+
+__kernel void nearest_grid_sample_F16_F32toU8(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+
+ coord_in1.xz = coord_in1.xz * 2;
+ coord_in1.z = coord_in1.z + 4;
+
+ float4 fxy0 = read_imagef(input1, coord_in1.xy);
+ float4 fxy1 = read_imagef(input1, coord_in1.zw);
+ GRID_SAMPLE_F16_to_U8_PROCESS();
+
+}
+
+_viv_uniform int input1_ZP;
+_viv_uniform float input1Scale;
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
+
+
+__kernel void nearest_grid_sample_F16_U8toU8(
+ __read_only image2d_array_t input0,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+
+ coord_in1.xz = coord_in1.xz * 2;
+
+ vxc_uchar16 read_coord;
+
+ VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+
+ float4 fxy0;
+ float4 fxy1;
+
+ unsigned char input1ZP;
+ _viv_asm(COPY, input1ZP, input1_ZP, 4);
+
+ VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
+ VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
+
+ fxy0 = fxy0 * input1Scale;
+ fxy1 = fxy1 * input1Scale;
+
+ GRID_SAMPLE_F16_to_U8_PROCESS();
+
+}
+
+
+__kernel void nearest_grid_sample_F16_F16toU8(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+
+ coord_in1.xz = coord_in1.xz * 2;
+
+ vxc_short8 read_val;
+ vxc_half8 read_coord;
+
+ VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+
+ _viv_asm(COPY, read_coord, read_val, 16);
+
+ float4 fxy0;
+ float4 fxy1;
+
+ VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
+ VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
+
+ GRID_SAMPLE_F16_to_U8_PROCESS();
+
+}
+
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I16_to_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I16_to_I16.vx
new file mode 100644
index 000000000..b838b08d8
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I16_to_I16.vx
@@ -0,0 +1,98 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float2 half_input0_wh;
+_viv_uniform float2 add_float_value;
+_viv_uniform int depth;
+
+_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;
+_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;
+_viv_uniform float input1_scale;
+_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;
+
+
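+/* Nearest-neighbour gather for I16 data: each gathered value is passed through the
+ * uniConvertI8toI8_2x8 dot product (which appears to rescale between the input and output
+ * fixed-point formats) before being stored to every depth slice. */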
+#define GRID_SAMPLE_I16_PROCESS() \
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
+ int4 x_idx = convert_int4(in_x); \
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
+ int4 y_idx = convert_int4(in_y); \
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
+ int8 input_desc; \
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
+ int baseAddr = input_desc.s0; \
+ _viv_asm(MOV, coord_in.w, baseAddr); \
+ vxc_short8 src, dst; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ int8 output_desc; \
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
+ _viv_asm(MOV, coord_out.w, baseAddr); \
+ int loop = depth - 1; \
+ while (coord_in.z < loop) \
+ { \
+ VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
+ coord_in.x = x_idx.x; \
+ coord_in.y = y_idx.x; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+
+__kernel void nearest_grid_sample_I16_I16toI16(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+
+ coord_in1.xz = coord_in1.xz * 2;
+ vxc_short8 read_coord;
+ VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+
+ float4 fxy0;
+ float4 fxy1;
+
+ VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);
+ VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);
+
+ fxy0 = fxy0 * input1_scale;
+ fxy1 = fxy1 * input1_scale;
+
+ GRID_SAMPLE_I16_PROCESS();
+
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I8_to_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I8_to_I8.vx
new file mode 100644
index 000000000..871383cbc
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I8_to_I8.vx
@@ -0,0 +1,97 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float2 half_input0_wh;
+_viv_uniform float2 add_float_value;
+_viv_uniform int depth;
+
+
+_viv_uniform float input1_scale;
+_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;
+_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;
+_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;
+
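+/* I8 variant of the nearest-neighbour gather: identical control flow, with vxc_char16 data
+ * and the same uniConvertI8toI8_2x8 conversion step before each store. */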
+#define GRID_SAMPLE_I8_PROCESS() \
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
+ int4 x_idx = convert_int4(in_x); \
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
+ int4 y_idx = convert_int4(in_y); \
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
+ int8 input_desc; \
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
+ int baseAddr = input_desc.s0; \
+ _viv_asm(MOV, coord_in.w, baseAddr); \
+ vxc_char16 src, dst; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ int8 output_desc; \
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
+ _viv_asm(MOV, coord_out.w, baseAddr); \
+ int loop = depth - 1; \
+ while (coord_in.z < loop) \
+ { \
+ VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
+ coord_in.x = x_idx.x; \
+ coord_in.y = y_idx.x; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+
+
+__kernel void nearest_grid_sample_I8_I8toI8(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+
+ coord_in1.xz = coord_in1.xz * 2;
+ vxc_char16 read_coord;
+ VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+
+ float4 fxy0;
+ float4 fxy1;
+
+ VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);
+ VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);
+
+ fxy0 = fxy0 * input1_scale;
+ fxy1 = fxy1 * input1_scale;
+
+ GRID_SAMPLE_I8_PROCESS();
+
+}
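Note: after the four nearest-neighbour loads, the macro walks the channel (z) dimension by bumping the packed base address in coord.w with the slice pitch taken from the image descriptor (s0 reads as the base address and s4 as the slice pitch, an inference from how the macro uses them, not a documented layout). A plain-C model of that per-slice gather/store loop:

    /* Per-slice gather of the 4 selected pixels, modelling the depth loop in
     * GRID_SAMPLE_I8_PROCESS(); buffers are [depth][height][width] bytes here. */
    #include <stddef.h>

    static void grid_sample_nearest_i8(const signed char *in, signed char *out,
                                       int w, int h, int depth,
                                       const int x_idx[4], const int y_idx[4])
    {
        size_t in_slice  = (size_t)w * h;   /* plays the role of input_desc.s4 */
        size_t out_slice = 4;               /* 4 output pixels per slice       */
        for (int z = 0; z < depth; ++z)
            for (int i = 0; i < 4; ++i)
                out[z * out_slice + i] =
                    in[z * in_slice + (size_t)y_idx[i] * w + x_idx[i]];
    }
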
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_U8_to_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_U8_to_U8.vx
new file mode 100644
index 000000000..696c96dc9
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_U8_to_U8.vx
@@ -0,0 +1,160 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform float2 half_input0_wh;
+_viv_uniform float2 add_float_value;
+_viv_uniform int depth;
+
+_viv_uniform int input1_ZP;
+_viv_uniform float input1Scale;
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
+
+_viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;
+_viv_uniform int2 multAndoutZP; //[0:15] multiplier, [32:63] output zp
+
+#define GRID_SAMPLE_U8_PROCESS() \
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
+ int4 x_idx = convert_int4(in_x); \
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
+ int4 y_idx = convert_int4(in_y); \
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
+ int8 input_desc; \
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
+ int baseAddr = input_desc.s0; \
+ _viv_asm(MOV, coord_in.w, baseAddr); \
+ vxc_uchar16 src, dst; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ int8 output_desc; \
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
+ _viv_asm(MOV, coord_out.w, baseAddr); \
+ int loop = depth - 1; \
+ vxc_ushort8 multiplier; \
+ _viv_asm(COPY, multiplier, multAndoutZP, 16); \
+ while (coord_in.z < loop) \
+ { \
+ VXC_DP2x8(dst, src, multiplier, \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
+ coord_in.x = x_idx.x; \
+ coord_in.y = y_idx.x; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.y; \
+ coord_in.y = y_idx.y; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.z; \
+ coord_in.y = y_idx.z; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = x_idx.w; \
+ coord_in.y = y_idx.w; \
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
+ } \
+ VXC_DP2x8(dst, src, multiplier, \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
+
+
+__kernel void nearest_grid_sample_U8_F32toU8(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+
+ coord_in1.xz = coord_in1.xz * 2;
+ coord_in1.z = coord_in1.z + 4;
+
+ float4 fxy0 = read_imagef(input1, coord_in1.xy);
+ float4 fxy1 = read_imagef(input1, coord_in1.zw);
+ GRID_SAMPLE_U8_PROCESS();
+
+}
+
+
+__kernel void nearest_grid_sample_U8_U8toU8(
+ __read_only image2d_array_t input0,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+
+ coord_in1.xz = coord_in1.xz * 2;
+
+ vxc_uchar16 read_coord;
+
+ VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+
+ float4 fxy0;
+ float4 fxy1;
+
+ unsigned char input1ZP;
+ _viv_asm(COPY, input1ZP, input1_ZP, 4);
+
+ VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
+ VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
+
+ fxy0 = fxy0 * input1Scale;
+ fxy1 = fxy1 * input1Scale;
+
+ GRID_SAMPLE_U8_PROCESS();
+
+}
+
+_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
+_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
+
+__kernel void nearest_grid_sample_U8_F16toU8(
+ __read_only image2d_array_t input0,
+ __read_only image2d_t input1,
+ __write_only image2d_array_t output,
+ int align_corners)
+{
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+ int4 coord_in1 = coord_out.xyxy;
+
+ coord_in1.xz = coord_in1.xz * 2;
+
+ vxc_short8 read_val;
+ vxc_half8 read_coord;
+
+ VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+
+ _viv_asm(COPY, read_coord, read_val, 16);
+
+ float4 fxy0;
+ float4 fxy1;
+
+ VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
+ VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
+
+ GRID_SAMPLE_U8_PROCESS();
+
+}
+
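Note: the U8 variants dequantize the grid as (u8 - input1_ZP) * input1Scale, and requantize the sampled values through the multiplier/post-shift pair packed into multAndoutZP. The exact bit packing consumed by uniMultiplyAndPostShift_2x8 is hardware-specific; the sketch below only illustrates the arithmetic it stands for, with the multiplier width and shift chosen as assumptions.

    /* Illustrative fixed-point requantization: out = clamp(in * M >> shift + zp).
     * M and shift approximate input_scale / output_scale; shift must be >= 1. */
    #include <stdint.h>

    static uint8_t requantize_u8(uint8_t in, int32_t mult, int shift, int32_t out_zp)
    {
        int32_t v = ((int32_t)in * mult + (1 << (shift - 1))) >> shift; /* round */
        v += out_zp;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }
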
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx
index 19873f170..438d7be12 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx
@@ -22,8 +22,8 @@ __kernel void pow_##name \
\
src0_type src0; \
copy0_type data0; \
- src0_type src1; \
- copy0_type data1; \
+ src1_type src1; \
+ copy1_type data1; \
VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, data0, src0, 16); \
VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
@@ -94,8 +94,8 @@ __kernel void pow_##name##_2D \
\
src0_type src0; \
copy0_type data0; \
- src0_type src1; \
- copy0_type data1; \
+ src1_type src1; \
+ copy1_type data1; \
VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, data0, src0, 16); \
VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
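Note: the pow.vx change fixes a copy/paste bug: the second operand was declared with the first operand's vector type, which only works when both inputs share an element type and breaks the mixed-type instantiations of the macro. A tiny C analogue of the pattern (independent type parameters per operand in a generator macro) makes the point:

    /* C analogue of the per-operand type parameters in the pow_##name macro. */
    #include <math.h>
    #include <stdio.h>

    #define DEFINE_POW(name, src0_type, src1_type)                 \
        static double pow_##name(src0_type a, src1_type b)         \
        {                                                          \
            return pow((double)a, (double)b);                      \
        }

    DEFINE_POW(f32_i8, float, signed char)  /* operands may differ in type */

    int main(void)
    {
        printf("%f\n", pow_f32_i8(2.5f, (signed char)3));
        return 0;
    }
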
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx
index 28f3f0c0e..91e4213dd 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx
@@ -28,9 +28,21 @@ _viv_uniform int zp;
_viv_uniform float outputScale;
__kernel void pre_process_bgra_scale_U8toU8(
- __read_only image2d_array_t input, __write_only image2d_array_t output,
- global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
- float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ global int * xRatio,
+ global int * yRatio,
+ global int * xOffset,
+ global int * yOffset,
+ float rMean,
+ float gMean,
+ float bMean,
+ float r_scale,
+ int reverse_channel,
+ int trans,
+ float g_scale,
+ float b_scale
+ )
{
int4 gidx = get_global_id(0);
int gidy = get_global_id(1);
@@ -86,6 +98,7 @@ __kernel void pre_process_bgra_scale_U8toU8(
int4 tmp1, tmp2, result1, result2;
float4 tmpDst, tmp0;
float4 mean = (float4)(bMean, gMean, rMean, 0);
+ float4 var = (float4)(b_scale, g_scale, r_scale, 0);
//tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x);
int tmpV = 1 << 19;
vxc_short8 tmpFx;
@@ -148,9 +161,21 @@ __kernel void pre_process_bgra_scale_U8toU8(
}
__kernel void pre_process_bgra_copy_U8toU8(
- __read_only image2d_array_t input, __write_only image2d_array_t output,
- global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
- float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ global int * xRatio,
+ global int * yRatio,
+ global int * xOffset,
+ global int * yOffset,
+ float rMean,
+ float gMean,
+ float bMean,
+ float r_scale,
+ int reverse_channel,
+ int trans,
+ float g_scale,
+ float b_scale
+)
{
int2 pos = (int2)((get_global_id(0) + (*xOffset)) << 2, get_global_id(1) + (*yOffset));
@@ -165,10 +190,10 @@ __kernel void pre_process_bgra_copy_U8toU8(
VXC_DP4x4(tmpG, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGfromBgra_4x4);
VXC_DP4x4(tmpR, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRfromBgra_4x4);
- tmpDst = (tmpB - bMean) * var;
+ tmpDst = (tmpB - bMean) * b_scale;
result1 = convert_int4_rte(tmpDst * outputScale + zp);
- tmpDst = (tmpG - gMean) * var;
+ tmpDst = (tmpG - gMean) * g_scale;
result2 = convert_int4_rte(tmpDst * outputScale + zp);
VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
@@ -178,7 +203,7 @@ __kernel void pre_process_bgra_copy_U8toU8(
dstPos.z = 1;
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
- tmpDst = (tmpR - rMean) * var;
+ tmpDst = (tmpR - rMean) * r_scale;
result1 = convert_int4_rte(tmpDst * outputScale + zp);
VXC_DP2x8(dst, result1, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
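Note: pre_process_bgra now takes independent r_scale/g_scale/b_scale instead of a single shared var; each channel is normalized as (pixel - mean_c) * scale_c and then quantized with outputScale and zp. A minimal C sketch of that per-channel path (the clamp to [0, 255] is an assumption about the U8 output range):

    /* Per-channel mean/scale normalization followed by U8 quantization, as in
     * pre_process_bgra_copy_U8toU8 after this change. */
    #include <stdint.h>
    #include <math.h>

    static uint8_t preprocess_channel(uint8_t pixel, float mean, float scale,
                                      float output_scale, float zp)
    {
        float v = (pixel - mean) * scale;   /* per-channel normalize */
        float q = v * output_scale + zp;    /* requantize for output */
        long  r = lrintf(q);                /* round to nearest      */
        if (r < 0)   r = 0;
        if (r > 255) r = 255;
        return (uint8_t)r;
    }
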
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx
index fcc8d9c06..a20a579f6 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx
@@ -3,7 +3,10 @@
_viv_uniform int bOrder;
_viv_uniform int rOrder;
-_viv_uniform float outputScaleVar;
+_viv_uniform float outputScaleVar_b;
+_viv_uniform float outputScaleVar_g;
+_viv_uniform float outputScaleVar_r;
+
_viv_uniform float bMeanScaleVarZp;
_viv_uniform float gMeanScaleVarZp;
_viv_uniform float rMeanScaleVarZp;
@@ -28,10 +31,12 @@ __kernel void pre_process_nv12_copy_##name \
float rMean, \
float gMean, \
float bMean, \
- float var, \
+ float r_scale, \
int reverse_channel, \
int trans, \
- int nv_type \
+ int nv_type, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int gidx = get_global_id(0); \
@@ -65,21 +70,21 @@ __kernel void pre_process_nv12_copy_##name \
dst_type dst0; \
save_type dst; \
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \
- tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \
+ tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstB); \
dstPos.z = bOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \
+ tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstG); \
dstPos.z = 1; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \
+ tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstR); \
dstPos.z = rOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
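Note: in the NV12 kernels the per-channel scale is folded on the host so that the kernel does a single multiply-add per channel: tmpDst * outputScaleVar_c + cMeanScaleVarZp. The folding below is an assumption inferred from the kernel-side names (the zero-point uniforms themselves were not touched by this diff), written out as a host-side helper:

    /* Assumed host-side folding feeding outputScaleVar_{b,g,r} and
     * {b,g,r}MeanScaleVarZp; the kernel then computes y * scale_var + mean_zp. */
    typedef struct {
        float scale_var;   /* c_scale * output_scale                      */
        float mean_zp;     /* output_zp - c_mean * c_scale * output_scale */
    } folded_channel_t;

    static folded_channel_t fold_channel(float mean, float scale,
                                         float output_scale, float output_zp)
    {
        folded_channel_t f;
        f.scale_var = scale * output_scale;
        f.mean_zp   = output_zp - mean * scale * output_scale;
        return f;
    }
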
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx
index f4ac83b40..2fe9ad62f 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx
@@ -3,7 +3,10 @@
_viv_uniform int bOrder;
_viv_uniform int rOrder;
-_viv_uniform float outputScaleVar;
+_viv_uniform float outputScaleVar_b;
+_viv_uniform float outputScaleVar_g;
+_viv_uniform float outputScaleVar_r;
+
_viv_uniform float bMeanScaleVarZp;
_viv_uniform float gMeanScaleVarZp;
_viv_uniform float rMeanScaleVarZp;
@@ -36,10 +39,12 @@ __kernel void pre_process_nv12_scale_##name##_gq \
float rMean, \
float gMean, \
float bMean, \
- float var, \
+ float r_scale, \
int reverse_channel, \
int trans, \
- int nv_type \
+ int nv_type, \
+ float g_scale, \
+ float b_scale \
) \
{ \
uint4 gidx = get_global_id(0); \
@@ -93,21 +98,21 @@ __kernel void pre_process_nv12_scale_##name##_gq \
dst_type dst0; \
save_type dst; \
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \
- tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \
+ tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstB); \
dstPos.z = bOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \
+ tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstG); \
dstPos.z = 1; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \
+ tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstR); \
dstPos.z = rOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
@@ -132,10 +137,12 @@ __kernel void pre_process_nv12_scale_##name \
float rMean, \
float gMean, \
float bMean, \
- float var, \
+ float r_scale, \
int reverse_channel, \
int trans, \
- int nv_type \
+ int nv_type, \
+ float g_scale, \
+ float b_scale \
) \
{ \
uint4 gidx = get_global_id(0); \
@@ -187,21 +194,21 @@ __kernel void pre_process_nv12_scale_##name \
dst_type dst0; \
save_type dst; \
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \
- tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \
+ tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstB); \
dstPos.z = bOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \
+ tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstG); \
dstPos.z = 1; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \
+ tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstR); \
dstPos.z = rOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx
index 536c18df0..c42f2eb6b 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx
@@ -29,9 +29,11 @@ __write_only image2d_array_t output, \
float rMean, \
float gMean, \
float bMean, \
- float f32Var, \
+ float r_scale, \
int reverse_channel, \
- int trans \
+ int trans, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int2 ratioXY = (int2)(*xRatio, *yRatio); \
@@ -80,7 +82,7 @@ __write_only image2d_array_t output, \
\
float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \
\
- bgrMean *= f32Var; \
+ bgrMean *= (float4)(b_scale, g_scale, r_scale, 0); \
\
int4 test01, temp1; \
int4 test02, temp2; \
@@ -113,7 +115,7 @@ __write_only image2d_array_t output, \
\
/*convert U8 to dst*/ \
dst_type dst; \
- tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \
+ tmp_dst = tmp_dst * r_scale - bgrMean.zzzz; \
tmp_dst = tmp_dst * outputScale + outputZP; \
conv_type dst0; \
_viv_asm(CONV_RTE, dst0, tmp_dst); \
@@ -140,7 +142,7 @@ __write_only image2d_array_t output, \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
\
- tmp_dst = tmp_dst * f32Var - bgrMean.y; \
+ tmp_dst = tmp_dst * g_scale - bgrMean.y; \
tmp_dst = tmp_dst * outputScale + outputZP; \
_viv_asm(CONV_RTE, dst0, tmp_dst); \
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
@@ -165,7 +167,7 @@ __write_only image2d_array_t output, \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
\
- tmp_dst = tmp_dst * f32Var - bgrMean.x; \
+ tmp_dst = tmp_dst * b_scale - bgrMean.x; \
tmp_dst = tmp_dst * outputScale + outputZP; \
_viv_asm(CONV_RTE, dst0, tmp_dst); \
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx
index 5cb3ebbe7..a008b46e2 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx
@@ -10,8 +10,9 @@ _viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform float output_scale;
_viv_uniform float output_zp;
+_viv_uniform int4 rgb_order;
-#define RESIZE_BILINEAR_4X1(mean, output) \
+#define RESIZE_BILINEAR_4X1(scale, mean, output, _coord) \
VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
@@ -49,21 +50,19 @@ _viv_uniform float output_zp;
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
\
- tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \
+ tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \
_viv_asm(CONV, dst0, tmp_dst); \
VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst1, 8); \
- VXC_WriteImage(output, coord_out, dst, \
+ VXC_WriteImage(output, _coord, dst, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \
__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
( \
__read_only image2d_array_t input, \
- __write_only image2d_array_t output0, \
- __write_only image2d_array_t output1, \
- __write_only image2d_array_t output2, \
+ __write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
@@ -71,7 +70,11 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
float rMean, \
float gMean, \
float bMean, \
- float f32Var \
+ float r_scale, \
+ int reverse, \
+ int height, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int2 ratioXY = (int2)(*xRatio, *yRatio); \
@@ -133,7 +136,8 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
int4 test02, temp2; \
int4 tt; \
vxc_uchar4 val; \
- int2 coord_out = (int2)(xPos.x, yPos); \
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \
+ coord_out.yzw += rgb_order.xyz; \
\
vxc_uchar8 line1, line2; \
\
@@ -158,16 +162,16 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
conv_type dst0; \
dst_type dst1; \
copy_type dst; \
- tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \
_viv_asm(CONV, dst0, tmp_dst); \
VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst1, 8); \
- VXC_WriteImage(output0, coord_out, dst, \
+ VXC_WriteImage(output, coord_out.xy, dst, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- RESIZE_BILINEAR_4X1(gMean, output1) \
- RESIZE_BILINEAR_4X1(bMean, output2) \
+ RESIZE_BILINEAR_4X1(g_scale, gMean, output, coord_out.xz) \
+ RESIZE_BILINEAR_4X1(b_scale, bMean, output, coord_out.xw) \
}
PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)
PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)
@@ -176,9 +180,7 @@ PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)
__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
( \
__read_only image2d_array_t input, \
- __write_only image2d_array_t output0, \
- __write_only image2d_array_t output1, \
- __write_only image2d_array_t output2, \
+ __write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
@@ -186,7 +188,11 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
float rMean, \
float gMean, \
float bMean, \
- float f32Var \
+ float r_scale, \
+ int reverse, \
+ int height, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int2 ratioXY = (int2)(*xRatio, *yRatio); \
@@ -241,7 +247,8 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
\
int4 test01, temp1; \
int4 test02, temp2; \
- int2 coord_out = (int2)(xPos.x, yPos); \
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \
+ coord_out.yzw += rgb_order.xyz; \
\
VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniVecShift10); \
@@ -265,12 +272,12 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
\
int4 dst0; \
write_type dst; \
- tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \
dst0 = convert_int4_rte(tmp_dst); \
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniExtract8Data_2x8); \
\
- VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
coord_in.x = coord.x; \
coord_in.z = 1; \
@@ -310,12 +317,12 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
uniExtractBytes); \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
- tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \
+ tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \
dst0 = convert_int4_rte(tmp_dst); \
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniExtract8Data_2x8); \
\
- VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xz, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
coord_in.x = coord.x; \
coord_in.z = 2; \
@@ -355,12 +362,12 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \
uniExtractBytes); \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
- tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \
+ tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \
dst0 = convert_int4_rte(tmp_dst); \
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniExtract8Data_2x8); \
\
- VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)
PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)
\ No newline at end of file
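Note: the rgb888_planar kernels used to write R, G and B to three separate images (output0/1/2); they now write all three planes into one image and offset the destination row with rgb_order (coord_out.yzw += rgb_order.xyz), which also lets the host swap R and B for the reverse flag. Treating rgb_order as (0, height, 2*height), or its reversed permutation, is an assumption suggested by the new height parameter, not something stated in the diff:

    /* Destination rows for one output line when three planes share one image,
     * mirroring coord_out.yzw += rgb_order.xyz in the kernels above. */
    static void plane_rows(int y, int height, int reverse, int rows[3])
    {
        int r_off = reverse ? 2 * height : 0;   /* assumed host-side rgb_order */
        int g_off = height;
        int b_off = reverse ? 0 : 2 * height;
        rows[0] = y + r_off;   /* row holding this line's R values */
        rows[1] = y + g_off;   /* row holding G                    */
        rows[2] = y + b_off;   /* row holding B                    */
    }
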
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx
index b0714e47c..724b28ad3 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx
@@ -6,14 +6,13 @@ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;
_viv_uniform float output_scale;
_viv_uniform float output_zp;
+_viv_uniform int4 rgb_order;
#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \
__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \
( \
__read_only image2d_array_t input, \
- __write_only image2d_array_t output0, \
- __write_only image2d_array_t output1, \
- __write_only image2d_array_t output2, \
+ __write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
@@ -21,7 +20,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \
float rMean, \
float gMean, \
float bMean, \
- float f32Var \
+ float r_scale, \
+ int reverse, \
+ int height, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
@@ -38,8 +41,9 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \
VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
coord.x = coord.z + 8; \
- float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \
- rMean * output_scale - output_zp, output_scale); \
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\
+ rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \
\
half4 paramData_f16; \
copy_type tmp_dst; \
@@ -49,33 +53,38 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \
VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniDataMeanStddevHi_2x8); \
_viv_asm(COPY, tmp_dst, dst0, 16); \
- VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ int4 coord_out = coord; \
+ coord_out.yw = coord_out.ww + rgb_order.xy; \
+ VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, tmp_dst, dst1, 16); \
- VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
- float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \
- gMean * output_scale - output_zp, output_scale); \
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \
_viv_asm(CONV, paramData_f16, paramData1); \
VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniDataMeanStddevLo_2x8); \
VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniDataMeanStddevHi_2x8); \
_viv_asm(COPY, tmp_dst, dst0, 16); \
- VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, tmp_dst, dst1, 16); \
- VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
- float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \
- bMean * output_scale - output_zp, output_scale); \
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \
_viv_asm(CONV, paramData_f16, paramData2); \
VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniDataMeanStddevLo_2x8); \
VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniDataMeanStddevHi_2x8); \
_viv_asm(COPY, tmp_dst, dst0, 16); \
- VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ coord_out.w = coord.w + rgb_order.z; \
+ VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, tmp_dst, dst1, 16); \
- VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)
PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)
@@ -84,9 +93,7 @@ PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)
__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \
( \
__read_only image2d_array_t input, \
- __write_only image2d_array_t output0, \
- __write_only image2d_array_t output1, \
- __write_only image2d_array_t output2, \
+ __write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
@@ -94,7 +101,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \
float rMean, \
float gMean, \
float bMean, \
- float f32Var \
+ float r_scale, \
+ int reverse, \
+ int height, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
@@ -110,8 +121,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \
coord_in.z ++; \
VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
- float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \
- rMean * output_scale - output_zp, output_scale); \
+ int4 coord_out = coord; \
+ coord_out.xyw = coord.www + rgb_order.xyz; \
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \
\
half4 paramData_f16; \
_viv_asm(CONV, paramData_f16, paramData0); \
@@ -120,27 +134,29 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \
uniDataMeanStddevLo_2x8); \
VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
uniDataMeanStddevHi_2x8); \
- VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
- float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \
- gMean * output_scale - output_zp, output_scale); \
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \
_viv_asm(CONV, paramData_f16, paramData1); \
\
VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniDataMeanStddevLo_2x8); \
VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
uniDataMeanStddevHi_2x8); \
- VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
- float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \
- bMean * output_scale - output_zp, output_scale); \
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \
_viv_asm(CONV, paramData_f16, paramData2); \
\
VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniDataMeanStddevLo_2x8); \
VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
uniDataMeanStddevHi_2x8); \
- VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
}
PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)
PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx
index 1ac60fe72..ed58fa920 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx
@@ -5,13 +5,12 @@ _viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;
_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;
_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;
_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;
+_viv_uniform int4 rgb_order;
__kernel void pre_process_rgb888_planar_4over3_U8toU8
(
__read_only image2d_array_t input,
- __write_only image2d_array_t output0,
- __write_only image2d_array_t output1,
- __write_only image2d_array_t output2,
+ __write_only image2d_array_t output,
global int *xRatio,
global int *yRatio,
global int *xOffset,
@@ -19,7 +18,11 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8
float rMean,
float gMean,
float bMean,
- float f32Var
+ float r_scale,
+ int reverse,
+ int height,
+ float g_scale,
+ float b_scale
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);
@@ -49,9 +52,11 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
- VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ int4 coord_r = coord_out;
+ coord_r.yzw += rgb_order.xxx;
+ VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
@@ -72,9 +77,11 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
- VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ int4 coord_g = coord_out;
+ coord_g.yzw += rgb_order.yyy;
+ VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
@@ -94,17 +101,17 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
- VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ int4 coord_b = coord_out;
+ coord_b.yzw += rgb_order.zzz;
+ VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
}
__kernel void pre_process_rgb888_planar_half_U8toU8
(
__read_only image2d_array_t input,
- __write_only image2d_array_t output0,
- __write_only image2d_array_t output1,
- __write_only image2d_array_t output2,
+ __write_only image2d_array_t output,
global int *xRatio,
global int *yRatio,
global int *xOffset,
@@ -112,7 +119,11 @@ __kernel void pre_process_rgb888_planar_half_U8toU8
float rMean,
float gMean,
float bMean,
- float f32Var
+ float r_scale,
+ int reverse,
+ int height,
+ float g_scale,
+ float b_scale
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);
@@ -130,7 +141,9 @@ __kernel void pre_process_rgb888_planar_half_U8toU8
int2 coord = coord_in.xy >> 1;
- VXC_WriteImage(output0, coord, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output1, coord, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output2, coord, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ int4 coord_rgb = coord.xyyy;
+ coord_rgb.yzw += rgb_order.xyz;
+ VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_rgb.xz, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_0.vx
new file mode 100644
index 000000000..336c4e6e1
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_0.vx
@@ -0,0 +1,377 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniVecShift10;
+_viv_uniform VXC_512Bits uniAddRShift;
+_viv_uniform VXC_512Bits uniGetTempVal;
+_viv_uniform VXC_512Bits uniExtractBytes;
+
+_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;
+
+_viv_uniform float output_scale;
+_viv_uniform float output_zp;
+
+#define RESIZE_BILINEAR_4X1(scale, mean) \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.y; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.z; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.w; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.z ++; \
+ coord_in.x = coord.x; \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ \
+ tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \
+ _viv_asm(CONV, dst0, tmp_dst);
+#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \
+__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \
+ ( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float r_scale, \
+ int reverse, \
+ float g_scale, \
+ float b_scale \
+ ) \
+{ \
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \
+ \
+ int4 xPos = get_global_id(0); \
+ int yPos = get_global_id(1); \
+ \
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \
+ xPos += (int4)(0, 1, 2, 3); \
+ \
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \
+ int4 sx = fx0 & 0xffff8000; \
+ fx0 -= sx; \
+ sx = sx >> 15; \
+ \
+ vxc_short4 fx; \
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniAddRShift); \
+ \
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \
+ int sy = fy & 0xffff8000; \
+ \
+ fy -= sy; \
+ sy = sy >> 15; \
+ \
+ fy = (fy + (1<< 4)) >> 5; \
+ \
+ vxc_uchar16 line0Y; \
+ vxc_uchar16 line1Y; \
+ int4 coord; \
+ int4 coord_in = (int4)(0, 0, 0, 0); \
+ sx = sx + *xOffset; \
+ coord = sx.xyzw; \
+ coord_in.y = sy + *yOffset; \
+ coord_in.x = coord.x; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.y; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.z; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.w; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.z ++; \
+ coord_in.x = coord.x; \
+ \
+ int4 test01, temp1; \
+ int4 test02, temp2; \
+ int4 tt; \
+ vxc_uchar4 val; \
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \
+ coord_out.x = coord_out.x * 3; \
+ coord_out.z = coord_out.x + 8; \
+ \
+ vxc_uchar8 line1, line2; \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ \
+ vxc_float4 tmp_dst; \
+ vxc_uchar4 u8_dst; \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ \
+ conv_type dst0; \
+ dst_type dst1, dst2; \
+ copy_type data0, data1, dst; \
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \
+ _viv_asm(CONV, dst0, tmp_dst); \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ \
+ RESIZE_BILINEAR_4X1(g_scale, gMean) \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ RESIZE_BILINEAR_4X1(b_scale, bMean) \
+ VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ _viv_asm(COPY, data0, dst1, 16); \
+ _viv_asm(COPY, data1, dst2, 16); \
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni16BitsDataInterleave_0_2x8); \
+ VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni16BitsDataInterleave_1_2x8); \
+ VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+}
+PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)
+PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)
+
+#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \
+__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \
+ ( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float r_scale, \
+ int reverse, \
+ float g_scale, \
+ float b_scale \
+ ) \
+{ \
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \
+ int4 xPos = get_global_id(0); \
+ int yPos = get_global_id(1); \
+ \
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \
+ xPos += (int4)(0, 1, 2, 3); \
+ \
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \
+ int4 sx = fx0 & 0xffff8000; \
+ fx0 -= sx; \
+ sx = sx >> 15; \
+ \
+ vxc_short4 fx; \
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \
+ \
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \
+ int sy = fy & 0xffff8000; \
+ \
+ fy -= sy; \
+ sy = sy >> 15; \
+ fy = (fy + (1<< 4)) >> 5; \
+ \
+ vxc_uchar16 line0Y; \
+ vxc_uchar16 line1Y; \
+ int4 coord; \
+ sx = sx + *xOffset; \
+ coord.xyz = sx.xyz; \
+ coord.w = sy + *yOffset; \
+ int2 coord1 = (int2)(sx.w, coord.w); \
+ int4 coord_in = (int4)(coord.xw, 0, 0); \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.y; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.z; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord1.x; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ int4 test01, temp1; \
+ int4 test02, temp2; \
+ int2 coord_out = (int2)(xPos.x, yPos); \
+ coord_out.x = coord_out.x * 3; \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ \
+ vxc_float4 tmp_dst; \
+ vxc_uchar4 u8_dst; \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ \
+ int4 dst0; \
+ write_type dst1, dst; \
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \
+ dst0 = convert_int4_rte(tmp_dst); \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ \
+ coord_in.x = coord.x; \
+ coord_in.z = 1; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.y; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.z; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord1.x; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \
+ dst0 = convert_int4_rte(tmp_dst); \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ \
+ coord_in.x = coord.x; \
+ coord_in.z = 2; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.y; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord.z; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.x = coord1.x; \
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \
+ dst0 = convert_int4_rte(tmp_dst); \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni8BitsDataInterleave_0_2x8); \
+ VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \
+ uni16BitsDataInterleave_1_2x8); \
+ VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \
+}
+PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)
+PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)
\ No newline at end of file
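
Each of the scale/copy kernels above reduces to the same per-channel affine step: out = (src - mean) * channel_scale * output_scale + output_zp, rounded to nearest and saturated into the destination type. A minimal scalar C sketch of that step follows; the function name and the explicit clamp bounds are illustrative assumptions rather than part of the kernels, which perform the rounding and saturation through convert_int4_rte and the VXC_DP2x8 saturation modifiers.

#include <math.h>
#include <stdint.h>

/* Scalar reference for the per-channel normalization used by these kernels.
 * The clamp bounds lo/hi stand in for the destination type's range. */
static int32_t normalize_px(uint8_t src, float mean, float channel_scale,
                            float output_scale, float output_zp,
                            int32_t lo, int32_t hi)
{
    float v = (float)src * channel_scale * output_scale
            - mean * channel_scale * output_scale
            + output_zp;                 /* same expansion as in the kernels */
    int32_t q = (int32_t)lrintf(v);      /* round to nearest, like convert_int4_rte */
    if (q < lo) q = lo;                  /* saturate into the destination range */
    if (q > hi) q = hi;
    return q;
}
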
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_1.vx
new file mode 100644
index 000000000..80c603bc2
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_1.vx
@@ -0,0 +1,153 @@
+
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;
+_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;
+
+_viv_uniform float output_scale;
+_viv_uniform float output_zp;
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;
+
+#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \
+__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \
+ ( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float r_scale, \
+ int reverse, \
+ float g_scale, \
+ float b_scale \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+ coord.xy += (int2)(*xOffset, *yOffset); \
+ vxc_uchar16 src0, src1, src2; \
+ dst_type dst0, dst1; \
+ \
+ int4 coord_in = (int4)(coord.xy, 0, 0); \
+ VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.z ++; \
+ VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.z ++; \
+ VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ int4 coord_out = coord; \
+ coord_out.z = coord_out.z * 3; \
+ coord_out.x = coord_out.z + 8; \
+ float4 paramData0 = (float4)(rMean * output_scale * r_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \
+ \
+ half4 paramData_f16; \
+ copy_type data0, data1, data2, dst; \
+ _viv_asm(CONV, paramData_f16, paramData0); \
+ VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevLo_2x8); \
+ float4 paramData1 = (float4)(gMean * output_scale * g_scale - output_zp,\
+ gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData1); \
+ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevLo_2x8); \
+ _viv_asm(COPY, data0, dst0, 16); \
+ \
+ float4 paramData2 = (float4)(bMean * output_scale * b_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData2); \
+ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevLo_2x8); \
+ _viv_asm(COPY, data1, dst1, 16); \
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni16BitsDataInterleave_0_2x8); \
+ VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni16BitsDataInterleave_1_2x8); \
+ VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+}
+PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)
+PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)
+
+#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \
+__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \
+ ( \
+ __read_only image2d_array_t input, \
+ __write_only image2d_array_t output, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float r_scale, \
+ int reverse, \
+ int height, \
+ float g_scale, \
+ float b_scale \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+ coord.xy += (int2) (*xOffset, *yOffset); \
+ vxc_uchar16 src0, src1, src2; \
+ write_type dst0, dst1, dst2, dst3; \
+ \
+ int4 coord_in = (int4)(coord.xy, 0, 0); \
+ VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.z ++; \
+ VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ coord_in.z ++; \
+ VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ int4 coord_out = coord; \
+ coord_out.z = coord_out.z * 3; \
+ coord_out.x = coord_out.z + 16; \
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \
+ \
+ half4 paramData_f16; \
+ _viv_asm(CONV, paramData_f16, paramData0); \
+ \
+ VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevLo_2x8); \
+ \
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData1); \
+ \
+ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevLo_2x8); \
+ \
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData2); \
+ \
+ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevLo_2x8); \
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni8BitsDataInterleave_0_2x8); \
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uni8BitsDataInterleave_1_2x8); \
+ VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni8BitsDataInterleave_2_2x8); \
+ VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+}
+PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)
+PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)
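
The _nhwc variants differ from the planar ones mainly in output addressing: the three channels of a pixel are stored contiguously, so pixel column x maps to output column x * 3 (hence coord_out.z = coord_out.z * 3, with the second write offset by 8 or 16 to cover the remaining interleaved values). A small C sketch of that packing, assuming a plain row-major byte image:

#include <stddef.h>
#include <stdint.h>

/* Planar R/G/B rows packed into one interleaved (NHWC) row: pixel p lands at
 * columns 3*p, 3*p+1, 3*p+2, the same mapping behind coord_out.z = x * 3. */
static void pack_nhwc_row(const uint8_t *r, const uint8_t *g, const uint8_t *b,
                          uint8_t *out, size_t width)
{
    for (size_t p = 0; p < width; ++p) {
        out[3 * p + 0] = r[p];
        out[3 * p + 1] = g[p];
        out[3 * p + 2] = b[p];
    }
}
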
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_2.vx
new file mode 100644
index 000000000..8d686ebd6
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_2.vx
@@ -0,0 +1,57 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;
+
+__kernel void pre_process_rgb888_planar_half_U8toU8_nhwc
+ (
+ __read_only image2d_array_t input,
+ __write_only image2d_array_t output,
+ global int *xRatio,
+ global int *yRatio,
+ global int *xOffset,
+ global int *yOffset,
+ float rMean,
+ float gMean,
+ float bMean,
+ float r_scale,
+ int reverse,
+ float g_scale,
+ float b_scale
+ )
+{
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);
+
+ vxc_uchar16 src0, src1, src2;
+
+ VXC_ReadImage2DArray(src0, input, coord_in, 0,
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ coord_in.z ++;
+ VXC_ReadImage2DArray(src1, input, coord_in, 0,
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ coord_in.z ++;
+ VXC_ReadImage2DArray(src2, input, coord_in, 0,
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ int4 coord;
+ coord.xy = coord_in.xy >> 1;
+
+ coord.x = coord.x * 3;
+ coord.z = coord.x + 16;
+
+ vxc_uchar16 dst0, dst1;
+ src0.lo = src0.s02468ace;
+ src0.hi = src1.s02468ace;
+ src1.lo = src2.s02468ace;
+
+ VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),
+ uni8BitsDataInterleave_0_2x8);
+ VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),
+ uni8BitsDataInterleave_1_2x8);
+ VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),
+ uni8BitsDataInterleave_2_2x8);
+
+ VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx
index 107846e09..de9dbdeaf 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx
@@ -10,8 +10,9 @@ _viv_uniform VXC_512Bits uniExtract8Data_2x8;
_viv_uniform float output_scale;
_viv_uniform float output_zp;
+_viv_uniform int4 rgb_order;
-#define RESIZE_BILINEAR_4X1(input, mean, output) \
+#define RESIZE_BILINEAR_4X1(input, scale, mean, output, _coord) \
VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
@@ -41,12 +42,12 @@ _viv_uniform float output_zp;
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
\
- tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \
+ tmp_dst = tmp_dst * scale * output_scale - scale * mean * output_scale + output_zp; \
_viv_asm(CONV, dst0, tmp_dst); \
VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst1, 8); \
- VXC_WriteImage(output, coord_out, dst, \
+ VXC_WriteImage(output, _coord, dst, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \
@@ -55,9 +56,7 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__read_only image2d_array_t input2, \
- __write_only image2d_array_t output0, \
- __write_only image2d_array_t output1, \
- __write_only image2d_array_t output2, \
+ __write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
@@ -65,7 +64,11 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \
float rMean, \
float gMean, \
float bMean, \
- float f32Var \
+ float r_scale, \
+ int reverse, \
+ int height, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int2 ratioXY = (int2)(*xRatio, *yRatio); \
@@ -118,7 +121,8 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \
int4 test02, temp2; \
int4 tt; \
vxc_uchar4 val; \
- int2 coord_out = (int2)(xPos.x, yPos); \
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \
+ coord_out.yzw += rgb_order.xyz; \
\
vxc_uchar8 line1, line2; \
\
@@ -143,16 +147,16 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \
conv_type dst0; \
dst_type dst1; \
copy_type dst; \
- tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \
_viv_asm(CONV, dst0, tmp_dst); \
VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst1, 8); \
- VXC_WriteImage(output0, coord_out, dst, \
+ VXC_WriteImage(output, coord_out.xy, dst, \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- RESIZE_BILINEAR_4X1(input1, gMean, output1) \
- RESIZE_BILINEAR_4X1(input2, bMean, output2) \
+ RESIZE_BILINEAR_4X1(input1, g_scale, gMean, output, coord_out.xz) \
+ RESIZE_BILINEAR_4X1(input2, b_scale, bMean, output, coord_out.xw) \
}
RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8)
RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8)
@@ -163,9 +167,7 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__read_only image2d_array_t input2, \
- __write_only image2d_array_t output0, \
- __write_only image2d_array_t output1, \
- __write_only image2d_array_t output2, \
+ __write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
@@ -173,7 +175,11 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \
float rMean, \
float gMean, \
float bMean, \
- float f32Var \
+ float r_scale, \
+ int reverse, \
+ int height, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int2 ratioXY = (int2)(*xRatio, *yRatio); \
@@ -221,7 +227,8 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \
\
int4 test01, temp1; \
int4 test02, temp2; \
- int2 coord_out = (int2)(xPos.x, yPos); \
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \
+ coord_out.yzw += rgb_order.xyz; \
\
VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniVecShift10); \
@@ -245,12 +252,13 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \
\
int4 dst0; \
write_type dst; \
- tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \
dst0 = convert_int4_rte(tmp_dst); \
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniExtract8Data_2x8); \
\
- VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xy, dst, \
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
@@ -282,12 +290,13 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \
uniExtractBytes); \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
- tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \
+ tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \
dst0 = convert_int4_rte(tmp_dst); \
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniExtract8Data_2x8); \
\
- VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xz, \
+ dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
@@ -319,12 +328,13 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \
uniExtractBytes); \
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
uniConvertIntergetoF32_4x4); \
- tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \
+ tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \
dst0 = convert_int4_rte(tmp_dst); \
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
uniExtract8Data_2x8); \
\
- VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xw, \
+ dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16)
-RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)
\ No newline at end of file
+RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)
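
The scale kernels above implement bilinear resampling entirely in fixed point: xRatio/yRatio carry the source-to-destination ratio in Q15, the integer source coordinate is peeled off with & 0xffff8000 and >> 15, the fraction is reduced to 10 bits, the two horizontal lerps are kept at Q10, the vertical lerp brings the value to Q20, and adding (1 << 19) before a 20-bit shift rounds it back to a byte. A scalar C sketch of the same arithmetic, assuming the uniVecShift10 / uniGetTempVal / uniExtractBytes uniforms implement the plain lerp written out here (that mapping is an assumption) and ignoring the xOffset/yOffset ROI shift and edge clamping:

#include <stdint.h>

static uint8_t bilinear_sample_u8(const uint8_t *src, int stride,
                                  int dst_x, int dst_y, int x_ratio, int y_ratio)
{
    int fx = dst_x * x_ratio + (x_ratio >> 1) - (1 << 14); /* source x, Q15        */
    int sx = fx >> 15;                                     /* integer part (floor) */
    fx = ((fx & 0x7fff) + (1 << 4)) >> 5;                  /* 10-bit fraction      */

    int fy = dst_y * y_ratio + (y_ratio >> 1) - (1 << 14); /* source y, Q15        */
    int sy = fy >> 15;
    fy = ((fy & 0x7fff) + (1 << 4)) >> 5;

    const uint8_t *p0 = src + sy * stride + sx;            /* top-left neighbour   */
    const uint8_t *p1 = p0 + stride;                       /* row below            */

    int top = (p0[0] << 10) + (p0[1] - p0[0]) * fx;        /* lerp in x, Q10       */
    int bot = (p1[0] << 10) + (p1[1] - p1[0]) * fx;
    int val = (top << 10) + (bot - top) * fy;              /* lerp in y, Q20       */
    return (uint8_t)((val + (1 << 19)) >> 20);             /* round, back to u8    */
}
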
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx
index ff55851a6..b308e65cc 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx
@@ -5,6 +5,7 @@ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;
_viv_uniform float output_scale;
_viv_uniform float output_zp;
+_viv_uniform int4 rgb_order;
#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \
__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \
@@ -12,9 +13,7 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__read_only image2d_array_t input2, \
- __write_only image2d_array_t output0, \
- __write_only image2d_array_t output1, \
- __write_only image2d_array_t output2, \
+ __write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
@@ -22,7 +21,11 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \
float rMean, \
float gMean, \
float bMean, \
- float f32Var \
+ float r_scale, \
+ int reverse, \
+ int height, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
@@ -36,8 +39,9 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \
VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
coord.x = coord.z + 8; \
- float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \
- rMean * output_scale - output_zp, output_scale); \
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \
\
half4 paramData_f16; \
copy_type tmp_dst; \
@@ -47,33 +51,38 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \
VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniDataMeanStddevHi_2x8); \
_viv_asm(COPY, tmp_dst, dst0, 16); \
- VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ int4 coord_out = coord; \
+ coord_out.yw = coord_out.ww + rgb_order.xy; \
+ VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, tmp_dst, dst1, 16); \
- VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
- float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \
- gMean * output_scale - output_zp, output_scale); \
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \
_viv_asm(CONV, paramData_f16, paramData1); \
VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniDataMeanStddevLo_2x8); \
VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniDataMeanStddevHi_2x8); \
_viv_asm(COPY, tmp_dst, dst0, 16); \
- VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, tmp_dst, dst1, 16); \
- VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
\
- float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \
- bMean * output_scale - output_zp, output_scale); \
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \
_viv_asm(CONV, paramData_f16, paramData2); \
VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniDataMeanStddevLo_2x8); \
VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniDataMeanStddevHi_2x8); \
_viv_asm(COPY, tmp_dst, dst0, 16); \
- VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ coord_out.w = coord.w + rgb_order.z; \
+ VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
_viv_asm(COPY, tmp_dst, dst1, 16); \
- VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8)
RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8)
@@ -84,9 +93,7 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \
__read_only image2d_array_t input0, \
__read_only image2d_array_t input1, \
__read_only image2d_array_t input2, \
- __write_only image2d_array_t output0, \
- __write_only image2d_array_t output1, \
- __write_only image2d_array_t output2, \
+ __write_only image2d_array_t output, \
global int *xRatio, \
global int *yRatio, \
global int *xOffset, \
@@ -94,7 +101,11 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \
float rMean, \
float gMean, \
float bMean, \
- float f32Var \
+ float r_scale, \
+ int reverse, \
+ int height, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
@@ -107,8 +118,11 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \
VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
- float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \
- rMean * output_scale - output_zp, output_scale); \
+ int4 coord_out = coord; \
+ coord_out.xyw += rgb_order.xyz; \
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \
\
half4 paramData_f16; \
_viv_asm(CONV, paramData_f16, paramData0); \
@@ -117,27 +131,29 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \
uniDataMeanStddevLo_2x8); \
VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
uniDataMeanStddevHi_2x8); \
- VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
- float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \
- gMean * output_scale - output_zp, output_scale); \
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \
_viv_asm(CONV, paramData_f16, paramData1); \
\
VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniDataMeanStddevLo_2x8); \
VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
uniDataMeanStddevHi_2x8); \
- VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
\
- float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \
- bMean * output_scale - output_zp, output_scale); \
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \
_viv_asm(CONV, paramData_f16, paramData2); \
\
VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
uniDataMeanStddevLo_2x8); \
VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
uniDataMeanStddevHi_2x8); \
- VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
}
PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)
PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)
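
The new rgb_order uniform is what lets a single output image replace the former output0/output1/output2 arguments: each channel's rows are written at a per-channel row offset (coord_out.yzw += rgb_order.xyz), and reversing the channel order only changes those offsets. How the host fills rgb_order is not shown in this diff; the sketch below is a hypothetical derivation from the height and reverse kernel arguments, not the actual host code.

/* Hypothetical host-side helper: each plane occupies `height` rows of the
 * packed output, and channel reversal swaps which plane gets which offset. */
static void fill_rgb_order(int rgb_order[3], int height, int reverse)
{
    rgb_order[0] = reverse ? 2 * height : 0;  /* R plane row offset */
    rgb_order[1] = height;                    /* G plane            */
    rgb_order[2] = reverse ? 0 : 2 * height;  /* B plane            */
}
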
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx
index bbfed6e7e..51a97f047 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx
@@ -5,15 +5,14 @@ _viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;
_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;
_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;
_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;
+_viv_uniform int4 rgb_order;
__kernel void pre_process_rgb888_planar_sep_4over3_U8toU8
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__read_only image2d_array_t input2,
- __write_only image2d_array_t output0,
- __write_only image2d_array_t output1,
- __write_only image2d_array_t output2,
+ __write_only image2d_array_t output,
global int *xRatio,
global int *yRatio,
global int *xOffset,
@@ -21,7 +20,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8
float rMean,
float gMean,
float bMean,
- float f32Var
+ float r_scale,
+ int reverse,
+ int height,
+ float g_scale,
+ float b_scale
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
@@ -47,9 +50,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
- VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ int4 coord_r = coord_out;
+ coord_r.yzw += rgb_order.xxx;
+ VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
@@ -65,9 +70,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
- VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ int4 coord_g = coord_out;
+ coord_g.yzw += rgb_order.yyy;
+ VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
@@ -83,9 +90,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);
- VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ int4 coord_b = coord_out;
+ coord_b.yzw += rgb_order.zzz;
+ VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
}
__kernel void pre_process_rgb888_planar_sep_half_U8toU8
@@ -93,9 +102,7 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__read_only image2d_array_t input2,
- __write_only image2d_array_t output0,
- __write_only image2d_array_t output1,
- __write_only image2d_array_t output2,
+ __write_only image2d_array_t output,
global int *xRatio,
global int *yRatio,
global int *xOffset,
@@ -103,7 +110,11 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8
float rMean,
float gMean,
float bMean,
- float f32Var
+ float r_scale,
+ int reverse,
+ int height,
+ float g_scale,
+ float b_scale
)
{
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
@@ -116,7 +127,9 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8
coord_in.zw = coord_in.xy >> 1;
- VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
- VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ int4 coord_rgb = coord_in.zwww;
+ coord_rgb.yzw += rgb_order.xyz;
+ VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_rgb.xz, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));
}
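
The *_half_U8toU8 kernels shrink each plane by a factor of two with simple decimation: the .s02468ace swizzle keeps the even-indexed columns of the vector just read, and halving the write coordinate (coord >> 1) pairs that with every other row. A scalar C sketch of the same reduction, assuming a plain row-major byte image and a dispatch that produces each output pixel exactly once:

#include <stddef.h>
#include <stdint.h>

/* 2:1 decimation: output (x, y) takes source (2x, 2y), no filtering. */
static void downsample_half_u8(const uint8_t *src, size_t src_w, size_t src_h,
                               size_t src_stride, uint8_t *dst, size_t dst_stride)
{
    size_t dst_w = src_w / 2, dst_h = src_h / 2;
    for (size_t y = 0; y < dst_h; ++y)
        for (size_t x = 0; x < dst_w; ++x)
            dst[y * dst_stride + x] = src[(2 * y) * src_stride + (2 * x)];
}
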
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_0.vx
new file mode 100644
index 000000000..a9b792599
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_0.vx
@@ -0,0 +1,342 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniVecShift10;
+_viv_uniform VXC_512Bits uniAddRShift;
+_viv_uniform VXC_512Bits uniGetTempVal;
+_viv_uniform VXC_512Bits uniExtractBytes;
+
+_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;
+
+_viv_uniform float output_scale;
+_viv_uniform float output_zp;
+
+#define RESIZE_BILINEAR_4X1(input, scale, mean) \
+ VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ \
+ tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \
+ _viv_asm(CONV, dst0, tmp_dst);
+
+#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \
+__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __read_only image2d_array_t input2, \
+ __write_only image2d_array_t output, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float r_scale, \
+ int reverse, \
+ float g_scale, \
+ float b_scale \
+ ) \
+{ \
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \
+ \
+ int4 xPos = get_global_id(0); \
+ int yPos = get_global_id(1); \
+ \
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \
+ xPos += (int4)(0, 1, 2, 3); \
+ \
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \
+ int4 sx = fx0 & 0xffff8000; \
+ fx0 -= sx; \
+ sx = sx >> 15; \
+ \
+ vxc_short4 fx; \
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniAddRShift); \
+ \
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \
+ int sy = fy & 0xffff8000; \
+ \
+ fy -= sy; \
+ sy = sy >> 15; \
+ \
+ fy = (fy + (1<< 4)) >> 5; \
+ \
+ vxc_uchar16 line0Y; \
+ vxc_uchar16 line1Y; \
+ int4 coord; \
+ sx = sx + *xOffset; \
+ coord.xyz = sx.xyz; \
+ coord.w = sy + *yOffset; \
+ int2 coord1 = (int2)(sx.w, coord.w); \
+ VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ int4 test01, temp1; \
+ int4 test02, temp2; \
+ int4 tt; \
+ vxc_uchar4 val; \
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \
+ coord_out.x = coord_out.x * 3; \
+ coord_out.z = coord_out.x + 8; \
+ \
+ vxc_uchar8 line1, line2; \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ \
+ vxc_float4 tmp_dst; \
+ vxc_uchar4 u8_dst; \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ \
+ conv_type dst0; \
+ dst_type dst1, dst2; \
+ copy_type data0, data1, dst; \
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \
+ _viv_asm(CONV, dst0, tmp_dst); \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ RESIZE_BILINEAR_4X1(input1, g_scale, gMean) \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ \
+ RESIZE_BILINEAR_4X1(input2, b_scale, bMean) \
+ VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ _viv_asm(COPY, data0, dst1, 16); \
+ _viv_asm(COPY, data1, dst2, 16); \
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni16BitsDataInterleave_0_2x8); \
+ VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni16BitsDataInterleave_1_2x8); \
+ VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+}
+RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8)
+RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8)
+
+#define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \
+__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __read_only image2d_array_t input2, \
+ __write_only image2d_array_t output, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float r_scale, \
+ int reverse, \
+ float g_scale, \
+ float b_scale \
+ ) \
+{ \
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \
+ int4 xPos = get_global_id(0); \
+ int yPos = get_global_id(1); \
+ \
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \
+ xPos += (int4)(0, 1, 2, 3); \
+ \
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \
+ int4 sx = fx0 & 0xffff8000; \
+ fx0 -= sx; \
+ sx = sx >> 15; \
+ \
+ vxc_short4 fx; \
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \
+ \
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \
+ int sy = fy & 0xffff8000; \
+ \
+ fy -= sy; \
+ sy = sy >> 15; \
+ fy = (fy + (1<< 4)) >> 5; \
+ \
+ vxc_uchar16 line0Y; \
+ vxc_uchar16 line1Y; \
+ int4 coord; \
+ sx = sx + *xOffset; \
+ coord.xyz = sx.xyz; \
+ coord.w = sy + *yOffset; \
+ int2 coord1 = (int2)(sx.w, coord.w); \
+ VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ int4 test01, temp1; \
+ int4 test02, temp2; \
+ int2 coord_out = (int2)(xPos.x, yPos); \
+ coord_out.x = coord_out.x * 3; \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ \
+ vxc_float4 tmp_dst; \
+ vxc_uchar4 u8_dst; \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ \
+ int4 dst0; \
+ write_type dst1, dst; \
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \
+ dst0 = convert_int4_rte(tmp_dst); \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ \
+ VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \
+ dst0 = convert_int4_rte(tmp_dst); \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ \
+ VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
+ \
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp1 = temp1 + test01; \
+ \
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniVecShift10); \
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniGetTempVal); \
+ temp2 = temp2 + test02; \
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniExtractBytes); \
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvertIntergetoF32_4x4); \
+ tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \
+ dst0 = convert_int4_rte(tmp_dst); \
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \
+ uniExtract8Data_2x8); \
+ VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni8BitsDataInterleave_0_2x8); \
+ VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \
+ uni16BitsDataInterleave_1_2x8); \
+ VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \
+}
+RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16)
+RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_1.vx
new file mode 100644
index 000000000..1ae298c22
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_1.vx
@@ -0,0 +1,148 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;
+
+_viv_uniform float output_scale;
+_viv_uniform float output_zp;
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;
+
+#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \
+__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __read_only image2d_array_t input2, \
+ __write_only image2d_array_t output, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float r_scale, \
+ int reverse, \
+ float g_scale, \
+ float b_scale \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+ coord.xy += (int2)(*xOffset, *yOffset); \
+ vxc_uchar16 src0, src1, src2; \
+ dst_type dst0, dst1; \
+ \
+ VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ int4 coord_out = coord; \
+ coord_out.z = coord_out.z * 3; \
+ coord_out.x = coord_out.z + 8; \
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\
+ rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \
+ \
+ half4 paramData_f16; \
+ copy_type data0, data1, data2, dst; \
+ _viv_asm(CONV, paramData_f16, paramData0); \
+ VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevLo_2x8); \
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\
+ gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData1); \
+ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevLo_2x8); \
+ _viv_asm(COPY, data0, dst0, 16); \
+ \
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\
+ bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData2); \
+ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
+ uniDataMeanStddevLo_2x8); \
+ _viv_asm(COPY, data1, dst0, 16); \
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni16BitsDataInterleave_0_2x8); \
+ VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni16BitsDataInterleave_1_2x8); \
+ VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
+}
+RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8)
+RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8)
+
+#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \
+__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \
+ ( \
+ __read_only image2d_array_t input0, \
+ __read_only image2d_array_t input1, \
+ __read_only image2d_array_t input2, \
+ __write_only image2d_array_t output, \
+ global int *xRatio, \
+ global int *yRatio, \
+ global int *xOffset, \
+ global int *yOffset, \
+ float rMean, \
+ float gMean, \
+ float bMean, \
+ float r_scale, \
+ int reverse, \
+ float g_scale, \
+ float b_scale \
+ ) \
+{ \
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
+ \
+ coord.xy += (int2) (*xOffset, *yOffset); \
+ vxc_uchar16 src0, src1, src2; \
+ write_type dst0, dst1, dst2, dst3; \
+ \
+ VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ \
+ int4 coord_out = coord; \
+ coord_out.z = coord_out.z * 3; \
+ coord_out.x = coord_out.z + 16; \
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\
+ rMean * r_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \
+ \
+ half4 paramData_f16; \
+ _viv_asm(CONV, paramData_f16, paramData0); \
+ \
+ VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevLo_2x8); \
+ \
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\
+ gMean * g_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData1); \
+ \
+ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevLo_2x8); \
+ \
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\
+ bMean * b_scale * output_scale - output_zp, \
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \
+ _viv_asm(CONV, paramData_f16, paramData2); \
+ \
+ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uniDataMeanStddevLo_2x8); \
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni8BitsDataInterleave_0_2x8); \
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \
+ uni8BitsDataInterleave_1_2x8); \
+ VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \
+ uni8BitsDataInterleave_2_2x8); \
+ VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
+ VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
+}
+PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)
+PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_2.vx
new file mode 100644
index 000000000..d43f82587
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_2.vx
@@ -0,0 +1,54 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;
+
+__kernel void pre_process_rgb888_planar_sep_half_U8toU8_nhwc
+ (
+ __read_only image2d_array_t input0,
+ __read_only image2d_array_t input1,
+ __read_only image2d_array_t input2,
+ __write_only image2d_array_t output,
+ global int *xRatio,
+ global int *yRatio,
+ global int *xOffset,
+ global int *yOffset,
+ float rMean,
+ float gMean,
+ float bMean,
+ float r_scale,
+ int reverse,
+ float g_scale,
+ float b_scale
+ )
+{
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
+
+ vxc_uchar16 src0, src1, src2;
+
+ VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ int4 coord;
+ coord.xy = coord_in.xy >> 1;
+
+ coord.x = coord.x * 3;
+ coord.z = coord.x + 16;
+
+ vxc_uchar16 dst0, dst1;
+ src0.lo = src0.s02468ace;
+ src0.hi = src1.s02468ace;
+ src1.lo = src2.s02468ace;
+
+ VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),
+ uni8BitsDataInterleave_0_2x8);
+ VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),
+ uni8BitsDataInterleave_1_2x8);
+ VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),
+ uni8BitsDataInterleave_2_2x8);
+
+ VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx
index c200019c3..5a343e708 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx
@@ -17,6 +17,8 @@ _viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4;
_viv_uniform VXC_512Bits uniExtractBtoF32_part2_4x4;
_viv_uniform VXC_512Bits uniExtractBtoF32_part3_4x4;
_viv_uniform VXC_512Bits uniExtract8Data_2x8;
+_viv_uniform float4 param_data;
+_viv_uniform float4 rgb_scale;
#define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \
__kernel void pre_process_rgb_copy_U8to##dst_name \
@@ -30,9 +32,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \
float rMean, \
float gMean, \
float bMean, \
- float f32Var, \
+ float r_scale, \
int reverse_channel, \
- int trans \
+ int trans, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \
@@ -46,10 +50,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
- \
- f32Var *= outputScale; \
- float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \
- bMean * f32Var - outputZP, f32Var); \
\
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \
float4 tmp0, tmp1; \
@@ -57,8 +57,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \
\
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \
- tmp0 = tmp0 * paramData.w - paramData.x; \
- tmp1 = tmp1 * paramData.w - paramData.x; \
+ tmp0 = tmp0 * rgb_scale.x - param_data.x; \
+ tmp1 = tmp1 * rgb_scale.x - param_data.x; \
_viv_asm(CONV_RTE, result0, tmp0); \
_viv_asm(CONV_RTE, result1, tmp1); \
VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
@@ -68,8 +68,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \
coord_out.z = 1; \
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \
- tmp0 = tmp0 * paramData.w - paramData.y; \
- tmp1 = tmp1 * paramData.w - paramData.y; \
+ tmp0 = tmp0 * rgb_scale.y - param_data.y; \
+ tmp1 = tmp1 * rgb_scale.y - param_data.y; \
_viv_asm(CONV_RTE, result0, tmp0); \
_viv_asm(CONV_RTE, result1, tmp1); \
VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
@@ -79,8 +79,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \
coord_out.z = b_order; \
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \
- tmp0 = tmp0 * paramData.w - paramData.z; \
- tmp1 = tmp1 * paramData.w - paramData.z; \
+ tmp0 = tmp0 * rgb_scale.z - param_data.z; \
+ tmp1 = tmp1 * rgb_scale.z - param_data.z; \
_viv_asm(CONV_RTE, result0, tmp0); \
_viv_asm(CONV_RTE, result1, tmp1); \
VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
@@ -102,9 +102,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \
float rMean, \
float gMean, \
float bMean, \
- float f32Var, \
+ float r_scale, \
int reverse_channel, \
- int trans \
+ int trans, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \
@@ -119,10 +121,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \
coord.x += 16; \
VXC_ReadImage(src2, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
- \
- f32Var *= outputScale; \
- float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \
- bMean * f32Var - outputZP, f32Var); \
\
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \
float4 tmp0, tmp1; \
@@ -130,15 +128,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \
\
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \
- tmp0 = tmp0 * paramData.w - paramData.x; \
- tmp1 = tmp1 * paramData.w - paramData.x; \
+ tmp0 = tmp0 * rgb_scale.x - param_data.x; \
+ tmp1 = tmp1 * rgb_scale.x - param_data.x; \
result0 = convert_int4_rte(tmp0); \
result1 = convert_int4_rte(tmp1); \
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part2_4x4); \
VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part3_4x4); \
- tmp0 = tmp0 * paramData.w - paramData.x; \
- tmp1 = tmp1 * paramData.w - paramData.x; \
+ tmp0 = tmp0 * rgb_scale.x - param_data.x; \
+ tmp1 = tmp1 * rgb_scale.x - param_data.x; \
result0 = convert_int4_rte(tmp0); \
result1 = convert_int4_rte(tmp1); \
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
@@ -147,15 +145,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \
coord_out.z = 1; \
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \
- tmp0 = tmp0 * paramData.w - paramData.y; \
- tmp1 = tmp1 * paramData.w - paramData.y; \
+ tmp0 = tmp0 * rgb_scale.y - param_data.y; \
+ tmp1 = tmp1 * rgb_scale.y - param_data.y; \
result0 = convert_int4_rte(tmp0); \
result1 = convert_int4_rte(tmp1); \
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part2_4x4); \
VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part3_4x4); \
- tmp0 = tmp0 * paramData.w - paramData.y; \
- tmp1 = tmp1 * paramData.w - paramData.y; \
+ tmp0 = tmp0 * rgb_scale.y - param_data.y; \
+ tmp1 = tmp1 * rgb_scale.y - param_data.y; \
result0 = convert_int4_rte(tmp0); \
result1 = convert_int4_rte(tmp1); \
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
@@ -164,15 +162,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \
coord_out.z = b_order; \
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \
- tmp0 = tmp0 * paramData.w - paramData.z; \
- tmp1 = tmp1 * paramData.w - paramData.z; \
+ tmp0 = tmp0 * rgb_scale.z - param_data.z; \
+ tmp1 = tmp1 * rgb_scale.z - param_data.z; \
result0 = convert_int4_rte(tmp0); \
result1 = convert_int4_rte(tmp1); \
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part2_4x4); \
VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part3_4x4); \
- tmp0 = tmp0 * paramData.w - paramData.z; \
- tmp1 = tmp1 * paramData.w - paramData.z; \
+ tmp0 = tmp0 * rgb_scale.z - param_data.z; \
+ tmp1 = tmp1 * rgb_scale.z - param_data.z; \
result0 = convert_int4_rte(tmp0); \
result1 = convert_int4_rte(tmp1); \
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
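
Note on the pre_process_rgb_copy hunks above: the kernels stop deriving the affine parameters from f32Var at run time and instead read the new rgb_scale / param_data uniforms. Below is a hedged sketch of how the host side would fill them, inferred from the removed in-kernel math; the real setup lives in the op's C source, which this hunk does not show, and output_scale / output_zp stand in for the kernel's outputScale / outputZP.

    /* Sketch of the host-side uniform setup this kernel now expects
     * (inferred from the removed lines; illustrative only). */
    static void fill_rgb_copy_uniforms(float rgb_scale[4], float param_data[4],
                                       float rMean, float gMean, float bMean,
                                       float r_scale, float g_scale, float b_scale,
                                       float output_scale, float output_zp)
    {
        rgb_scale[0]  = r_scale * output_scale;
        rgb_scale[1]  = g_scale * output_scale;
        rgb_scale[2]  = b_scale * output_scale;
        rgb_scale[3]  = 0.0f;
        param_data[0] = rMean * r_scale * output_scale - output_zp;
        param_data[1] = gMean * g_scale * output_scale - output_zp;
        param_data[2] = bMean * b_scale * output_scale - output_zp;
        param_data[3] = 0.0f;
    }
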
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx
index 25f981a11..3a91a3559 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx
@@ -49,9 +49,11 @@ __kernel void pre_process_yuv420_copy_##name \
float rMean, \
float gMean, \
float bMean, \
- float var, \
+ float r_scale, \
int reverse_channel, \
- int trans \
+ int trans, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \
@@ -110,17 +112,23 @@ __kernel void pre_process_yuv420_copy_##name \
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
\
- var *= output_scale; \
- float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \
- rMean * var - output_zp, var); \
+ float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp,\
+ gMean * g_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \
half4 paramData_f16; \
_viv_asm(CONV, paramData_f16, paramData); \
\
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \
+ \
+ paramData.w = g_scale * output_scale; \
+ _viv_asm(CONV, paramData_f16, paramData); \
\
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \
+ \
+ paramData.w = r_scale * output_scale; \
+ _viv_asm(CONV, paramData_f16, paramData); \
\
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \
@@ -150,9 +158,11 @@ __kernel void pre_process_yuv420_copy_##name \
float rMean, \
float gMean, \
float bMean, \
- float var, \
+ float r_scale, \
int reverse_channel, \
- int trans \
+ int trans, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \
@@ -202,18 +212,22 @@ __kernel void pre_process_yuv420_copy_##name \
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \
\
- var *= output_scale; \
- float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \
- rMean * var - output_zp, var); \
+ float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp, \
+ gMean * g_scale * output_scale - output_zp, \
+ rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \
half4 paramData_f16; \
_viv_asm(CONV, paramData_f16, paramData); \
\
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \
VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \
\
+ paramData.w = g_scale * output_scale; \
+ _viv_asm(CONV, paramData_f16, paramData); \
VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \
VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \
\
+ paramData.w = r_scale * output_scale; \
+ _viv_asm(CONV, paramData_f16, paramData); \
VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \
VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \
\
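
In pre_process_yuv420_copy the shared paramData.w (previously var * output_scale) is now rewritten with g_scale * output_scale and r_scale * output_scale before the G and R conversions, so each plane carries its own scale. Assuming the uniQuant* dot-product tables compute pixel * paramData.w minus the per-channel paramData component, as the RGB kernels do explicitly, the effective per-channel mapping is the sketch below (illustrative C, not part of the patch).

    /* Effective per-channel mapping under the assumption stated above. */
    static float quantize_channel(float pixel, float mean, float channel_scale,
                                  float output_scale, float output_zp)
    {
        return (pixel - mean) * channel_scale * output_scale + output_zp;
    }
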
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx
index 40db13719..99a64459e 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx
@@ -48,9 +48,11 @@ __kernel void pre_process_yuv420_scale_##name \
float rMean, \
float gMean, \
float bMean, \
- float var, \
+ float r_scale, \
int reverse_channel, \
- int trans \
+ int trans, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int4 gidx = get_global_id(0); \
@@ -199,7 +201,7 @@ __kernel void pre_process_yuv420_scale_##name \
float4 tmpDst; \
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
- tmpDst = (tmpDst - bMean) * var; \
+ tmpDst = (tmpDst - bMean) * b_scale; \
dstPos.z = bOrder; \
result = convert_int4_rte(tmpDst * output_scale + output_zp); \
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
@@ -213,7 +215,7 @@ __kernel void pre_process_yuv420_scale_##name \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
- tmpDst = (tmpDst - gMean) * var; \
+ tmpDst = (tmpDst - gMean) * g_scale; \
dstPos.z = 1; \
result = convert_int4_rte(tmpDst * output_scale + output_zp); \
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
@@ -227,7 +229,7 @@ __kernel void pre_process_yuv420_scale_##name \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
- tmpDst = (tmpDst - rMean) * var; \
+ tmpDst = (tmpDst - rMean) * r_scale; \
dstPos.z = rOrder; \
result = convert_int4_rte(tmpDst * output_scale + output_zp); \
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx
index 7bfa6d112..676a8485c 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx
@@ -48,9 +48,11 @@ __kernel void pre_process_yuv420_scale_##name \
float rMean, \
float gMean, \
float bMean, \
- float var, \
+ float r_scale, \
int reverse_channel, \
- int trans \
+ int trans, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int4 gidx = get_global_id(0); \
@@ -201,7 +203,7 @@ __kernel void pre_process_yuv420_scale_##name \
float4 tmpDst; \
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
- tmpDst = (tmpDst - bMean) * var; \
+ tmpDst = (tmpDst - bMean) * b_scale; \
dstPos.z = bOrder; \
tmpDst = tmpDst * output_scale + output_zp; \
_viv_asm(CONV_RTE, tmpVal, tmpDst); \
@@ -217,7 +219,7 @@ __kernel void pre_process_yuv420_scale_##name \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
- tmpDst = (tmpDst - gMean) * var; \
+ tmpDst = (tmpDst - gMean) * g_scale; \
dstPos.z = 1; \
tmpDst = tmpDst * output_scale + output_zp; \
_viv_asm(CONV_RTE, tmpVal, tmpDst); \
@@ -233,7 +235,7 @@ __kernel void pre_process_yuv420_scale_##name \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
- tmpDst = (tmpDst - rMean) * var; \
+ tmpDst = (tmpDst - rMean) * r_scale; \
dstPos.z = rOrder; \
tmpDst = tmpDst * output_scale + output_zp; \
_viv_asm(CONV_RTE, tmpVal, tmpDst); \
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx
index eed071587..0006e4a71 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx
@@ -3,7 +3,9 @@
_viv_uniform int bOrder;
_viv_uniform int rOrder;
-_viv_uniform float outputScaleVar;
+_viv_uniform float outputScaleVar_b;
+_viv_uniform float outputScaleVar_g;
+_viv_uniform float outputScaleVar_r;
_viv_uniform float bMeanScaleVarZp;
_viv_uniform float gMeanScaleVarZp;
_viv_uniform float rMeanScaleVarZp;
@@ -27,10 +29,12 @@ __kernel void pre_process_yuv422_copy_##name \
float rMean, \
float gMean, \
float bMean, \
- float var, \
+ float r_scale, \
int reverse_channel, \
int trans, \
- int yuv422_type \
+ int yuv422_type, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int gidx = get_global_id(0); \
@@ -60,21 +64,21 @@ __kernel void pre_process_yuv422_copy_##name \
dst_type dst0; \
save_type dst; \
int4 dstPos = (int4)(gidx, gidy, 0, 0); \
- tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \
+ tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstB); \
dstPos.z = bOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \
+ tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstG); \
dstPos.z = 1; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \
+ tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstR); \
dstPos.z = rOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx
index 78546d991..9fb80e504 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx
@@ -3,7 +3,10 @@
_viv_uniform int bOrder;
_viv_uniform int rOrder;
-_viv_uniform float outputScaleVar;
+_viv_uniform float outputScaleVar_b;
+_viv_uniform float outputScaleVar_g;
+_viv_uniform float outputScaleVar_r;
+
_viv_uniform float bMeanScaleVarZp;
_viv_uniform float gMeanScaleVarZp;
_viv_uniform float rMeanScaleVarZp;
@@ -33,10 +36,12 @@ __kernel void pre_process_yuv422_scale_##name \
float rMean, \
float gMean, \
float bMean, \
- float var, \
+ float r_scale, \
int reverse_channel, \
int trans, \
- int yuv422_type \
+ int yuv422_type, \
+ float g_scale, \
+ float b_scale \
) \
{ \
int4 gidx = get_global_id(0); \
@@ -108,21 +113,21 @@ __kernel void pre_process_yuv422_scale_##name \
dst_type dst0; \
save_type dst; \
int4 dstPos = (int4)(gidx.x, gidy, 0, 0); \
- tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \
+ tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstB); \
dstPos.z = bOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \
+ tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstG); \
dstPos.z = 1; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
_viv_asm(COPY, dst, dst0, copy_bytes); \
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
\
- tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \
+ tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \
_viv_asm(CONV_RTE, result, tmpDstR); \
dstPos.z = rOrder; \
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx
index 05f9973c3..3a6a3c50f 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx
@@ -46,9 +46,11 @@ __kernel void pre_process_yuv444_copy_U8toU8(
float rMean,
float gMean,
float bMean,
- float var,
+ float r_scale,
int reverse_channel,
- int trans
+ int trans,
+ float g_scale,
+ float b_scale
)
{
int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));
@@ -107,18 +109,23 @@ __kernel void pre_process_yuv444_copy_U8toU8(
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
- var *= outputScale;
- float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\
- rMean * var - zp, var);
+ float4 paramData = (float4)(bMean * b_scale * outputScale - zp, gMean * g_scale * outputScale - zp,\
+ rMean * r_scale * outputScale - zp, b_scale * outputScale);
half4 paramData_f16;
_viv_asm(CONV, paramData_f16, paramData);
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);
+ paramData.w = g_scale * outputScale;
+ _viv_asm(CONV, paramData_f16, paramData);
+
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);
+ paramData.w = r_scale * outputScale;
+ _viv_asm(CONV, paramData_f16, paramData);
+
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);
@@ -142,9 +149,11 @@ __kernel void pre_process_yuv444_copy_U8toF16(
float rMean,
float gMean,
float bMean,
- float var,
+ float r_scale,
int reverse_channel,
- int trans
+ int trans,
+ float g_scale,
+ float b_scale
)
{
int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));
@@ -204,17 +213,23 @@ __kernel void pre_process_yuv444_copy_U8toF16(
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);
- float4 paramData = (float4)(bMean * var, gMean * var,\
- rMean * var, var);
+ float4 paramData = (float4)(bMean * b_scale * outputScale, gMean * g_scale * outputScale,\
+ rMean * r_scale * outputScale, b_scale * outputScale);
half4 paramData_f16;
_viv_asm(CONV, paramData_f16, paramData);
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);
VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);
+ paramData.w = g_scale * outputScale;
+ _viv_asm(CONV, paramData_f16, paramData);
+
VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);
VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);
+ paramData.w = r_scale * outputScale;
+ _viv_asm(CONV, paramData_f16, paramData);
+
VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);
VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx
index a195750c4..9b4a418e2 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx
@@ -39,7 +39,8 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \
__read_only image2d_t y_img, __read_only image2d_t u_img, \
__read_only image2d_t v_img, __write_only image2d_array_t output, \
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \
- float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \
+ float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans, \
+ float g_scale, float b_scale) \
{ \
int4 gidx = get_global_id(0); \
int gidy = get_global_id(1); \
@@ -151,7 +152,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \
float4 tmpDst; \
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
- tmpDst = (tmpDst - bMean) * var; \
+ tmpDst = (tmpDst - bMean) * b_scale; \
dstPos.z = bOrder; \
result = convert_int4_rte(tmpDst * outputScale + zp); \
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \
@@ -165,7 +166,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
- tmpDst = (tmpDst - gMean) * var; \
+ tmpDst = (tmpDst - gMean) * g_scale; \
dstPos.z = 1; \
result = convert_int4_rte(tmpDst * outputScale + zp); \
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \
@@ -179,7 +180,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \
temp2 = fx * tmpData0 + tmpData1; \
result = fy * temp2 + (temp1 << 10); \
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \
- tmpDst = (tmpDst - rMean) * var; \
+ tmpDst = (tmpDst - rMean) * r_scale; \
dstPos.z = rOrder; \
result = convert_int4_rte(tmpDst * outputScale + zp); \
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx
index c5e706d9a..99325d87d 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx
@@ -37,7 +37,8 @@ __kernel void pre_process_yuv444_scale_U8toF16(
__read_only image2d_t y_img, __read_only image2d_t u_img,
__read_only image2d_t v_img, __write_only image2d_array_t output,
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,
- float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)
+ float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans,
+ float g_scale, float b_scale)
{
int4 gidx = get_global_id(0);
int gidy = get_global_id(1);
@@ -157,7 +158,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(
float4 tmpDst;
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
- tmpDst = (tmpDst - bMean) * var;
+ tmpDst = (tmpDst - bMean) * b_scale;
dstPos.z = bOrder;
_viv_asm(CONV, hDst, tmpDst);
VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
@@ -172,7 +173,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(
temp2 = fx * tmpData0 + tmpData1;
result = fy * temp2 + (temp1 << 10);
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
- tmpDst = (tmpDst - gMean) * var;
+ tmpDst = (tmpDst - gMean) * g_scale;
dstPos.z = 1;
_viv_asm(CONV, hDst, tmpDst);
VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
@@ -187,7 +188,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(
temp2 = fx * tmpData0 + tmpData1;
result = fy * temp2 + (temp1 << 10);
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);
- tmpDst = (tmpDst - rMean) * var;
+ tmpDst = (tmpDst - rMean) * r_scale;
dstPos.z = rOrder;
_viv_asm(CONV, hDst, tmpDst);
VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx
index 80840646b..750eadaf1 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx
@@ -2,7 +2,6 @@
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
_viv_uniform VXC_512Bits uniFp16toFp32_4x4;
-_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
_viv_uniform float scale_x;
_viv_uniform int out_height;
@@ -63,8 +62,10 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN
_viv_asm(COPY, src_half, src, 16);
- VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);
- VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);
+ VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniConvertFp2FP32_left_4x4);
+ VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniConvertFp2FP32_right_4x4);
right4 -= left4;
float4 dst4 = right4 * x_lerp + left4;
@@ -129,8 +130,10 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN
_viv_asm(COPY, src_half, src, 16);
- VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);
- VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);
+ VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniConvertFp2FP32_left_4x4);
+ VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),
+ uniConvertFp2FP32_right_4x4);
right4 -= left4;
float4 dst4 = right4 * x_lerp + left4;
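
The resize_1d_bilinear fix routes the DP4x4 through src_half, the fp16 view produced by _viv_asm(COPY, src_half, src, 16), rather than the raw vxc_short container, so the lanes are widened as half-precision values instead of being converted from their integer bit pattern. A scalar C sketch of that distinction (normal fp16 values only; the helper is illustrative, not a driver API):

    #include <stdint.h>
    #include <string.h>
    /* Decode an IEEE fp16 bit pattern the way the COPY + uniConvertFp2FP32_*
     * path does, versus converting the raw integer lane. */
    static float fp16_bits_to_float(uint16_t h)
    {
        uint32_t sign = (uint32_t)(h >> 15) << 31;
        uint32_t exp  = ((uint32_t)((h >> 10) & 0x1F) - 15 + 127) << 23;
        uint32_t man  = (uint32_t)(h & 0x3FF) << 13;
        uint32_t bits = sign | exp | man;
        float out;
        memcpy(&out, &bits, sizeof out);
        return out;   /* 0x3C00 -> 1.0f; (float)0x3C00 would give 15360.0f */
    }
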
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_fp.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_fp.vx
new file mode 100644
index 000000000..a60e9b8e9
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_fp.vx
@@ -0,0 +1,307 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform int update_width;
+_viv_uniform int output_width;
+_viv_uniform int ref_stride;
+_viv_uniform int output_stride;
+
+_viv_uniform int4 coord_stride;
+_viv_uniform int4 coord_stride1;
+_viv_uniform float inout_scale;
+_viv_uniform float output_zp;
+
+_viv_uniform VXC_512Bits uniConvertFp16ToFp32_4x4;
+
+_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
+_viv_uniform VXC_512Bits uniExtractOddData_2x8;
+
+inline void AtomicAdd_float(volatile __global float *source, const float operand)
+{
+ union
+ {
+ unsigned int intVal;
+ float floatVal;
+ } newVal;
+ union
+ {
+ unsigned int intVal;
+ float floatVal;
+ } prevVal;
+ do
+ {
+ prevVal.floatVal = *source;
+ newVal.floatVal = prevVal.floatVal + operand;
+ } while(atomic_cmpxchg((volatile __global unsigned int *)source,
+ prevVal.intVal, newVal.intVal) != prevVal.intVal);
+}
+
+__kernel void scatter_nd_update_update_F16(
+ __read_only image2d_t index,
+ __read_only image2d_t update,
+ image2d_t temp_buf_float,
+ image2d_t link_buffer0,
+ int width, int area, int vol, int val4,
+ int val5, int val6, int val7, int coord_dim)
+{
+ int gidx = get_global_id(0);
+ int gidy = get_global_id(1);
+ Image img1 = create_image_from_image2d(index, 4);
+ Image img2 = create_image_from_image2d(update, 2);
+ Image img3 = create_image_from_image2d(temp_buf_float, 4);
+ __global int* index_ptr = (__global int*)img1.ptr;
+ __global short* update_ptr = (__global short*)img2.ptr;
+ __global float* output_ptr = (__global float*)img3.ptr;
+ half src;
+
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);
+ short tmpData = update_ptr[gidy * update_width + gidx];
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;
+ int loc = idx * output_width + gidx;
+ _viv_asm(COPY, src, tmpData, 4);
+ float data;
+ _viv_asm(CONV, data, src);
+ AtomicAdd_float(output_ptr + loc, data);
+}
+
+__kernel void scatter_nd_update_update_F16_4X(
+ __read_only image2d_t index,
+ __read_only image2d_t update,
+ image2d_t temp_buf_float,
+ image2d_t link_buffer0,
+ int width, int area, int vol, int val4,
+ int val5, int val6, int val7, int coord_dim)
+{
+ int gidx = get_global_id(0);
+ int gidy = get_global_id(1);
+ Image img1 = create_image_from_image2d(index, 4);
+ Image img2 = create_image_from_image2d(update, 2);
+ Image img3 = create_image_from_image2d(temp_buf_float, 4);
+ __global int* index_ptr = (__global int*)img1.ptr;
+ __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr;
+ __global float* output_ptr = (__global float*)img3.ptr;
+ vxc_half4 src;
+
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);
+ vxc_short4 tmpData = update_ptr[gidy * update_width + gidx];
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;
+ int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3);
+
+ _viv_asm(COPY, src, tmpData, 8);
+ float4 data;
+ VXC_DP4x4(data, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1),
+ uniConvertFp16ToFp32_4x4);
+ AtomicAdd_float(output_ptr + loc.x, data.x);
+ AtomicAdd_float(output_ptr + loc.y, data.y);
+ AtomicAdd_float(output_ptr + loc.z, data.z);
+ AtomicAdd_float(output_ptr + loc.w, data.w);
+}
+
+__kernel void scatter_nd_update_update_BF16(
+ __read_only image2d_t index,
+ __read_only image2d_t update,
+ image2d_t temp_buf_float,
+ image2d_t link_buffer0,
+ int width, int area, int vol, int val4,
+ int val5, int val6, int val7, int coord_dim)
+{
+ int gidx = get_global_id(0);
+ int gidy = get_global_id(1);
+ Image img1 = create_image_from_image2d(index, 4);
+ Image img2 = create_image_from_image2d(update, 2);
+ Image img3 = create_image_from_image2d(temp_buf_float, 4);
+ __global int* index_ptr = (__global int*)img1.ptr;
+ __global short* update_ptr = (__global short*)img2.ptr;
+ __global float* output_ptr = (__global float*)img3.ptr;
+ float data;
+
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);
+ short tmpData = update_ptr[gidy * update_width + gidx];
+ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
+ vxc_short8 src0, src1;
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;
+ int loc = idx * output_width + gidx;
+ _viv_asm(COPY, src0, tmpData, 4);
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
+ uniConvBF16toF32_Part0_2x8);
+ _viv_asm(COPY, data, src1, 4);
+ AtomicAdd_float(output_ptr + loc, data);
+}
+
+__kernel void scatter_nd_update_update_BF16_4X(
+ __read_only image2d_t index,
+ __read_only image2d_t update,
+ image2d_t temp_buf_float,
+ image2d_t link_buffer0,
+ int width, int area, int vol, int val4,
+ int val5, int val6, int val7, int coord_dim)
+{
+ int gidx = get_global_id(0);
+ int gidy = get_global_id(1);
+ Image img1 = create_image_from_image2d(index, 4);
+ Image img2 = create_image_from_image2d(update, 2);
+ Image img3 = create_image_from_image2d(temp_buf_float, 4);
+ __global int* index_ptr = (__global int*)img1.ptr;
+ __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr;
+ __global float* output_ptr = (__global float*)img3.ptr;
+
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);
+ vxc_short4 tmpData = update_ptr[gidy * update_width + gidx];
+ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
+ vxc_short8 src0, src1;
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;
+ int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3);
+
+ _viv_asm(COPY, src0, tmpData, 8);
+ float4 data;
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
+ uniConvBF16toF32_Part0_2x8);
+ _viv_asm(COPY, data, src1, 16);
+ AtomicAdd_float(output_ptr + loc.x, data.x);
+ AtomicAdd_float(output_ptr + loc.y, data.y);
+ AtomicAdd_float(output_ptr + loc.z, data.z);
+ AtomicAdd_float(output_ptr + loc.w, data.w);
+}
+
+#define SCATTER_ND_UPDATE_REF_FP16(type0, type1, ptr_type) \
+__kernel void scatter_nd_update_ref_##type0##to##type1( \
+ __read_only image2d_t index, \
+ __read_only image2d_t update, \
+ __read_only image2d_t temp_buf_int, \
+ image2d_t temp_ref, \
+ image2d_t link_buffer0, \
+ image2d_t link_buffer1, \
+ int width, int area, int vol, int val4, \
+ int val5, int val6, int val7, int coord_dim) \
+{ \
+ int gidx = get_global_id(0); \
+ int gidy = get_global_id(1); \
+ Image img1 = create_image_from_image2d(index, 4); \
+ Image img2 = create_image_from_image2d(temp_buf_int, 4); \
+ Image img3 = create_image_from_image2d(temp_ref, 2); \
+ __global int* index_ptr = (__global int*)img1.ptr; \
+ __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \
+ __global short* ref_ptr = (__global short*)img3.ptr; \
+ \
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \
+ int loc = idx * output_stride + gidx; \
+ float4 tmpData; \
+ tmpData.x = convert_float(acc_ptr[loc]) * inout_scale + output_zp; \
+ half4 data; \
+ short tmpDst; \
+ _viv_asm(CONV, data, tmpData); \
+ _viv_asm(COPY, tmpDst, data, 4); \
+ ref_ptr[loc] = tmpDst; \
+}
+SCATTER_ND_UPDATE_REF_FP16(I32, F16, int)
+SCATTER_ND_UPDATE_REF_FP16(F32, F16, float)
+
+#define SCATTER_ND_UPDATE_REF_FP16_4X(type0, type1, ptr_type) \
+__kernel void scatter_nd_update_ref_##type0##to##type1##_4X( \
+ __read_only image2d_t index, \
+ __read_only image2d_t update, \
+ __read_only image2d_t temp_buf_int, \
+ image2d_t temp_ref, \
+ image2d_t link_buffer0, \
+ image2d_t link_buffer1, \
+ int width, int area, int vol, int val4, \
+ int val5, int val6, int val7, int coord_dim) \
+{ \
+ int gidx = get_global_id(0); \
+ int gidy = get_global_id(1); \
+ Image img1 = create_image_from_image2d(index, 4); \
+ Image img2 = create_image_from_image2d(temp_buf_int, 4); \
+ Image img3 = create_image_from_image2d(temp_ref, 2); \
+ __global int* index_ptr = (__global int*)img1.ptr; \
+ __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \
+ __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr; \
+ \
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \
+ float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \
+ int loc = idx * output_stride + gidx; \
+ float4 tmpVal = tmpData * inout_scale + output_zp; \
+ half4 data; \
+ vxc_short8 tmpDst; \
+ _viv_asm(CONV, data, tmpVal); \
+ _viv_asm(COPY, tmpDst, data, 16); \
+ ref_ptr[loc] = tmpDst.s0246; \
+}
+SCATTER_ND_UPDATE_REF_FP16_4X(I32, F16, int)
+SCATTER_ND_UPDATE_REF_FP16_4X(F32, F16, float)
+
+__kernel void scatter_nd_update_ref_F32toBF16(
+ __read_only image2d_t index,
+ __read_only image2d_t update,
+ __read_only image2d_t temp_buf_int,
+ image2d_t temp_ref,
+ image2d_t link_buffer0,
+ image2d_t link_buffer1,
+ int width, int area, int vol, int val4,
+ int val5, int val6, int val7, int coord_dim)
+{
+ int gidx = get_global_id(0);
+ int gidy = get_global_id(1);
+ Image img1 = create_image_from_image2d(index, 4);
+ Image img2 = create_image_from_image2d(temp_buf_int, 4);
+ Image img3 = create_image_from_image2d(temp_ref, 2);
+ __global int* index_ptr = (__global int*)img1.ptr;
+ __global float* acc_ptr = (__global float*)img2.ptr;
+ __global short* ref_ptr = (__global short*)img3.ptr;
+
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;
+ int loc = idx * output_stride + gidx;
+ float tmpData;
+ tmpData = acc_ptr[loc];
+ vxc_ushort8 src0, src2;
+ _viv_asm(COPY, src0, tmpData, 4);
+ VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
+
+ ref_ptr[loc] = src2.x;
+}
+
+__kernel void scatter_nd_update_ref_F32toBF16_4X(
+ __read_only image2d_t index,
+ __read_only image2d_t update,
+ __read_only image2d_t temp_buf_int,
+ image2d_t temp_ref,
+ image2d_t link_buffer0,
+ image2d_t link_buffer1,
+ int width, int area, int vol, int val4,
+ int val5, int val6, int val7, int coord_dim)
+{
+ int gidx = get_global_id(0);
+ int gidy = get_global_id(1);
+ Image img1 = create_image_from_image2d(index, 4);
+ Image img2 = create_image_from_image2d(temp_buf_int, 4);
+ Image img3 = create_image_from_image2d(temp_ref, 2);
+ __global int* index_ptr = (__global int*)img1.ptr;
+ __global float* acc_ptr = (__global float*)img2.ptr;
+ __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr;
+
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;
+ float4 tmpData = vload4(gidx, acc_ptr + idx * ref_stride);
+ int loc = idx * output_stride + gidx;
+ vxc_short8 src0, src2;
+ _viv_asm(COPY, src0, tmpData, 16);
+ VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
+ ref_ptr[loc] = src2.s0123;
+}
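
In scatter_nd_update_fp.vx the F16/BF16 update kernels accumulate into a float32 scratch image through AtomicAdd_float, a compare-and-swap retry loop over the 32-bit bit pattern, and the ref kernels later rescale that accumulator into the output format. A CPU-side C11 analogue of the retry loop, to spell out the semantics (a sketch only; the device code uses atomic_cmpxchg on __global memory):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <string.h>
    /* Retry until no other thread has modified the slot between the read
     * and the compare-exchange. */
    static void atomic_add_float(_Atomic uint32_t *slot, float operand)
    {
        uint32_t prev, next;
        float fprev, fnext;
        do {
            prev = atomic_load(slot);
            memcpy(&fprev, &prev, sizeof fprev);
            fnext = fprev + operand;
            memcpy(&next, &fnext, sizeof fnext);
        } while (!atomic_compare_exchange_weak(slot, &prev, next));
    }
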
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx
new file mode 100644
index 000000000..2284f49ce
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx
@@ -0,0 +1,263 @@
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;
+_viv_uniform int update_width;
+_viv_uniform int output_width;
+_viv_uniform int ref_stride;
+_viv_uniform int output_stride;
+_viv_uniform int2 multAndoutZP0;
+
+_viv_uniform int4 coord_stride;
+_viv_uniform int4 coord_stride1;
+
+_viv_uniform float output_zp;
+_viv_uniform int input_zp;
+_viv_uniform float input_scale;
+_viv_uniform float inout_scale;
+_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;
+
+#define SCATTER_RESET(name0, name1, ptr0, ptr1, type0, type1, len0, len1, size0, size1, ptr2, ptr3, len3) \
+__kernel void scatter_nd_update_reset_##name0##to##name1( \
+ __read_only image2d_t input_ref, \
+ image2d_t temp_ref, \
+ image2d_t temp_buf_int, \
+ int length, int res) \
+{ \
+ int gidx = get_global_id(0); \
+ Image img1 = create_image_from_image2d(input_ref, size0); \
+ Image img2 = create_image_from_image2d(temp_ref, size1); \
+ Image img3 = create_image_from_image2d(temp_buf_int, 4); \
+ __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \
+ __global ptr1* output_ptr = (__global ptr1*)img2.ptr; \
+ __global int* tmp_update_ptr = (__global int*)img3.ptr; \
+ ptr0 tmpData = input_ptr[gidx]; \
+ int4 zeros = (int4)(0); \
+ int loc2 = gidx * 8; \
+ type0 src; \
+ type1 tmpDst; \
+ ptr1 dst; \
+ vxc_ushort8 ms0; \
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \
+ _viv_asm(COPY, src, tmpData, len0); \
+ VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ _viv_asm(COPY, dst, tmpDst, len1); \
+ output_ptr[gidx] = dst; \
+ vstore4(zeros, 0, tmp_update_ptr + loc2); \
+ vstore4(zeros, 1, tmp_update_ptr + loc2); \
+ if(gidx < res) \
+ { \
+ __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \
+ __global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \
+ ptr2 tmpData1 = input_ptr1[length + gidx]; \
+ ptr3 dst1; \
+ dst1 ^= dst1; \
+ tmp_update_ptr[length + gidx] = 0; \
+ _viv_asm(COPY, src, tmpData1, 4); \
+ VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniU8MulAndPostShift_0_Lo_2x8); \
+ _viv_asm(COPY, dst1, tmpDst, len3); \
+ output_ptr1[length + gidx] = dst1; \
+ } \
+}
+SCATTER_RESET(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, 8, 8, 1, 1, uchar, uchar, 1)
+SCATTER_RESET(I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, 8, 8, 1, 1, char, char, 1)
+SCATTER_RESET(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, 16, 16, 2, 2, short, short, 2)
+SCATTER_RESET(F16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_half8, 16, 16, 2, 2, short, short, 2)
+SCATTER_RESET(U8, F16, vxc_uchar8, vxc_short8, vxc_uchar8, vxc_half8, 8, 16, 1, 2, uchar, short, 2)
+SCATTER_RESET(I8, F16, vxc_char8, vxc_short8, vxc_char8, vxc_half8, 8, 16, 1, 2, char, short, 2)
+SCATTER_RESET(I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, 16, 8, 2, 1, short, short, 2)
+SCATTER_RESET(F16, U8, vxc_short8, vxc_uchar8, vxc_half8, vxc_uchar8, 16, 8, 2, 1, short, uchar, 1)
+
+__kernel void scatter_nd_update_reset_BF16toBF16(
+ __read_only image2d_t input_ref,
+ image2d_t temp_ref,
+ image2d_t temp_buf_int)
+{
+ int gidx = get_global_id(0);
+ Image img1 = create_image_from_image2d(input_ref, 2);
+ Image img2 = create_image_from_image2d(temp_ref, 2);
+ Image img3 = create_image_from_image2d(temp_buf_int, 4);
+ __global vxc_short8* input_ptr = (__global vxc_short8*)img1.ptr;
+ __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr;
+ __global float* tmp_update_ptr = (__global float*)img3.ptr;
+ vxc_short8 src = input_ptr[gidx];
+ float4 zeros = (float4)(0, 0, 0, 0);
+ int loc2 = gidx * 8;
+ output_ptr[gidx] = src;
+ vstore4(zeros, 0, tmp_update_ptr + loc2);
+ vstore4(zeros, 1, tmp_update_ptr + loc2);
+}
+
+#define SCATTER_ND_UPDATE_QINT(src0_type, data_type, ptr_type, element_size) \
+__kernel void scatter_nd_update_update_##src0_type( \
+ __read_only image2d_t index, \
+ __read_only image2d_t update, \
+ image2d_t temp_buf_int, \
+ image2d_t link_buffer0, \
+ int width, int area, int vol, int val4, \
+ int val5, int val6, int val7, int coord_dim) \
+{ \
+ int gidx = get_global_id(0); \
+ int gidy = get_global_id(1); \
+ Image img1 = create_image_from_image2d(index, 4); \
+ Image img2 = create_image_from_image2d(update, element_size); \
+ Image img3 = create_image_from_image2d(temp_buf_int, 4); \
+ __global int* index_ptr = (__global int*)img1.ptr; \
+ __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \
+ __global int* output_ptr = (__global int*)img3.ptr; \
+ data_type src; \
+ \
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \
+ ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \
+ int loc = idx * output_width + gidx; \
+ _viv_asm(COPY, src, tmpData, 4); \
+ vxc_int4 data; \
+ short zp = input_zp; \
+ VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvert1stUint8SubZpToFp32_4x4); \
+ atomic_add(output_ptr + loc, data.x); \
+}
+SCATTER_ND_UPDATE_QINT(U8, vxc_uchar8, uchar, 1)
+SCATTER_ND_UPDATE_QINT(I8, vxc_char8, char, 1)
+SCATTER_ND_UPDATE_QINT(I16, vxc_short8, short, 2)
+
+#define SCATTER_ND_UPDATE_QINT_4X(src0_type, data_type, ptr_type, element_size) \
+__kernel void scatter_nd_update_update_##src0_type##_4X( \
+ __read_only image2d_t index, \
+ __read_only image2d_t update, \
+ image2d_t temp_buf_int, \
+ image2d_t link_buffer0, \
+ int width, int area, int vol, int val4, \
+ int val5, int val6, int val7, int coord_dim) \
+{ \
+ int gidx = get_global_id(0); \
+ int gidy = get_global_id(1); \
+ Image img1 = create_image_from_image2d(index, 4); \
+ Image img2 = create_image_from_image2d(update, element_size); \
+ Image img3 = create_image_from_image2d(temp_buf_int, 4); \
+ __global int* index_ptr = (__global int*)img1.ptr; \
+ __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \
+ __global int* output_ptr = (__global int*)img3.ptr; \
+ \
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \
+ ptr_type src = update_ptr[gidy * update_width + gidx]; \
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \
+ int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3); \
+ vxc_int4 data; \
+ short zp = input_zp; \
+ VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \
+ uniConvert1stUint8SubZpToFp32_4x4); \
+ atomic_add(output_ptr + loc.x, data.x); \
+ atomic_add(output_ptr + loc.y, data.y); \
+ atomic_add(output_ptr + loc.z, data.z); \
+ atomic_add(output_ptr + loc.w, data.w); \
+}
+SCATTER_ND_UPDATE_QINT_4X(U8, vxc_uchar8, vxc_uchar4, 1)
+SCATTER_ND_UPDATE_QINT_4X(I8, vxc_char8, vxc_char4, 1)
+SCATTER_ND_UPDATE_QINT_4X(I16, vxc_short8, vxc_short4, 2)
+
+#define SCATTER_ND_UPDATE_REF(src0_type, dst_type, data_type, ptr_type, element_size) \
+__kernel void scatter_nd_update_ref_##src0_type##to##dst_type( \
+ __read_only image2d_t index, \
+ __read_only image2d_t update, \
+ __read_only image2d_t temp_buf_int, \
+ image2d_t temp_ref, \
+ image2d_t link_buffer0, \
+ image2d_t link_buffer1, \
+ int width, int area, int vol, int val4, \
+ int val5, int val6, int val7, int coord_dim) \
+{ \
+ int gidx = get_global_id(0); \
+ int gidy = get_global_id(1); \
+ Image img1 = create_image_from_image2d(index, 4); \
+ Image img2 = create_image_from_image2d(temp_buf_int, 4); \
+ Image img3 = create_image_from_image2d(temp_ref, element_size); \
+ __global int* index_ptr = (__global int*)img1.ptr; \
+ __global int* acc_ptr = (__global int*)img2.ptr; \
+ __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \
+ data_type dst; \
+ \
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \
+ int loc = idx * output_stride + gidx; \
+ int tmpData = acc_ptr[loc]; \
+ int4 data; \
+ data.x = convert_int_rte(tmpData * inout_scale + output_zp); \
+ VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ ref_ptr[loc] = dst.x; \
+}
+SCATTER_ND_UPDATE_REF(I32, U8, vxc_uchar8, uchar, 1)
+SCATTER_ND_UPDATE_REF(I32, I8, vxc_char8, char, 1)
+SCATTER_ND_UPDATE_REF(I32, I16, vxc_short8, short, 2)
+
+#define SCATTER_ND_UPDATE_REF_4X(src0_type, dst_type, data_type, ptr_type, element_size) \
+__kernel void scatter_nd_update_ref_##src0_type##to##dst_type##_4X( \
+ __read_only image2d_t index, \
+ __read_only image2d_t update, \
+ __read_only image2d_t temp_buf_int, \
+ image2d_t temp_ref, \
+ image2d_t link_buffer0, \
+ image2d_t link_buffer1, \
+ int width, int area, int vol, int val4, \
+ int val5, int val6, int val7, int coord_dim) \
+{ \
+ int gidx = get_global_id(0); \
+ int gidy = get_global_id(1); \
+ Image img1 = create_image_from_image2d(index, 4); \
+ Image img2 = create_image_from_image2d(temp_buf_int, 4); \
+ Image img3 = create_image_from_image2d(temp_ref, element_size); \
+ __global int* index_ptr = (__global int*)img1.ptr; \
+ __global int* acc_ptr = (__global int*)img2.ptr; \
+ __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \
+ data_type dst; \
+ \
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \
+ float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \
+ int loc = idx * output_stride + gidx; \
+ int4 data = convert_int4_rte(tmpData * inout_scale + output_zp); \
+ VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \
+ uniConvertInt32toUint8_2x8); \
+ ref_ptr[loc] = dst.xyzw; \
+}
+SCATTER_ND_UPDATE_REF_4X(I32, U8, vxc_uchar8, vxc_uchar4, 1)
+SCATTER_ND_UPDATE_REF_4X(I32, I8, vxc_char8, vxc_char4, 1)
+SCATTER_ND_UPDATE_REF_4X(I32, I16, vxc_short8, vxc_short4, 2)
+
+#define SCATTER_ND_UPDATE_COPY(src0_type, ptr_type, element_size, ptr_type1) \
+__kernel void scatter_nd_update_copy_##src0_type( \
+ __read_only image2d_t temp_ref, \
+ __read_only image2d_t link_buffer1, \
+ image2d_t output, \
+ int length, int res) \
+{ \
+ int gidx = get_global_id(0); \
+ Image img1 = create_image_from_image2d(temp_ref, element_size); \
+ Image img2 = create_image_from_image2d(output, element_size); \
+ __global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \
+ __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \
+ output_ptr[gidx] = input_ptr[gidx]; \
+ if(gidx < res) \
+ { \
+ __global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \
+ __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \
+ output_ptr1[length + gidx] = input_ptr1[length + gidx]; \
+ } \
+}
+SCATTER_ND_UPDATE_COPY(U8, vxc_uchar8, 1, uchar)
+SCATTER_ND_UPDATE_COPY(I8, vxc_char8, 1, char)
+SCATTER_ND_UPDATE_COPY(I16, vxc_short8, 2, short)
+SCATTER_ND_UPDATE_COPY(F16, vxc_short8, 2, short)
+SCATTER_ND_UPDATE_COPY(BF16, vxc_short8, 2, short)
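
The quantized variants accumulate zero-point-corrected updates into an int32 buffer with atomic_add, and the *_ref_* kernels then map the accumulator back with acc * inout_scale + output_zp rounded to nearest even. A scalar sketch of that final step (names follow the kernel's uniforms):

    #include <math.h>
    /* Requantization performed by the *_ref_* kernels (scalar sketch). */
    static int requantize(int acc, float inout_scale, float output_zp)
    {
        return (int)lrintf((float)acc * inout_scale + output_zp); /* convert_int_rte */
    }
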
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx b/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx
index 319348593..3c770f373 100644
--- a/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx
@@ -21,7 +21,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name##_2D( \
short zp = inputZP; \
VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniConvert1stUint8SubZpToFp32_4x4); \
- int index = convert_int_rte(tmpData.s0 * input_scale); \
+ int index = convert_int_rtz(tmpData.s0 * input_scale); \
int4 data; \
data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \
write_type dst; \
@@ -47,7 +47,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name( \
short zp = inputZP; \
VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
uniConvert1stUint8SubZpToFp32_4x4); \
- int index = convert_int_rte(tmpData.s0 * input_scale); \
+ int index = convert_int_rtz(tmpData.s0 * input_scale); \
int4 data; \
data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \
write_type dst; \
@@ -73,7 +73,7 @@ __kernel void sequence_mask_F16toF16_2D(
float4 tmpData;
VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
- int index = convert_int_rte(tmpData.x);
+ int index = convert_int_rtz(tmpData.x);
float4 data;
data = outIdx < index? outputVal1 : convert_float(output_ZP);
vxc_short8 dst;
@@ -96,7 +96,7 @@ __kernel void sequence_mask_F16toF16(
float4 tmpData;
VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
- int index = convert_int_rte(tmpData.x);
+ int index = convert_int_rtz(tmpData.x);
float4 data;
data = outIdx < index? outputVal1 : convert_float(output_ZP);
vxc_short8 dst;
@@ -119,7 +119,7 @@ __kernel void sequence_mask_F16toU8_2D(
float4 tmpData;
VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
- int index = convert_int_rte(tmpData.x);
+ int index = convert_int_rtz(tmpData.x);
int4 data;
data = outIdx < index? convert_int_rte(outputVal1) : output_ZP;
vxc_uchar16 dst;
@@ -140,7 +140,7 @@ __kernel void sequence_mask_F16toU8(
float4 tmpData;
VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
UniFP16toFP32Lo4_dp4x4);
- int index = convert_int_rte(tmpData.x);
+ int index = convert_int_rtz(tmpData.x);
int4 data;
data = outIdx < index? convert_int_rte(outputVal1) : output_ZP;
vxc_uchar16 dst;
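
The sequence_mask change swaps convert_int_rte for convert_int_rtz when turning the dequantized length into an index bound, so a value that falls between two integers now truncates instead of rounding up. A small example of the difference (illustrative only):

    #include <math.h>
    #include <stdio.h>
    int main(void)
    {
        float len = 2.75f;                /* a dequantized max-length value */
        printf("rte=%d rtz=%d\n",
               (int)lrintf(len),          /* old behaviour: 3 */
               (int)truncf(len));         /* new behaviour: 2 */
        return 0;
    }
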
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_box.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_box.vx
new file mode 100644
index 000000000..6e513f126
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_box.vx
@@ -0,0 +1,103 @@
+#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
+#include "cl_viv_vx_ext.h"
+
+#define logE (1.44269502f)
+
+float4 sigmoid4(float4 x)
+{
+ x *= -logE;
+ x = 1 + exp2(x);
+ return 1 / x;
+}
+
+float4 exp4(float4 x)
+{
+ x *= logE;
+ return exp2(x);
+}
+
+#define CONST0 (1.0499999523162842f)
+#define CONST1 (0.0250000003725290f)
+
+_viv_uniform VXC_512Bits uniDatatoFloat32_0_4x4;
+_viv_uniform VXC_512Bits uniDatatoFloat32_1_4x4;
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;
+_viv_uniform VXC_512Bits uniDataTranspose_0_2x8;
+_viv_uniform VXC_512Bits uniDataTranspose_1_2x8;
+_viv_uniform float input0_scale;
+_viv_uniform float input0_tail;
+_viv_uniform float input1_scale;
+_viv_uniform float input1_tail;
+_viv_uniform float output_scale;
+_viv_uniform float output_zp;
+_viv_uniform float CONST2;
+__kernel void tiny_yolov4_postprocess_box_U8_U8toU8
+ (
+ __read_only image2d_array_t input0,
+ __read_only image2d_array_t input1,
+ __write_only image2d_array_t output,
+ float bias_0,
+ float bias_1
+ )
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0));
+
+ vxc_uchar16 src0, src1, src2, src3;
+ VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+
+ VXC_ReadImage(src2, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src3, input1, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ coord.zw += (int2)(2, 3);
+
+ float4 data0, data1, data2, data3, data;
+ VXC_DP4x4(data0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);
+ data0 = data0 * input0_scale + input0_tail;
+ data0 = sigmoid4(data0);
+ data0 = data0 * CONST0 - CONST1;
+
+ VXC_DP4x4(data, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);
+ data = data * input1_scale + input1_tail;
+ data0 = data0 * CONST2 + data * CONST2;
+
+ VXC_DP4x4(data1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4);
+ data1 = data1 * input0_scale + input0_tail;
+ data1 = sigmoid4(data1);
+ data1 = data1 * CONST0 - CONST1;
+
+ VXC_DP4x4(data, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);
+ data = data * input1_scale + input1_tail;
+ data1 = data1 * CONST2 + data * CONST2;
+
+ VXC_DP4x4(data2, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);
+ data2 = data2 * input0_scale + input0_tail;
+ data2 = exp4(data2) * bias_0;
+
+ VXC_DP4x4(data3, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4);
+ data3 = data3 * input0_scale + input0_tail;
+ data3 = exp4(data3) * bias_1;
+
+ data0 = data0 * output_scale + output_zp;
+ data1 = data1 * output_scale + output_zp;
+
+ int4 dst0 = convert_int4_rte(data0);
+ int4 dst1 = convert_int4_rte(data1);
+ VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
+ data2 = data2 * output_scale + output_zp;
+ data3 = data3 * output_scale + output_zp;
+ dst0 = convert_int4_rte(data2);
+ dst1 = convert_int4_rte(data3);
+ VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);
+
+ VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniDataTranspose_0_2x8);
+ VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniDataTranspose_1_2x8);
+
+ VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ coord.x ++;
+ VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0));
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_confidence.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_confidence.vx
new file mode 100644
index 000000000..0a41c0e2c
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_confidence.vx
@@ -0,0 +1,54 @@
+#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
+#include "cl_viv_vx_ext.h"
+
+_viv_uniform VXC_512Bits uniU8TimesU8_0_4x4;
+_viv_uniform VXC_512Bits uniU8PlusU8_trans_0_2x8;
+_viv_uniform VXC_512Bits uniU8PlusU8_trans_1_2x8;
+_viv_uniform VXC_512Bits uniU16TimesMultiplier_PostShift_2x8;
+_viv_uniform int output_zp;
+
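+/* Combines input row 0 with rows 1..4 via uniU8TimesU8 (U8 x U8 -> U16), rescales with the
+ * multiplier/post-shift uniform, adds output_zp, then writes the transposed result. */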
+__kernel void tiny_yolov4_postprocess_conf_U8toU8
+(
+ __read_only image2d_t input,
+ __write_only image2d_t output
+)
+{
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, get_global_id(0));
+
+ vxc_uchar16 src0, src1, src2, src3, src4;
+
+ VXC_ReadImage(src0, input, coord.wz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
+
+ vxc_ushort8 data0, data1;
+
+ VXC_ReadImage(src1, input, coord.wy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src2, input, coord.wy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src3, input, coord.wy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ VXC_ReadImage(src4, input, coord.wy, VXC_5BITOFFSET_XY(0, 4), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+
+ coord.zw = coord.xx + (int2)(2, 3);
+
+ VXC_DP4x4(data0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);
+ VXC_DP4x4(data0, src0, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);
+ VXC_DP4x4(data1, src0, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);
+ VXC_DP4x4(data1, src0, src4, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);
+
+ VXC_DP2x8(src1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),
+ uniU16TimesMultiplier_PostShift_2x8);
+ VXC_DP2x8(src1, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),
+ uniU16TimesMultiplier_PostShift_2x8);
+
+ uchar zp;
+ _viv_asm(COPY, zp, output_zp, 2);
+
+ VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),
+ uniU8PlusU8_trans_0_2x8);
+ VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),
+ uniU8PlusU8_trans_1_2x8);
+
+ VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
+ coord.x ++;
+ VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0));
+ VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0));
+}
diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
index f528ccb35..5421a5aba 100644
--- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
+++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c
@@ -4902,6 +4902,710 @@ __kernel void cumsum_BF16toBF16_axis0_2D(\n\
}\n\
"; /* end of cumsum_bf16_vx*/
+static const char cumsum_ex_rev_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\
+\n\
+_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\
+_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\
+_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\
+_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\
+_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\
+_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\
+_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\
+_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\
+_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\
+\n\
+_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\
+\n\
+_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4;\n\
+_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4;\n\
+_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8;\n\
+_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8;\n\
+\n\
+_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4;\n\
+_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4;\n\
+_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8;\n\
+_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4;\n\
+_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4;\n\
+\n\
+\n\
+_viv_uniform int width;\n\
+_viv_uniform int input_zp;\n\
+_viv_uniform float in_out_scale;\n\
+_viv_uniform float output_zp;\n\
+\n\
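+// exclusive: each output excludes its own element (the first output is the zero value);\n\
+// rev: accumulate from the last element backwards.\n\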
+__kernel void cumsum_ex_rev_F16toF16_axis0(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ int axis, int exclusive, int rev\n\
+ )\n\
+{\n\
+ int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\
+ int4 coord_out = coord;\n\
+\n\
+ vxc_short8 src, dst;\n\
+ vxc_half8 data, tmpsum, sum;\n\
+ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\
+ if(exclusive == 0 && rev)\n\
+ {\n\
+ for(coord.x = width - 8; coord.x >= 0; coord.x -= 8)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ _viv_asm(COPY, data, src, 16);\n\
+\n\
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);\n\
+ VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\
+ uniSumHorzRevF16toF16C_2x8);\n\
+ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+ else if(exclusive && rev == 0)\n\
+ {\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ for(; coord.x < width - 8;)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ coord_out.x = coord.x + 1;\n\
+ coord.x += 8;\n\
+ _viv_asm(COPY, data, src, 16);\n\
+\n\
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);\n\
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);\n\
+ VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);\n\
+ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+ else if(exclusive && rev)\n\
+ {\n\
+ coord.x = width - 8;\n\
+ coord_out.x = width - 1;\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ for(; coord.x > 0;)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ coord_out.x = coord.x - 1;\n\
+ coord.x -= 8;\n\
+ _viv_asm(COPY, data, src, 16);\n\
+\n\
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\
+ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);\n\
+ VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\
+ uniSumHorzRevF16toF16C_2x8);\n\
+ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+}\n\
+\n\
+#define CUMSUM_QINT_EX_REV_AXIS0(in_name, out_name, src_type, dst_type) \\\n\
+__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int axis, int exclusive, int rev \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \\\n\
+ int4 coord_out = coord; \\\n\
+ \\\n\
+ src_type src; \\\n\
+ dst_type dst; \\\n\
+ vxc_short8 rowSum; \\\n\
+ int4 sum0 = (int4)(0), sum1 = (int4)(0); \\\n\
+ short zp = (short)input_zp; \\\n\
+ \\\n\
+ if(exclusive == 0 && rev) \\\n\
+ { \\\n\
+ for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\
+ VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\
+ VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \\\n\
+ VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniAccSumHorzRevI16toI32A_4x4); \\\n\
+ VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniAccSumHorzRevI16toI32B_4x4); \\\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\
+ VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+ else if(exclusive && rev == 0) \\\n\
+ { \\\n\
+ for(coord.x = -1; coord.x < width - 8;) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_out.x = coord.x + 1; \\\n\
+ coord.x += 8; \\\n\
+ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \\\n\
+ VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \\\n\
+ VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \\\n\
+ VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniAccSumHorzI16toI32A_4x4); \\\n\
+ VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniAccSumHorzI16toI32B_4x4); \\\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+ else if(exclusive && rev) \\\n\
+ { \\\n\
+ for(coord.x = width - 7; coord.x > 0;) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_out.x = coord.x - 1; \\\n\
+ coord.x -= 8; \\\n\
+ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\
+ VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\
+ VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \\\n\
+ VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniAccSumHorzRevI16toI32A_4x4); \\\n\
+ VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniAccSumHorzRevI16toI32B_4x4); \\\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\
+ VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+}\n\
+CUMSUM_QINT_EX_REV_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)\n\
+CUMSUM_QINT_EX_REV_AXIS0(I8, I8, vxc_char16, vxc_char16)\n\
+CUMSUM_QINT_EX_REV_AXIS0(I16, I16, vxc_short8, vxc_short8)\n\
+"; /* end of cumsum_ex_rev_axis0_vx*/
+
+static const char cumsum_ex_rev_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\
+\n\
+_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\
+\n\
+_viv_uniform int height;\n\
+_viv_uniform float in_out_scale;\n\
+_viv_uniform float in_out_zp_scale;\n\
+_viv_uniform float output_zp;\n\
+\n\
+__kernel void cumsum_ex_rev_F16toF16_axis1(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ int axis, int exclusive, int rev)\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\
+\n\
+ vxc_short8 src, dst;\n\
+ vxc_half8 data, sum;\n\
+ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\
+ if(exclusive == 0 && rev)\n\
+ {\n\
+ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ _viv_asm(COPY, data, src, 16);\n\
+\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+ else if(exclusive && rev == 0)\n\
+ {\n\
+ dst ^= dst;\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ for(; coord.y < height - 1;)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ coord.y++;\n\
+ _viv_asm(COPY, data, src, 16);\n\
+\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+ else if(exclusive && rev)\n\
+ {\n\
+ dst ^= dst;\n\
+ coord.y = height - 1;\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ for(; coord.y > 0;)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ coord.y--;\n\
+ _viv_asm(COPY, data, src, 16);\n\
+\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+}\n\
+\n\
+#define CUMSUM_8BITS_EX_REV_AXIS1(in_name, out_name, src_type, dst_type) \\\n\
+__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int axis, int exclusive, int rev) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \\\n\
+ \\\n\
+ src_type src; \\\n\
+ dst_type dst; \\\n\
+ int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\
+ \\\n\
+ if(exclusive == 0 && rev) \\\n\
+ { \\\n\
+ for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\
+ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \\\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+ else if(exclusive && rev == 0) \\\n\
+ { \\\n\
+ int tmpAlpha0 = convert_int_rte(output_zp); \\\n\
+ int4 tmpVal; \\\n\
+ tmpVal.x = tmpAlpha0; \\\n\
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ for(; coord.y < height - 1;) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord.y++; \\\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\
+ float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \\\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8);\\\n\
+ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8);\\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+ else if(exclusive && rev) \\\n\
+ { \\\n\
+ coord.y = height - 1; \\\n\
+ int tmpAlpha0 = convert_int_rte(output_zp); \\\n\
+ int4 tmpVal; \\\n\
+ tmpVal.x = tmpAlpha0; \\\n\
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ for(; coord.y > 0;) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\
+ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \\\n\
+ coord.y--; \\\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8);\\\n\
+ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8);\\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+}\n\
+CUMSUM_8BITS_EX_REV_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\
+CUMSUM_8BITS_EX_REV_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\
+\n\
+__kernel void cumsum_ex_rev_I16toI16_axis1(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ int axis, int exclusive, int rev)\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\
+\n\
+ vxc_short8 src, dst;\n\
+ int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\
+ if(exclusive == 0 && rev)\n\
+ {\n\
+ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\
+ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\
+ uniConvertInt32toUint8_2x8);\n\
+\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+ else if(exclusive && rev == 0)\n\
+ {\n\
+ int tmpAlpha0 = convert_int_rte(output_zp);\n\
+ int4 tmpVal;\n\
+ tmpVal.x = tmpAlpha0;\n\
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ for(; coord.y < height - 1;)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ coord.y++;\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\
+ float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp;\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\
+ uniConvertInt32toUint8_2x8);\n\
+\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+ else if(exclusive && rev)\n\
+ {\n\
+ coord.y = height - 1;\n\
+ int tmpAlpha0 = convert_int_rte(output_zp);\n\
+ int4 tmpVal;\n\
+ tmpVal.x = tmpAlpha0;\n\
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ for(; coord.y > 0;)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\
+ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\
+ coord.y--;\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\
+ uniConvertInt32toUint8_2x8);\n\
+\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+}\n\
+"; /* end of cumsum_ex_rev_axis1_vx*/
+
+static const char cumsum_ex_rev_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\
+_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\
+\n\
+_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\
+\n\
+_viv_uniform int channel;\n\
+_viv_uniform float in_out_scale;\n\
+_viv_uniform float in_out_zp_scale;\n\
+_viv_uniform float output_zp;\n\
+\n\
+__kernel void cumsum_ex_rev_F16toF16_axis2(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ int axis, int exclusive, int rev)\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+\n\
+ vxc_short8 src, dst;\n\
+ vxc_half8 data, sum;\n\
+ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\
+ if(rev && exclusive == 0)\n\
+ {\n\
+ for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ _viv_asm(COPY, data, src, 16);\n\
+\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+ else if(rev == 0 && exclusive)\n\
+ {\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ for(; coord.z < channel - 1;)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ coord.z++;\n\
+ _viv_asm(COPY, data, src, 16);\n\
+\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+ else if(rev && exclusive)\n\
+ {\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ coord.z = channel - 1;\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ for(; coord.z > 0;)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ coord.z--;\n\
+ _viv_asm(COPY, data, src, 16);\n\
+\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\
+ _viv_asm(COPY, dst, sum, 16);\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+}\n\
+\n\
+#define CUMSUM_8BITS_EX_REV_AXIS2(in_name, out_name, src_type, dst_type) \\\n\
+__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int axis, int exclusive, int rev) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\
+ \\\n\
+ src_type src; \\\n\
+ dst_type dst; \\\n\
+ int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\
+ \\\n\
+ if(rev && exclusive == 0) \\\n\
+ { \\\n\
+ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\
+ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8);\\\n\
+ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\
+ uniConvertInt32toUint8_2x8);\\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+ else if(exclusive && rev == 0) \\\n\
+ { \\\n\
+ int tmpAlpha0 = convert_int_rte(output_zp); \\\n\
+ int4 tmpVal; \\\n\
+ tmpVal.x = tmpAlpha0; \\\n\
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ for(; coord.z < channel - 1;) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord.z++; \\\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\
+ float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \\\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+ else if(rev && exclusive) \\\n\
+ { \\\n\
+ coord.z = channel - 1; \\\n\
+ int tmpAlpha0 = convert_int_rte(output_zp); \\\n\
+ int4 tmpVal; \\\n\
+ tmpVal.x = tmpAlpha0; \\\n\
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ for(; coord.z > 0;) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\
+ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\
+ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\
+ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\
+ coord.z--; \\\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\
+ float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\
+ int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\
+ int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+}\n\
+CUMSUM_8BITS_EX_REV_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\
+CUMSUM_8BITS_EX_REV_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\
+\n\
+__kernel void cumsum_ex_rev_I16toI16_axis2(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ int axis, int exclusive, int rev)\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+\n\
+ vxc_short8 src, dst;\n\
+ int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\
+ if(exclusive == 0 && rev)\n\
+ {\n\
+ for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\
+ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\
+ uniConvertInt32toUint8_2x8);\n\
+\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+ else if(exclusive && rev == 0)\n\
+ {\n\
+ int tmpAlpha0 = convert_int_rte(output_zp);\n\
+ int4 tmpVal;\n\
+ tmpVal.x = tmpAlpha0;\n\
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ for(; coord.z < channel - 1;)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ coord.z++;\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\
+ float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp;\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\
+ uniConvertInt32toUint8_2x8);\n\
+\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+ else if(exclusive && rev)\n\
+ {\n\
+ coord.z = channel - 1;\n\
+ int tmpAlpha0 = convert_int_rte(output_zp);\n\
+ int4 tmpVal;\n\
+ tmpVal.x = tmpAlpha0;\n\
+ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\
+ VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ for(; coord.z > 0;)\n\
+ {\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\
+ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\
+ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\
+ coord.z--;\n\
+ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\
+ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\
+ int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\
+ int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\
+ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\
+ uniConvertInt32toUint8_2x8);\n\
+\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+}\n\
+"; /* end of cumsum_ex_rev_axis2_vx*/
+
static const char cumsum_f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\
@@ -5080,6 +5784,138 @@ __kernel void cumsum_F16to##out_name##_axis0_2D( \\\n\
CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16)\n\
CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8)\n\
CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16)\n\
+\n\
+#define CUMSUM_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type) \\\n\
+__kernel void cumsum_ex_rev_F16to##out_name##_axis2( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int axis, int exclusive, int rev \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\
+ \\\n\
+ vxc_short8 src; \\\n\
+ dst_type dst; \\\n\
+ vxc_half8 data, sum; \\\n\
+ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\
+ vxc_ushort8 ms0; \\\n\
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\
+ if(exclusive == 0 && rev) \\\n\
+ { \\\n\
+ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, data, src, 16); \\\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+ else if(exclusive && rev == 0) \\\n\
+ { \\\n\
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ for(; coord.z < channel - 1;) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord.z++; \\\n\
+ _viv_asm(COPY, data, src, 16); \\\n\
+ \\\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+ else if(exclusive && rev) \\\n\
+ { \\\n\
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ coord.z = channel - 1; \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ for(; coord.z > 0;) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord.z--; \\\n\
+ _viv_asm(COPY, data, src, 16); \\\n\
+ \\\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+}\n\
+CUMSUM_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16)\n\
+CUMSUM_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8)\n\
+CUMSUM_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16)\n\
+\n\
+#define CUMSUM_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type) \\\n\
+__kernel void cumsum_ex_rev_F16to##out_name##_axis1( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int axis, int exclusive, int rev \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \\\n\
+ \\\n\
+ vxc_short8 src; \\\n\
+ dst_type dst; \\\n\
+ vxc_half8 data, sum; \\\n\
+ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\
+ vxc_ushort8 ms0; \\\n\
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\
+ if(exclusive == 0 && rev) \\\n\
+ { \\\n\
+ for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, data, src, 16); \\\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+ else if(exclusive && rev == 0) \\\n\
+ { \\\n\
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ for(; coord.y < height - 1;) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord.y++; \\\n\
+ _viv_asm(COPY, data, src, 16); \\\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+ else if(exclusive && rev) \\\n\
+ { \\\n\
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ coord.y = height - 1; \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ for(; coord.y > 0;) \\\n\
+ { \\\n\
+ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord.y--; \\\n\
+ _viv_asm(COPY, data, src, 16); \\\n\
+ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\
+ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+}\n\
+CUMSUM_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16)\n\
+CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8)\n\
+CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16)\n\
"; /* end of cumsum_f16_u8_vx*/
static const char custom_softmax_vx[] = "/*\n\
@@ -5509,15 +6345,13 @@ __kernel void custom_warp_affine_bilinear_U8toU8\n\
}\n\
"; /* end of custom_warp_affine_vx*/
-static const char custom_warp_perspective_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\
+static const char custom_warp_affine_rgb_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\
\n\
#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform float4 matrix0;\n\
-_viv_uniform float4 matrix1;\n\
-_viv_uniform float4 matrix2;\n\
-_viv_uniform float4 matrix4;\n\
-__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D\n\
+_viv_uniform float2 matrix1;\n\
+__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_2D\n\
(\n\
__read_only image2d_array_t input,\n\
__write_only image2d_array_t output,\n\
@@ -5526,53 +6360,38 @@ __kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D\n\
float _m2,\n\
float _m3,\n\
float _m4,\n\
- float _m5,\n\
- float _m6,\n\
- float _m7,\n\
- float _m8\n\
+ float _m5\n\
)\n\
{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
+ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\
\n\
- float4 coord_f0 = convert_float4(coord_in);\n\
-\n\
- float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\
- z0.zw = z0.zw + 2 * matrix1.z;\n\
- float4 z1 = z0 + 4 * matrix1.z;\n\
-\n\
- z0 = 1.0f / z0;\n\
- z1 = 1.0f / z1;\n\
+ float4 coord_f = convert_float4(coord_in);\n\
\n\
- coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\
- float4 coord_f = coord_f0 * z0.xxyy;\n\
+ coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\
\n\
- coord_in = convert_int4(coord_f);\n\
+ coord_in.x = floor(coord_f.x) * 3;\n\
+ coord_in.y = floor(coord_f.y);\n\
+ coord_in.z = floor(coord_f.z) * 3;\n\
+ coord_in.w = floor(coord_f.w);\n\
\n\
vxc_uchar16 dst;\n\
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
- coord_f0 = coord_f0.zwzw + matrix4;\n\
- coord_f = coord_f0 * z0.zzww;\n\
- coord_in = convert_int4(coord_f);\n\
+ coord_in.x = coord_in.x + 1;\n\
+ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
+ coord_in.x = coord_in.x + 1;\n\
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\
+\n\
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\
- coord_f0 = coord_f0.zwzw + matrix4;\n\
- coord_f = coord_f0 * z1.xxyy;\n\
- coord_in = convert_int4(coord_f);\n\
- VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
+ coord_in.z = coord_in.z + 1;\n\
+ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
+ coord_in.z = coord_in.z + 1;\n\
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\
- coord_f0 = coord_f0.zwzw + matrix4;\n\
- coord_f = coord_f0 * z1.zzww;\n\
- coord_in = convert_int4(coord_f);\n\
- VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\
\n\
-\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\
}\n\
\n\
-__kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\
+__kernel void custom_warp_affine_bilinear_U8toU8_rgb_2D\n\
(\n\
__read_only image2d_array_t input,\n\
__write_only image2d_array_t output,\n\
@@ -5581,32 +6400,30 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\
float _m2,\n\
float _m3,\n\
float _m4,\n\
- float _m5,\n\
- float _m6,\n\
- float _m7,\n\
- float _m8\n\
+ float _m5\n\
)\n\
{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
+ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\
\n\
- float4 coord_f0 = convert_float4(coord_in);\n\
+ float4 coord_f = convert_float4(coord_in);\n\
\n\
- float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\
- z0.zw = z0.zw + 2 * matrix1.z;\n\
- float4 z1 = z0 + 4 * matrix1.z;\n\
+ coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\
\n\
- z0 = 1.0f / z0;\n\
- z1 = 1.0f / z1;\n\
+ coord_in.x = floor(coord_f.x) * 3;\n\
+ coord_in.y = floor(coord_f.y);\n\
+ coord_in.z = floor(coord_f.z) * 3;\n\
+ coord_in.w = floor(coord_f.w);\n\
\n\
- coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\
- float4 coord_f = coord_f0 * z0.xxyy;\n\
+ vxc_uchar16 src0, src1, src_0, src_1, dst;\n\
+ VXC_ReadImage(src_0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src_1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
\n\
- coord_in = convert_int4(floor(coord_f));\n\
+ src0.x = src_0.s0;\n\
+ src0.y = src_0.s3;\n\
+ src1.x = src_1.s0;\n\
+ src1.y = src_1.s3;\n\
\n\
- vxc_uchar16 src0, src1, dst;\n\
- VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
#if (VX_VERSION==1)\n\
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
#else\n\
@@ -5615,21 +6432,22 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
#endif\n\
\n\
- VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src0.x = src_0.s1;\n\
+ src0.y = src_0.s4;\n\
+ src1.x = src_1.s1;\n\
+ src1.y = src_1.s4;\n\
#if (VX_VERSION==1)\n\
- VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
#else\n\
- VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
src1.s0 = src0.s1;\n\
- VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
#endif\n\
\n\
- coord_f0 = coord_f0.zwzw + matrix4;\n\
- coord_f = coord_f0 * z0.zzww;\n\
- coord_in = convert_int4(floor(coord_f));\n\
- VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src0.x = src_0.s2;\n\
+ src0.y = src_0.s5;\n\
+ src1.x = src_1.s2;\n\
+ src1.y = src_1.s5;\n\
#if (VX_VERSION==1)\n\
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\
#else\n\
@@ -5638,8 +6456,13 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\
#endif\n\
\n\
- VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src_0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src_1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ src0.x = src_0.s0;\n\
+ src0.y = src_0.s3;\n\
+ src1.x = src_1.s0;\n\
+ src1.y = src_1.s3;\n\
#if (VX_VERSION==1)\n\
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\
#else\n\
@@ -5648,21 +6471,22 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\
#endif\n\
\n\
- coord_f0 = coord_f0.zwzw + matrix4;\n\
- coord_f = coord_f0 * z1.xxyy;\n\
- coord_in = convert_int4(floor(coord_f));\n\
- VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src0.x = src_0.s1;\n\
+ src0.y = src_0.s4;\n\
+ src1.x = src_1.s1;\n\
+ src1.y = src_1.s4;\n\
#if (VX_VERSION==1)\n\
- VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
#else\n\
- VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
src1.s0 = src0.s1;\n\
- VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
#endif\n\
\n\
- VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src0.x = src_0.s2;\n\
+ src0.y = src_0.s5;\n\
+ src1.x = src_1.s2;\n\
+ src1.y = src_1.s5;\n\
#if (VX_VERSION==1)\n\
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\
#else\n\
@@ -5671,36 +6495,10 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\
#endif\n\
\n\
- coord_f0 = coord_f0.zwzw + matrix4;\n\
- coord_f = coord_f0 * z1.zzww;\n\
- coord_in = convert_int4(floor(coord_f));\n\
- VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
-#if (VX_VERSION==1)\n\
- VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\
-#else\n\
- VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
- src1.s0 = src0.s1;\n\
- VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\
-#endif\n\
-\n\
- VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
-#if (VX_VERSION==1)\n\
- VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\
-#else\n\
- VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
- src1.s0 = src0.s1;\n\
- VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\
-#endif\n\
-\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\
}\n\
\n\
-#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \\\n\
- VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \\\n\
- VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0));\n\
-__kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\
+__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb\n\
(\n\
__read_only image2d_array_t input,\n\
__write_only image2d_array_t output,\n\
@@ -5709,28 +6507,20 @@ __kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\
float _m2,\n\
float _m3,\n\
float _m4,\n\
- float _m5,\n\
- float _m6,\n\
- float _m7,\n\
- float _m8\n\
+ float _m5\n\
)\n\
{\n\
- int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\
+ int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));\n\
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\
\n\
- float4 coord_f0 = convert_float4(coord_in);\n\
-\n\
- float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\
- z0.zw = z0.zw + 2 * matrix1.z;\n\
- float4 z1 = z0 + 4 * matrix1.z;\n\
-\n\
- z0 = 1.0f / z0;\n\
- z1 = 1.0f / z1;\n\
+ float4 coord_f = convert_float4(coord_in);\n\
\n\
- coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\
- float4 coord_f = coord_f0 * z0.xxyy;\n\
+ coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\
\n\
- coord_in = convert_int4(coord_f);\n\
+ coord_in.x = floor(coord_f.x) * 3;\n\
+ coord_in.y = floor(coord_f.y);\n\
+ coord_in.z = floor(coord_f.z) * 3;\n\
+ coord_in.w = floor(coord_f.w);\n\
\n\
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\
int8 input_desc;\n\
@@ -5739,28 +6529,391 @@ __kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\
_viv_asm(MOV, coord_input.w, baseAddr);\n\
\n\
vxc_uchar16 dst;\n\
- IMAGE_LOAD_3D(dst, 0, 0, 0, 0)\n\
- coord_input.xy = coord_in.zw;\n\
- IMAGE_LOAD_3D(dst, 0, 0, 1, 1)\n\
- coord_f0 = coord_f0.zwzw + matrix4;\n\
- coord_f = coord_f0 * z0.zzww;\n\
- coord_in = convert_int4(coord_f);\n\
- coord_input.xy = coord_in.xy;\n\
- IMAGE_LOAD_3D(dst, 0, 0, 2, 2)\n\
- coord_input.xy = coord_in.zw;\n\
- IMAGE_LOAD_3D(dst, 0, 0, 3, 3)\n\
- coord_f0 = coord_f0.zwzw + matrix4;\n\
- coord_f = coord_f0 * z1.xxyy;\n\
- coord_in = convert_int4(coord_f);\n\
- coord_input.xy = coord_in.xy;\n\
- IMAGE_LOAD_3D(dst, 0, 0, 4, 4)\n\
- coord_input.xy = coord_in.zw;\n\
- IMAGE_LOAD_3D(dst, 0, 0, 5, 5)\n\
- coord_f0 = coord_f0.zwzw + matrix4;\n\
- coord_f = coord_f0 * z1.zzww;\n\
- coord_in = convert_int4(coord_f);\n\
- coord_input.xy = coord_in.xy;\n\
- IMAGE_LOAD_3D(dst, 0, 0, 6, 6)\n\
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ coord_input.x = coord_input.x + 1;\n\
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
+ coord_input.x = coord_input.x + 1;\n\
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\
+ coord_input.xy = coord_in.zw;\n\
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\
+ coord_input.x = coord_input.x + 1;\n\
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
+ coord_input.x = coord_input.x + 1;\n\
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+\n\
+__kernel void custom_warp_affine_bilinear_U8toU8_rgb\n\
+(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float _m0,\n\
+ float _m1,\n\
+ float _m2,\n\
+ float _m3,\n\
+ float _m4,\n\
+ float _m5\n\
+)\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));\n\
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\
+\n\
+ float4 coord_f = convert_float4(coord_in);\n\
+\n\
+ coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\
+\n\
+ coord_in.x = floor(coord_f.x) * 3;\n\
+ coord_in.y = floor(coord_f.y);\n\
+ coord_in.z = floor(coord_f.z) * 3;\n\
+ coord_in.w = floor(coord_f.w);\n\
+\n\
+ int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\
+ int8 input_desc;\n\
+ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\
+ int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\
+ _viv_asm(MOV, coord_input.w, baseAddr);\n\
+\n\
+ vxc_uchar16 src0, src1, src_0, src_1, dst;\n\
+ VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ src0.x = src_0.s0;\n\
+ src0.y = src_0.s3;\n\
+ src1.x = src_1.s0;\n\
+ src1.y = src_1.s3;\n\
+\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ src0.x = src_0.s1;\n\
+ src0.y = src_0.s4;\n\
+ src1.x = src_1.s1;\n\
+ src1.y = src_1.s4;\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ src0.x = src_0.s2;\n\
+ src0.y = src_0.s5;\n\
+ src1.x = src_1.s2;\n\
+ src1.y = src_1.s5;\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ coord_input.xy = coord_in.zw;\n\
+ VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ src0.x = src_0.s0;\n\
+ src0.y = src_0.s3;\n\
+ src1.x = src_1.s0;\n\
+ src1.y = src_1.s3;\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ src0.x = src_0.s1;\n\
+ src0.y = src_0.s4;\n\
+ src1.x = src_1.s1;\n\
+ src1.y = src_1.s4;\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ src0.x = src_0.s2;\n\
+ src0.y = src_0.s5;\n\
+ src1.x = src_1.s2;\n\
+ src1.y = src_1.s5;\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\
+}"; /* end of custom_warp_affine_rgb_vx*/
+
+static const char custom_warp_perspective_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\
+\n\
+#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform float4 matrix0;\n\
+_viv_uniform float4 matrix1;\n\
+_viv_uniform float4 matrix2;\n\
+_viv_uniform float4 matrix4;\n\
+__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D\n\
+(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float _m0,\n\
+ float _m1,\n\
+ float _m2,\n\
+ float _m3,\n\
+ float _m4,\n\
+ float _m5,\n\
+ float _m6,\n\
+ float _m7,\n\
+ float _m8\n\
+)\n\
+{\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\
+\n\
+ float4 coord_f0 = convert_float4(coord_in);\n\
+\n\
+ float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\
+ z0.zw = z0.zw + 2 * matrix1.z;\n\
+ float4 z1 = z0 + 4 * matrix1.z;\n\
+\n\
+ z0 = 1.0f / z0;\n\
+ z1 = 1.0f / z1;\n\
+\n\
+ coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\
+ float4 coord_f = coord_f0 * z0.xxyy;\n\
+\n\
+ coord_in = convert_int4(coord_f);\n\
+\n\
+ vxc_uchar16 dst;\n\
+ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
+ coord_f0 = coord_f0.zwzw + matrix4;\n\
+ coord_f = coord_f0 * z0.zzww;\n\
+ coord_in = convert_int4(coord_f);\n\
+ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\
+ coord_f0 = coord_f0.zwzw + matrix4;\n\
+ coord_f = coord_f0 * z1.xxyy;\n\
+ coord_in = convert_int4(coord_f);\n\
+ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\
+ coord_f0 = coord_f0.zwzw + matrix4;\n\
+ coord_f = coord_f0 * z1.zzww;\n\
+ coord_in = convert_int4(coord_f);\n\
+ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
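The 2D nearest-neighbor kernel above evaluates a projective transform for eight output pixels at a time, dividing the affine part by the per-pixel z term before truncating to integer source coordinates. A scalar reference sketch in plain C, assuming _m0.._m8 form the row-major 3x3 matrix packed into the matrix* uniforms on the host side, and substituting a zero border for the hardware image border handling:

static void warp_perspective_nearest_ref(const unsigned char *src, int src_w, int src_h,
                                         unsigned char *dst, int dst_w, int dst_h,
                                         const float m[9])
{
    for (int y = 0; y < dst_h; y++) {
        for (int x = 0; x < dst_w; x++) {
            float z  = m[6] * x + m[7] * y + m[8];        /* projective term            */
            float sx = (m[0] * x + m[1] * y + m[2]) / z;  /* source x                   */
            float sy = (m[3] * x + m[4] * y + m[5]) / z;  /* source y                   */
            int ix = (int)sx, iy = (int)sy;               /* truncate, as convert_int4  */
            dst[y * dst_w + x] = (ix >= 0 && ix < src_w && iy >= 0 && iy < src_h)
                                     ? src[iy * src_w + ix]
                                     : 0;                 /* assumed zero border        */
        }
    }
}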
+\n\
+__kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\
+(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float _m0,\n\
+ float _m1,\n\
+ float _m2,\n\
+ float _m3,\n\
+ float _m4,\n\
+ float _m5,\n\
+ float _m6,\n\
+ float _m7,\n\
+ float _m8\n\
+)\n\
+{\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\
+\n\
+ float4 coord_f0 = convert_float4(coord_in);\n\
+\n\
+ float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\
+ z0.zw = z0.zw + 2 * matrix1.z;\n\
+ float4 z1 = z0 + 4 * matrix1.z;\n\
+\n\
+ z0 = 1.0f / z0;\n\
+ z1 = 1.0f / z1;\n\
+\n\
+ coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\
+ float4 coord_f = coord_f0 * z0.xxyy;\n\
+\n\
+ coord_in = convert_int4(floor(coord_f));\n\
+\n\
+ vxc_uchar16 src0, src1, dst;\n\
+ VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ coord_f0 = coord_f0.zwzw + matrix4;\n\
+ coord_f = coord_f0 * z0.zzww;\n\
+ coord_in = convert_int4(floor(coord_f));\n\
+ VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ coord_f0 = coord_f0.zwzw + matrix4;\n\
+ coord_f = coord_f0 * z1.xxyy;\n\
+ coord_in = convert_int4(floor(coord_f));\n\
+ VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ coord_f0 = coord_f0.zwzw + matrix4;\n\
+ coord_f = coord_f0 * z1.zzww;\n\
+ coord_in = convert_int4(floor(coord_f));\n\
+ VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+#if (VX_VERSION==1)\n\
+ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\
+#else\n\
+ VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\
+ src1.s0 = src0.s1;\n\
+ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\
+#endif\n\
+\n\
+ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+\n\
+#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \\\n\
+ VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \\\n\
+ VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0));\n\
+__kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\
+(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float _m0,\n\
+ float _m1,\n\
+ float _m2,\n\
+ float _m3,\n\
+ float _m4,\n\
+ float _m5,\n\
+ float _m6,\n\
+ float _m7,\n\
+ float _m8\n\
+)\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\
+\n\
+ float4 coord_f0 = convert_float4(coord_in);\n\
+\n\
+ float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\
+ z0.zw = z0.zw + 2 * matrix1.z;\n\
+ float4 z1 = z0 + 4 * matrix1.z;\n\
+\n\
+ z0 = 1.0f / z0;\n\
+ z1 = 1.0f / z1;\n\
+\n\
+ coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\
+ float4 coord_f = coord_f0 * z0.xxyy;\n\
+\n\
+ coord_in = convert_int4(coord_f);\n\
+\n\
+ int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\
+ int8 input_desc;\n\
+ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\
+ int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\
+ _viv_asm(MOV, coord_input.w, baseAddr);\n\
+\n\
+ vxc_uchar16 dst;\n\
+ IMAGE_LOAD_3D(dst, 0, 0, 0, 0)\n\
+ coord_input.xy = coord_in.zw;\n\
+ IMAGE_LOAD_3D(dst, 0, 0, 1, 1)\n\
+ coord_f0 = coord_f0.zwzw + matrix4;\n\
+ coord_f = coord_f0 * z0.zzww;\n\
+ coord_in = convert_int4(coord_f);\n\
+ coord_input.xy = coord_in.xy;\n\
+ IMAGE_LOAD_3D(dst, 0, 0, 2, 2)\n\
+ coord_input.xy = coord_in.zw;\n\
+ IMAGE_LOAD_3D(dst, 0, 0, 3, 3)\n\
+ coord_f0 = coord_f0.zwzw + matrix4;\n\
+ coord_f = coord_f0 * z1.xxyy;\n\
+ coord_in = convert_int4(coord_f);\n\
+ coord_input.xy = coord_in.xy;\n\
+ IMAGE_LOAD_3D(dst, 0, 0, 4, 4)\n\
+ coord_input.xy = coord_in.zw;\n\
+ IMAGE_LOAD_3D(dst, 0, 0, 5, 5)\n\
+ coord_f0 = coord_f0.zwzw + matrix4;\n\
+ coord_f = coord_f0 * z1.zzww;\n\
+ coord_in = convert_int4(coord_f);\n\
+ coord_input.xy = coord_in.xy;\n\
+ IMAGE_LOAD_3D(dst, 0, 0, 6, 6)\n\
coord_input.xy = coord_in.zw;\n\
IMAGE_LOAD_3D(dst, 0, 0, 7, 7)\n\
\n\
@@ -8432,6 +9585,7 @@ __kernel void gather_I8toI8(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
vxc_char16 src;\n\
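The line added above (and repeated across the gather variants below) normalizes negative indices before addressing, mirroring Python-style indexing along the gather axis. A one-line sketch of the rule, with axis_num as the axis length:

static inline int normalize_gather_index(int index, int axis_num)
{
    /* -1 maps to axis_num - 1; non-negative indices pass through unchanged */
    return index >= 0 ? index : index + axis_num;
}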
@@ -8456,6 +9610,7 @@ __kernel void gather_U8toU8(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
vxc_uchar16 src;\n\
@@ -8479,9 +9634,9 @@ __kernel void gather_I16toI16(\n\
int gidz = get_global_id(2); // block_num\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
-\n\
\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
vxc_short8 src;\n\
@@ -8506,6 +9661,7 @@ __kernel void gather_F16toF16(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
vxc_short8 src;\n\
@@ -8526,6 +9682,7 @@ __kernel void gather_I8toI8_axis0(\n\
{\n\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
int4 indices = read_imagei(input1, coord.xx);\n\
+ indices = indices >= 0 ? indices : indices + axis_num;\n\
int2 coord_in = (int2)(indices.x, get_global_id(1));\n\
\n\
vxc_char16 src, dst;\n\
@@ -8552,6 +9709,7 @@ __kernel void gather_U8toU8_axis0(\n\
{\n\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
int4 indices = read_imagei(input1, coord.xx);\n\
+ indices = indices >= 0 ? indices : indices + axis_num;\n\
int2 coord_in = (int2)(indices.x, get_global_id(1));\n\
\n\
vxc_uchar16 src, dst;\n\
@@ -8578,6 +9736,7 @@ __kernel void gather_I16toI16_axis0(\n\
{\n\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
int4 indices = read_imagei(input1, coord.xx);\n\
+ indices = indices >= 0 ? indices : indices + axis_num;\n\
int2 coord_in = (int2)(indices.x, get_global_id(1));\n\
\n\
vxc_short8 src, dst;\n\
@@ -8604,6 +9763,7 @@ __kernel void gather_F16toF16_axis0(\n\
{\n\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
int4 indices = read_imagei(input1, coord.xx);\n\
+ indices = indices >= 0 ? indices : indices + axis_num;\n\
int2 coord_in = (int2)(indices.x, get_global_id(1));\n\
\n\
vxc_short8 src, dst;\n\
@@ -8640,6 +9800,7 @@ __kernel void gather_I8toI8_array(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
Image img1 = create_image_from_image2d(input0, 1);\n\
@@ -8668,6 +9829,7 @@ __kernel void gather_U8toU8_array(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
Image img1 = create_image_from_image2d(input0, 1);\n\
@@ -8695,9 +9857,9 @@ __kernel void gather_I16toI16_array(\n\
int gidz = get_global_id(2); // block_num\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
-\n\
\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
Image img1 = create_image_from_image2d(input0, 2);\n\
@@ -8727,6 +9889,7 @@ __kernel void gather_F16toF16_array(\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
Image img1 = create_image_from_image2d(input0, 2);\n\
@@ -8764,6 +9927,7 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \\\n\
uchar* output_ptr = get_image_ptr_from_coord(img2, coord.xy); \\\n\
__global data_type* data_ptr = (__global data_type*)input_ptr; \\\n\
__global write_type* out_ptr = (__global write_type*)output_ptr; \\\n\
+ indices = indices >= 0 ? indices : indices + axis_num; \\\n\
src.s0 = data_ptr[indices.x]; \\\n\
src.s1 = data_ptr[indices.y]; \\\n\
src.s2 = data_ptr[indices.z]; \\\n\
@@ -8804,6 +9968,7 @@ __kernel void gather_batch_I8toI8(\n\
{\n\
int4 indice = read_imagei(input1, coord_idx);\n\
coord_idx.y++;\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.y = gidz * axis_num + indice.x;\n\
\n\
vxc_char16 src;\n\
@@ -8834,6 +9999,7 @@ __kernel void gather_batch_U8toU8(\n\
{\n\
int4 indice = read_imagei(input1, coord_idx);\n\
coord_idx.y++;\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.y = gidz * axis_num + indice.x;\n\
\n\
vxc_uchar16 src;\n\
@@ -8864,6 +10030,7 @@ __kernel void gather_batch_I16toI16(\n\
{\n\
int4 indice = read_imagei(input1, coord_idx);\n\
coord_idx.y++;\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.y = gidz * axis_num + indice.x;\n\
\n\
vxc_short8 src;\n\
@@ -8894,6 +10061,7 @@ __kernel void gather_batch_F16toF16(\n\
{\n\
int4 indice = read_imagei(input1, coord_idx);\n\
coord_idx.y++;\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.y = gidz * axis_num + indice.x;\n\
\n\
vxc_short8 src;\n\
@@ -8915,6 +10083,7 @@ __kernel void gather_batch_I8toI8_axis0(\n\
{\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
int4 indices = read_imagei(input1, coord.xz);\n\
+ indices = indices >= 0 ? indices : indices + axis_num;\n\
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\
\n\
vxc_char16 src, dst;\n\
@@ -8943,6 +10112,7 @@ __kernel void gather_batch_U8toU8_axis0(\n\
{\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
int4 indices = read_imagei(input1, coord.xz);\n\
+ indices = indices >= 0 ? indices : indices + axis_num;\n\
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\
\n\
vxc_uchar16 src, dst;\n\
@@ -8971,6 +10141,7 @@ __kernel void gather_batch_I16toI16_axis0(\n\
{\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
int4 indices = read_imagei(input1, coord.xz);\n\
+ indices = indices >= 0 ? indices : indices + axis_num;\n\
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\
\n\
vxc_short8 src, dst;\n\
@@ -8999,6 +10170,7 @@ __kernel void gather_batch_F16toF16_axis0(\n\
{\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
int4 indices = read_imagei(input1, coord.xz);\n\
+ indices = indices >= 0 ? indices : indices + axis_num;\n\
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\
\n\
vxc_short8 src, dst;\n\
@@ -9020,6 +10192,12 @@ __kernel void gather_batch_F16toF16_axis0(\n\
static const char gather_elements_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform int axis_size;\n\
+_viv_uniform uint width0;\n\
+_viv_uniform uint height0;\n\
+_viv_uniform uint width1;\n\
+_viv_uniform uint height1;\n\
+_viv_uniform uint width_out;\n\
+_viv_uniform uint height_out;\n\
\n\
#define GATHER_ELEMENTS_AXIS0_2D(name, data_type) \\\n\
__kernel void gather_elements_axis0_##name##_I32to##name##_2D \\\n\
@@ -9170,6 +10348,144 @@ GATHER_ELEMENTS_AXIS2(F16, vxc_short4)\n\
GATHER_ELEMENTS_AXIS2(I16, vxc_short4)\n\
GATHER_ELEMENTS_AXIS2(I8, vxc_char4)\n\
GATHER_ELEMENTS_AXIS2(U8, vxc_uchar4)\n\
+\n\
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \\\n\
+__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int axis \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\
+ int* index_ptr = (int*)index_tensor.ptr; \\\n\
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\
+ \\\n\
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\
+ data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \\\n\
+ \\\n\
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\
+}\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1)\n\
+\n\
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \\\n\
+__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int axis \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\
+ int* index_ptr = (int*)index_tensor.ptr; \\\n\
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\
+ \\\n\
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\
+ data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \\\n\
+ \\\n\
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\
+}\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1)\n\
+\n\
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \\\n\
+__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int axis \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\
+ int* index_ptr = (int*)index_tensor.ptr; \\\n\
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\
+ \\\n\
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\
+ data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \\\n\
+ \\\n\
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\
+}\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1)\n\
+\n\
+\n\
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \\\n\
+__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_t input0, \\\n\
+ __read_only image2d_t input1, \\\n\
+ __write_only image2d_t output, \\\n\
+ int axis \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ Image index_img = create_image_from_image2d(input1, 4); \\\n\
+ int* index_ptr = (int*)index_img.ptr; \\\n\
+ int index = index_ptr[coord.x + coord.y * width1]; \\\n\
+ \\\n\
+ Image input_img = create_image_from_image2d(input0, stride); \\\n\
+ data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\
+ data_type data = input_ptr[index + coord.y * width0]; \\\n\
+ \\\n\
+ Image output_img = create_image_from_image2d(output, stride); \\\n\
+ data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\
+ output_ptr[coord.x + coord.y * width_out] = data; \\\n\
+}\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, uchar, uchar*, 1)\n\
+\n\
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \\\n\
+__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_t input0, \\\n\
+ __read_only image2d_t input1, \\\n\
+ __write_only image2d_t output, \\\n\
+ int axis \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ Image index_img = create_image_from_image2d(input1, 4); \\\n\
+ int* index_ptr = (int*)index_img.ptr; \\\n\
+ int index = index_ptr[coord.x + coord.y * width1]; \\\n\
+ \\\n\
+ Image input_img = create_image_from_image2d(input0, stride); \\\n\
+ data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\
+ data_type data = input_ptr[coord.x + index * width0]; \\\n\
+ \\\n\
+ Image output_img = create_image_from_image2d(output, stride); \\\n\
+ data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\
+ output_ptr[coord.x + coord.y * width_out] = data; \\\n\
+}\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1)\n\
+\n\
+\n\
"; /* end of gather_elements_vx*/
static const char gather_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
@@ -9198,6 +10514,7 @@ __kernel void gather_##src0_type_name##toF16( \\\n\
\\\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\
int4 indice = read_imagei(input1, coord_in.xy); \\\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\
coord_in.w = gidz * axis_num + indice.x; \\\n\
\\\n\
read_type src; \\\n\
@@ -9234,6 +10551,7 @@ __kernel void gather_F16to##src1_type_name( \\\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\
\\\n\
int4 indice = read_imagei(input1, coord_in.xy); \\\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\
coord_in.w = gidz * axis_num + indice.x; \\\n\
\\\n\
vxc_short8 src; \\\n\
@@ -9266,6 +10584,7 @@ __kernel void gather_I16toF16(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
vxc_short8 src;\n\
@@ -9296,6 +10615,7 @@ __kernel void gather_##src0_type_name##toF16_axis0( \\\n\
{ \\\n\
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
int4 indices = read_imagei(input1, coord.xx); \\\n\
+ indices = indices >= 0 ? indices : indices + axis_num; \\\n\
int2 coord_in = (int2)(indices.x, get_global_id(1)); \\\n\
\\\n\
read_type src; \\\n\
@@ -9327,6 +10647,7 @@ __kernel void gather_F16to##src1_type_name##_axis0( \\\n\
{ \\\n\
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
int4 indices = read_imagei(input1, coord.xx); \\\n\
+ indices = indices >= 0 ? indices : indices + axis_num; \\\n\
int2 coord_in = (int2)(indices.x, get_global_id(1)); \\\n\
\\\n\
vxc_short8 src; \\\n\
@@ -9358,6 +10679,7 @@ __kernel void gather_I16toF16_axis0(\n\
{\n\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
int4 indices = read_imagei(input1, coord.xx);\n\
+ indices = indices >= 0 ? indices : indices + axis_num;\n\
int2 coord_in = (int2)(indices.x, get_global_id(1));\n\
\n\
vxc_short8 src;\n\
@@ -9414,6 +10736,7 @@ __kernel void gather_batch_##src0_type_name##toF16( \\\n\
{ \\\n\
int4 indice = read_imagei(input1, coord_idx); \\\n\
coord_idx.y++; \\\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\
coord_in.y = gidz * axis_num + indice.x; \\\n\
\\\n\
read_type src; \\\n\
@@ -9459,6 +10782,7 @@ __kernel void gather_batch_F16to##src1_type_name( \\\n\
{ \\\n\
int4 indice = read_imagei(input1, coord_idx); \\\n\
coord_idx.y++; \\\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\
coord_in.y = gidz * axis_num + indice.x; \\\n\
\\\n\
vxc_short8 src; \\\n\
@@ -9501,6 +10825,7 @@ __kernel void gather_batch_I16toF16(\n\
{\n\
int4 indice = read_imagei(input1, coord_idx);\n\
coord_idx.y++;\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.y = gidz * axis_num + indice.x;\n\
\n\
vxc_short8 src;\n\
@@ -9526,6 +10851,7 @@ __kernel void gather_batch_##src0_type_name##toF16_axis0( \\\n\
{ \\\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\
int4 indices = read_imagei(input1, coord.xz); \\\n\
+ indices = indices >= 0 ? indices : indices + axis_num; \\\n\
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \\\n\
\\\n\
read_type src; \\\n\
@@ -9560,6 +10886,7 @@ __kernel void gather_batch_F16to##src1_type_name##_axis0( \\\n\
{ \\\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\
int4 indices = read_imagei(input1, coord.xz); \\\n\
+ indices = indices >= 0 ? indices : indices + axis_num; \\\n\
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \\\n\
\\\n\
vxc_short8 src; \\\n\
@@ -9594,6 +10921,7 @@ __kernel void gather_batch_I16toF16_axis0(\n\
{\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
int4 indices = read_imagei(input1, coord.xz);\n\
+ indices = indices >= 0 ? indices : indices + axis_num;\n\
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\
\n\
vxc_short8 src, dst;\n\
@@ -10083,95 +11411,98 @@ static const char gather_nd_batch_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
__kernel void gather_nd_batch_I8toI8_1D(\n\
__read_only image2d_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch\n\
+ int gidy = get_global_id(1); // index num\n\
+ int gidz = get_global_id(2); // batch num\n\
\n\
- int4 coord = (int4)(gidx, gidy, 0, 0);\n\
- Image img = create_image_from_image2d(input1, 4);\n\
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0);\n\
+ Tensor img = create_tensor_from_image2d_array(input1, 4);\n\
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\
int4 indice = ((int4 *)indice_ptr)[0];\n\
-\n\
- coord.z = indice.x * block_size + gidx;\n\
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\
\n\
vxc_char16 src;\n\
- VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
\n\
- VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
}\n\
\n\
__kernel void gather_nd_batch_U8toU8_1D(\n\
__read_only image2d_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch num\n\
+ int gidy = get_global_id(1); // index num\n\
+ int gidz = get_global_id(2); // batch num\n\
\n\
- int4 coord = (int4)(gidx, gidy, 0, 0);\n\
- Image img = create_image_from_image2d(input1, 4);\n\
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0);\n\
+ Tensor img = create_tensor_from_image2d_array(input1, 4);\n\
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\
int4 indice = ((int4 *)indice_ptr)[0];\n\
\n\
- coord.z = indice.x * block_size + gidx;\n\
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\
\n\
vxc_uchar16 src;\n\
- VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
}\n\
\n\
__kernel void gather_nd_batch_I16toI16_1D(\n\
__read_only image2d_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch num\n\
+ int gidy = get_global_id(1); // index num\n\
+ int gidz = get_global_id(2); // batch num\n\
\n\
- int4 coord = (int4)(gidx, gidy, 0, 0);\n\
- Image img = create_image_from_image2d(input1, 4);\n\
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0);\n\
+ Tensor img = create_tensor_from_image2d_array(input1, 4);\n\
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\
int4 indice = ((int4 *)indice_ptr)[0];\n\
\n\
- coord.z = indice.x * block_size + gidx;\n\
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\
\n\
vxc_short8 src;\n\
- VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
}\n\
\n\
__kernel void gather_nd_batch_F16toF16_1D(\n\
__read_only image2d_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch num\n\
+ int gidy = get_global_id(1); // index num\n\
+ int gidz = get_global_id(2); // batch num\n\
\n\
- int4 coord = (int4)(gidx, gidy, 0, 0);\n\
- Image img = create_image_from_image2d(input1, 4);\n\
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0);\n\
+ Tensor img = create_tensor_from_image2d_array(input1, 4);\n\
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\
int4 indice = ((int4 *)indice_ptr)[0];\n\
\n\
- coord.z = indice.x * block_size + gidx;\n\
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\
\n\
vxc_short8 src;\n\
- VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
}\n\
"; /* end of gather_nd_batch_vx*/
@@ -10179,18 +11510,19 @@ static const char gather_nd_batch_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
__kernel void gather_nd_batch_I8toI8_2D(\n\
__read_only image2d_array_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch num\n\
+ int gidy = get_global_id(1); // index num\n\
+ int gidz = get_global_id(2); // batch num\n\
\n\
- int4 coord = (int4)(gidx, 0, gidy, 0);\n\
- Image img = create_image_from_image2d(input1, 4);\n\
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0);\n\
+ Tensor img = create_tensor_from_image2d_array(input1, 4);\n\
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\
int4 indice = ((int4 *)indice_ptr)[0];\n\
\n\
indice.x = indice.x * block_size + gidx;\n\
@@ -10199,23 +11531,24 @@ __kernel void gather_nd_batch_I8toI8_2D(\n\
vxc_char16 src;\n\
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
\n\
- VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
}\n\
\n\
__kernel void gather_nd_U8toU8_2D(\n\
__read_only image2d_array_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch num\n\
+ int gidy = get_global_id(1); // index num\n\
+ int gidz = get_global_id(2); // batch num\n\
\n\
- int4 coord = (int4)(gidx, 0, gidy, 0);\n\
- Image img = create_image_from_image2d(input1, 4);\n\
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0);\n\
+ Tensor img = create_tensor_from_image2d_array(input1, 4);\n\
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\
int4 indice = ((int4 *)indice_ptr)[0];\n\
\n\
indice.x = indice.x * block_size + gidx;\n\
@@ -10223,23 +11556,24 @@ __kernel void gather_nd_U8toU8_2D(\n\
\n\
vxc_uchar16 src;\n\
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
}\n\
\n\
__kernel void gather_nd_I16toI16_2D(\n\
__read_only image2d_array_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch num\n\
+ int gidy = get_global_id(1); // index num\n\
+ int gidz = get_global_id(2); // batch num\n\
\n\
- int4 coord = (int4)(gidx, 0, gidy, 0);\n\
- Image img = create_image_from_image2d(input1, 4);\n\
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0);\n\
+ Tensor img = create_tensor_from_image2d_array(input1, 4);\n\
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\
int4 indice = ((int4 *)indice_ptr)[0];\n\
\n\
indice.x = indice.x * block_size + gidx;\n\
@@ -10247,23 +11581,24 @@ __kernel void gather_nd_I16toI16_2D(\n\
\n\
vxc_short8 src;\n\
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
}\n\
\n\
__kernel void gather_nd_F16toF16_2D(\n\
__read_only image2d_array_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch num\n\
+ int gidy = get_global_id(1); // index num\n\
+ int gidz = get_global_id(2); // batch num\n\
\n\
- int4 coord = (int4)(gidx, 0, gidy, 0);\n\
- Image img = create_image_from_image2d(input1, 4);\n\
- uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0);\n\
+ Tensor img = create_tensor_from_image2d_array(input1, 4);\n\
+ uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\
int4 indice = ((int4 *)indice_ptr)[0];\n\
\n\
indice.x = indice.x * block_size + gidx;\n\
@@ -10271,7 +11606,7 @@ __kernel void gather_nd_F16toF16_2D(\n\
\n\
vxc_short8 src;\n\
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
}\n\
"; /* end of gather_nd_batch_2d_vx*/
@@ -10733,12 +12068,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_array_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
+ int gidx = get_global_id(0); \\\n\
int gidy = get_global_id(1); \\\n\
int gidz = get_global_id(2); \\\n\
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\
+ int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\
src_type src0; \\\n\
dst_type dst; \\\n\
vxc_short8 src1; \\\n\
@@ -10784,7 +12120,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_array_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
int gidz = get_global_id(1); \\\n\
int2 coord = (int2)(get_global_id(0), gidz); \\\n\
@@ -10834,12 +12170,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
+ int gidx = get_global_id(0); \\\n\
int gidy = get_global_id(1); \\\n\
int gidz = get_global_id(2); \\\n\
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\
+ int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\
src_type src0; \\\n\
dst_type dst; \\\n\
float scale_vari, bias_val; \\\n\
@@ -10880,7 +12217,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
int gidz = get_global_id(1); \\\n\
int2 coord = (int2)(get_global_id(0), gidz); \\\n\
@@ -10938,12 +12275,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_array_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
+ int gidx = get_global_id(0); \\\n\
int gidy = get_global_id(1); \\\n\
int gidz = get_global_id(2); \\\n\
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\
+ int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\
src_type src0; \\\n\
vxc_short8 src1, outval; \\\n\
vxc_half8 scale_h, dst; \\\n\
@@ -10996,7 +12334,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_array_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
int gidz = get_global_id(1); \\\n\
int2 coord = (int2)(get_global_id(0), gidz); \\\n\
@@ -11053,12 +12391,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
+ int gidx = get_global_id(0); \\\n\
int gidy = get_global_id(1); \\\n\
int gidz = get_global_id(2); \\\n\
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\
+ int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\
src_type src0; \\\n\
vxc_short8 outval; \\\n\
vxc_half8 dst; \\\n\
@@ -11107,7 +12446,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
int gidz = get_global_id(1); \\\n\
int2 coord = (int2)(get_global_id(0), gidz); \\\n\
@@ -11294,12 +12633,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_array_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
+ int gidx = get_global_id(0); \\\n\
int gidy = get_global_id(1); \\\n\
int gidz = get_global_id(2); \\\n\
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\
+ int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\
vxc_short8 src0; \\\n\
vxc_short8 src1; \\\n\
vxc_half8 scale_h; \\\n\
@@ -11351,7 +12691,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_array_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
int gidz = get_global_id(1); \\\n\
int2 coord = (int2)(get_global_id(0), gidz); \\\n\
@@ -11406,12 +12746,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name(
__read_only image2d_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
+ int gidx = get_global_id(0); \\\n\
int gidy = get_global_id(1); \\\n\
int gidz = get_global_id(2); \\\n\
- int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\
- int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\
+ int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\
vxc_short8 src0; \\\n\
src_type in_h; \\\n\
float scale_vari, bias_val; \\\n\
@@ -11458,7 +12799,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name#
__read_only image2d_t scale, \\\n\
__read_only image2d_t meanVari, \\\n\
__write_only image2d_array_t output, \\\n\
- float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+ float eps, int is2D, float rSpaceOrg, float pStride) \\\n\
{ \\\n\
int gidz = get_global_id(1); \\\n\
int2 coord = (int2)(get_global_id(0), gidz); \\\n\
@@ -12731,8 +14072,8 @@ _viv_uniform VXC_512Bits uniConvertF16_0_4x4;\n\
_viv_uniform VXC_512Bits uniConvertF16_1_4x4;\n\
_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
\n\
-#define GRUCELL_F16_F16TOF16(act_name, act_func) \\\n\
-__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \\\n\
+#define GRUCELL_F16_F16TOF16(act_name, act_func, rec_act_name, rec_act_func) \\\n\
+__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name##_##rec_act_name( \\\n\
__read_only image2d_t hstate_in, \\\n\
__read_only image2d_t input_z_conv, \\\n\
__read_only image2d_t input_r_conv, \\\n\
@@ -12764,15 +14105,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \\\n\
\\\n\
float4 r; \\\n\
VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\
- r = act_func(r); \\\n\
+ r = rec_act_func(r); \\\n\
float4 h0, h1; \\\n\
VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\
VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\
float4 h = h0 + r * h1; \\\n\
float4 z; \\\n\
VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\
- z = act_func(z); \\\n\
- h = tanh_func(h); \\\n\
+ z = rec_act_func(z); \\\n\
+ h = act_func(h); \\\n\
float4 h_tm; \\\n\
VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\
float4 result = (1 - z) * h + z * h_tm; \\\n\
@@ -12785,14 +14126,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \\\n\
VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
-GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\
+GRUCELL_F16_F16TOF16(TANH, tanh_func, SIGMOID, sigmoid_func)\n\
+GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func)\n\
\n\
_viv_uniform float hstate_in_scale;\n\
_viv_uniform float hstate_in_tail;\n\
_viv_uniform float output_scale;\n\
_viv_uniform float output_zp;\n\
-#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \\\n\
-__kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name( \\\n\
+#define GRUCELL_QNT_F16TO_QNT(name, act_func, rec_act_func, src0_type, dst_type) \\\n\
+__kernel void grucell_reset_after_activation_##name( \\\n\
__read_only image2d_t hstate_in, \\\n\
__read_only image2d_t input_z_conv, \\\n\
__read_only image2d_t input_r_conv, \\\n\
@@ -12824,15 +14166,15 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name
\\\n\
float4 r; \\\n\
VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\
- r = act_func(r); \\\n\
+ r = rec_act_func(r); \\\n\
float4 h0, h1; \\\n\
VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\
VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\
float4 h = h0 + r * h1; \\\n\
float4 z; \\\n\
VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\
- z = act_func(z); \\\n\
- h = tanh_func(h); \\\n\
+ z = rec_act_func(z); \\\n\
+ h = act_func(h); \\\n\
float4 h_tm; \\\n\
VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\
h_tm = h_tm * hstate_in_scale + hstate_in_tail; \\\n\
@@ -12845,9 +14187,12 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name
VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
-GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\
-GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\
-GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\n\
+GRUCELL_QNT_F16TO_QNT(U8_F16toU8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\
+GRUCELL_QNT_F16TO_QNT(I8_F16toI8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_char8, vxc_char8)\n\
+GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_short8, vxc_short8)\n\
+GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\
+GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8)\n\
+GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8)\n\
"; /* end of grucell_reset_after_activation_vx*/
static const char hswish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
@@ -21791,6 +23136,432 @@ __kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA,\n\
}\n\
"; /* end of matrixmul_bf16_vx*/
+static const char matrixmul_cross_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform float output_ZP;\n\
+_viv_uniform float mulKIn0In1Zp;\n\
+_viv_uniform float inOutScale;\n\
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\
+\n\
+_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;\n\
+_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;\n\
+\n\
+_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\
+\n\
+#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \\\n\
+__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \\\n\
+ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \\\n\
+ int axis_size, int inner_size, int outer_size, int axis_size0, \\\n\
+ int inner_size0, int outer_size0, int axis_size1, int inner_size1, \\\n\
+ int outer_size1, int axis_size2, int inner_size2, int outer_size2) \\\n\
+{ \\\n\
+ read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \\\n\
+ vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \\\n\
+ int gidz = get_global_id(2); \\\n\
+ for(int j = 0; j < outer_size; j++) \\\n\
+ { \\\n\
+ for(int i = 0; i < inner_size; i++) \\\n\
+ { \\\n\
+ vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \\\n\
+ int4 coord_a = (int4)(0, get_global_id(1), gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \\\n\
+ int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \\\n\
+ int8 inputA_desc, inputB_desc, output_desc; \\\n\
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\
+ _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\
+ _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\
+ { \\\n\
+ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\
+ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\
+ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\
+ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_a.x += 4; coord_b.y += 4; \\\n\
+ VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8toFp32Block4_4x4); \\\n\
+ VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8toFp32Block4_4x4); \\\n\
+ VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8toFp32Block4_4x4); \\\n\
+ VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8toFp32Block4_4x4); \\\n\
+ VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8MulZptoFp32_8x4); \\\n\
+ VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8MulZptoFp32_8x4); \\\n\
+ VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8MulZptoFp32_8x4); \\\n\
+ VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8MulZptoFp32_8x4); \\\n\
+ sum0 += tempA0 + tempB0; \\\n\
+ sum1 += tempA1 + tempB1; \\\n\
+ sum2 += tempA2 + tempB2; \\\n\
+ sum3 += tempA3 + tempB3; \\\n\
+ } \\\n\
+ vxc_int4 tmpOut0, tmpOut1; \\\n\
+ coord_b.y = get_global_id(1); \\\n\
+ coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \\\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\
+ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\
+ tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \\\n\
+ tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \\\n\
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \\\n\
+ tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \\\n\
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+}\n\
+GEMM_QINT_TO_QINT_CROSS(U8, vxc_uchar16)\n\
+GEMM_QINT_TO_QINT_CROSS(I8, vxc_char16)\n\
+\n\
+__kernel void gemm_F16F16toF16_cross(image2d_array_t inputA,\n\
+ image2d_array_t inputB, image2d_array_t output,\n\
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N,\n\
+ int axis_size, int inner_size, int outer_size, int axis_size0,\n\
+ int inner_size0, int outer_size0, int axis_size1, int inner_size1,\n\
+ int outer_size1, int axis_size2, int inner_size2, int outer_size2)\n\
+{\n\
+ uint gidy = get_global_id(1);\n\
+ uint gidz = get_global_id(2);\n\
+ for(int j = 0; j < outer_size; j++)\n\
+ {\n\
+ for(int i = 0; i < inner_size; i++)\n\
+ {\n\
+ int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0);\n\
+ int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0);\n\
+\n\
+ half4 valC;\n\
+ vxc_short8 srcA0, srcA1, srcA2, srcA3, outC;\n\
+ vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3;\n\
+ vxc_short16 srcB;\n\
+ vxc_half16 tmpB;\n\
+ vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\
+ vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\
+\n\
+ int8 inputA_desc, inputB_desc, output_desc;\n\
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\
+ _viv_asm(MOV, coord_a.w, baseAddr_a);\n\
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\
+ _viv_asm(MOV, coord_b.w, baseAddr_b);\n\
+\n\
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\
+ {\n\
+ vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+ coord_a.x += 4; coord_b.y += 4;\n\
+ _viv_asm(COPY, tmpA0, srcA0, 16);\n\
+ _viv_asm(COPY, tmpA1, srcA1, 16);\n\
+ _viv_asm(COPY, tmpA2, srcA2, 16);\n\
+ _viv_asm(COPY, tmpA3, srcA3, 16);\n\
+ _viv_asm(COPY, tmpB.hi, srcB.hi, 16);\n\
+ _viv_asm(COPY, tmpB.lo, srcB.lo, 16);\n\
+ VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmU8F16toF32Lo_4x4b);\n\
+ VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmU8F16toF32Lo_4x4b);\n\
+ VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmU8F16toF32Lo_4x4b);\n\
+ VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmU8F16toF32Lo_4x4b);\n\
+ sum0 += (tempA0);\n\
+ sum1 += (tempA1);\n\
+ sum2 += (tempA2);\n\
+ sum3 += (tempA3);\n\
+ }\n\
+ coord_b.y = gidy;\n\
+ coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2;\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;\n\
+ _viv_asm(MOV, coord_b.w, baseAddr);\n\
+ _viv_asm(CONV, valC, sum0);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ coord_b.y++;\n\
+ _viv_asm(CONV, valC, sum1);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ coord_b.y++;\n\
+ _viv_asm(CONV, valC, sum2);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ coord_b.y++;\n\
+ _viv_asm(CONV, valC, sum3);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ }\n\
+ }\n\
+}\n\
+"; /* end of matrixmul_cross_vx*/
+
+static const char matrixmul_cross_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform int input0_ZP;\n\
+_viv_uniform int input1_ZP;\n\
+_viv_uniform float output_ZP;\n\
+_viv_uniform float outputScale;\n\
+_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\
+_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4;\n\
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\
+_viv_uniform int ac2zero;\n\
+_viv_uniform int bc2zero;\n\
+\n\
+_viv_uniform int outer;\n\
+\n\
+#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \\\n\
+__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \\\n\
+ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\
+{ \\\n\
+ uint gidy = get_global_id(1); \\\n\
+ short in0_zp, in1_zp; \\\n\
+ _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\
+ _viv_asm(COPY, in1_zp, input1_ZP, 4); \\\n\
+ for(int i = 0; i < outer; i++) \\\n\
+ { \\\n\
+ read_type srcA, srcB, outC; \\\n\
+ int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0); \\\n\
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); \\\n\
+ vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\
+ vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\
+ \\\n\
+ int8 inputA_desc, inputB_desc, output_desc; \\\n\
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\
+ _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\
+ _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\
+ \\\n\
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\
+ { \\\n\
+ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\
+ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32_4x4); \\\n\
+ VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32B_4x4); \\\n\
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32_4x4); \\\n\
+ VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32B_4x4); \\\n\
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32_4x4); \\\n\
+ VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32B_4x4); \\\n\
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_a.x += 4; \\\n\
+ coord_b.y += 4; \\\n\
+ VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32_4x4); \\\n\
+ VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32B_4x4); \\\n\
+ sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \\\n\
+ sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \\\n\
+ sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \\\n\
+ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\
+ } \\\n\
+ vxc_int4 tmpOut0, tmpOut1; \\\n\
+ coord_b.y = gidy; \\\n\
+ coord_b.z = get_global_id(2) + i * get_global_size(2); \\\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\
+ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\
+ tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\
+ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\
+ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+}\n\
+GEMM_QINT_TO_QINT_MERGE(I16, vxc_short8)\n\
+\n\
+#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \\\n\
+__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \\\n\
+ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \\\n\
+ int axis_size, int inner_size, int outer_size, int axis_size0, \\\n\
+ int inner_size0, int outer_size0, int axis_size1, int inner_size1, \\\n\
+ int outer_size1, int axis_size2, int inner_size2, int outer_size2) \\\n\
+{ \\\n\
+ uint gidy = get_global_id(1); \\\n\
+ uint gidz = get_global_id(2); \\\n\
+ short in0_zp, in1_zp; \\\n\
+ _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\
+ _viv_asm(COPY, in1_zp, input1_ZP, 4); \\\n\
+ for(int j = 0; j < outer_size; j++) \\\n\
+ { \\\n\
+ for(int i = 0; i < inner_size; i++) \\\n\
+ { \\\n\
+ read_type srcA, srcB, outC; \\\n\
+ int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \\\n\
+ int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \\\n\
+ vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\
+ vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\
+ \\\n\
+ int8 inputA_desc, inputB_desc, output_desc; \\\n\
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\
+ _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\
+ _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\
+ \\\n\
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\
+ { \\\n\
+ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\
+ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32_4x4); \\\n\
+ VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32B_4x4); \\\n\
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32_4x4); \\\n\
+ VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32B_4x4); \\\n\
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32_4x4); \\\n\
+ VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32B_4x4); \\\n\
+ VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_a.x += 4; \\\n\
+ coord_b.y += 4; \\\n\
+ VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32_4x4); \\\n\
+ VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniConvertUint8SubZpToFp32B_4x4); \\\n\
+ sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \\\n\
+ sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \\\n\
+ sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \\\n\
+ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\
+ } \\\n\
+ vxc_int4 tmpOut0, tmpOut1; \\\n\
+ coord_b.y = gidy; \\\n\
+ coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \\\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\
+ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\
+ tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\
+ tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\
+ tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ } \\\n\
+}\n\
+GEMM_QINT_TO_QINT_CROSS(I16, vxc_short8)\n\
+"; /* end of matrixmul_cross_i16_vx*/
+
static const char matrixmul_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\
@@ -23001,6 +24772,302 @@ __kernel void gemm_transb_I16I16toI16(image2d_array_t inputA,\n\
}\n\
"; /* end of matrixmul_i16_vx*/
+static const char matrixmul_merge_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform float output_ZP;\n\
+_viv_uniform float mulKIn0In1Zp;\n\
+_viv_uniform float inOutScale;\n\
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\
+_viv_uniform int ac2zero;\n\
+_viv_uniform int bc2zero;\n\
+_viv_uniform int outer;\n\
+\n\
+_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;\n\
+_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;\n\
+\n\
+_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Lo_4x4;\n\
+_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Hi_4x4;\n\
+_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Lo_4x4;\n\
+_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Hi_4x4;\n\
+\n\
+_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\
+\n\
+#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \\\n\
+__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \\\n\
+ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\
+{ \\\n\
+ read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \\\n\
+ vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \\\n\
+ for(int i = 0; i < outer; i++) \\\n\
+ { \\\n\
+ vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \\\n\
+ int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \\\n\
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); \\\n\
+ int8 inputA_desc, inputB_desc, output_desc; \\\n\
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\
+ _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\
+ _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\
+ { \\\n\
+ vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\
+ vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\
+ VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\
+ VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_a.x += 4; coord_b.y += 4; \\\n\
+ VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8toFp32Block4_4x4); \\\n\
+ VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8toFp32Block4_4x4); \\\n\
+ VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8toFp32Block4_4x4); \\\n\
+ VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8toFp32Block4_4x4); \\\n\
+ VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8MulZptoFp32_8x4); \\\n\
+ VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8MulZptoFp32_8x4); \\\n\
+ VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8MulZptoFp32_8x4); \\\n\
+ VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGemmU8U8MulZptoFp32_8x4); \\\n\
+ sum0 += tempA0 + tempB0; \\\n\
+ sum1 += tempA1 + tempB1; \\\n\
+ sum2 += tempA2 + tempB2; \\\n\
+ sum3 += tempA3 + tempB3; \\\n\
+ } \\\n\
+ vxc_int4 tmpOut0, tmpOut1; \\\n\
+ coord_b.y = get_global_id(1); \\\n\
+ coord_b.z = get_global_id(2) + i * get_global_size(2); \\\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\
+ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\
+ tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \\\n\
+ tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \\\n\
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \\\n\
+ tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \\\n\
+ VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_b.y++; \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\
+ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+}\n\
+GEMM_QINT_TO_QINT_MERGE(U8, vxc_uchar16)\n\
+GEMM_QINT_TO_QINT_MERGE(I8, vxc_char16)\n\
+\n\
+#if (VX_VERSION==2)\n\
+__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA,\n\
+ image2d_array_t inputB, image2d_array_t output,\n\
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)\n\
+{\n\
+ uint gidy = get_global_id(1);\n\
+ for(int i = 0; i < outer; i++)\n\
+ {\n\
+ int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0);\n\
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0);\n\
+\n\
+ half4 valC;\n\
+ vxc_short8 srcA0, srcA1, srcA2, srcA3, outC;\n\
+ vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3;\n\
+ vxc_short16 srcB;\n\
+ vxc_half16 tmpB;\n\
+ vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\
+ vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\
+\n\
+ int8 inputA_desc, inputB_desc, output_desc;\n\
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\
+ _viv_asm(MOV, coord_a.w, baseAddr_a);\n\
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\
+ _viv_asm(MOV, coord_b.w, baseAddr_b);\n\
+\n\
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\
+ {\n\
+ vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+ coord_a.x += 4; coord_b.y += 4;\n\
+ _viv_asm(COPY, tmpA0, srcA0, 16);\n\
+ _viv_asm(COPY, tmpA1, srcA1, 16);\n\
+ _viv_asm(COPY, tmpA2, srcA2, 16);\n\
+ _viv_asm(COPY, tmpA3, srcA3, 16);\n\
+ _viv_asm(COPY, tmpB.hi, srcB.hi, 16);\n\
+ _viv_asm(COPY, tmpB.lo, srcB.lo, 16);\n\
+ VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmU8F16toF32Lo_4x4b);\n\
+ VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmU8F16toF32Lo_4x4b);\n\
+ VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmU8F16toF32Lo_4x4b);\n\
+ VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmU8F16toF32Lo_4x4b);\n\
+ sum0 += (tempA0);\n\
+ sum1 += (tempA1);\n\
+ sum2 += (tempA2);\n\
+ sum3 += (tempA3);\n\
+ }\n\
+ coord_b.y = gidy;\n\
+ coord_b.z = get_global_id(2) + i * get_global_size(2);\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;\n\
+ _viv_asm(MOV, coord_b.w, baseAddr);\n\
+ _viv_asm(CONV, valC, sum0);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ coord_b.y++;\n\
+ _viv_asm(CONV, valC, sum1);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ coord_b.y++;\n\
+ _viv_asm(CONV, valC, sum2);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ coord_b.y++;\n\
+ _viv_asm(CONV, valC, sum3);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ }\n\
+}\n\
+#else\n\
+__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA,\n\
+ image2d_array_t inputB, image2d_array_t output,\n\
+ int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)\n\
+{\n\
+ uint gidy = get_global_id(1);\n\
+ for(int i = 0; i < outer; i++)\n\
+ {\n\
+ int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0);\n\
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0);\n\
+\n\
+ half4 valC;\n\
+ vxc_short8 srcA0, srcB0, srcA1, srcB1, outC;\n\
+ vxc_half8 tmpA0, tmpB0, tmpA1, tmpB1;\n\
+ vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\
+ vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\
+\n\
+ int8 inputA_desc, inputB_desc, output_desc;\n\
+ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\
+ int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\
+ _viv_asm(MOV, coord_a.w, baseAddr_a);\n\
+ _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\
+ int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\
+ _viv_asm(MOV, coord_b.w, baseAddr_b);\n\
+\n\
+ for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\
+ {\n\
+ vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\
+ vxc_float4 tempB0, tempB1, tempB2, tempB3;\n\
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\
+ VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+ coord_a.x += 4; coord_b.y += 4;\n\
+ _viv_asm(COPY, tmpA0, srcA0, 16);\n\
+ _viv_asm(COPY, tmpB0, srcB0, 16);\n\
+ _viv_asm(COPY, tmpA1, srcA1, 16);\n\
+ _viv_asm(COPY, tmpB1, srcB1, 16);\n\
+\n\
+ VXC_DP4x4(tempA0, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmFp16toFp32Row0Lo_4x4);\n\
+ VXC_DP4x4(tempB0, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmFp16toFp32Row0Hi_4x4);\n\
+ VXC_DP4x4(tempA1, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmFp16toFp32Row1Lo_4x4);\n\
+ VXC_DP4x4(tempB1, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmFp16toFp32Row1Hi_4x4);\n\
+ VXC_DP4x4(tempA2, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmFp16toFp32Row0Lo_4x4);\n\
+ VXC_DP4x4(tempB2, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmFp16toFp32Row0Hi_4x4);\n\
+ VXC_DP4x4(tempA3, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmFp16toFp32Row1Lo_4x4);\n\
+ VXC_DP4x4(tempB3, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniGemmFp16toFp32Row1Hi_4x4);\n\
+ sum0 += (tempA0 + tempB0);\n\
+ sum1 += (tempA1 + tempB1);\n\
+ sum2 += (tempA2 + tempB2);\n\
+ sum3 += (tempA3 + tempB3);\n\
+ }\n\
+ coord_b.y = gidy;\n\
+ coord_b.z = get_global_id(2) + i * get_global_size(2);\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\
+ int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;\n\
+ _viv_asm(MOV, coord_b.w, baseAddr);\n\
+ _viv_asm(CONV, valC, sum0);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ coord_b.y++;\n\
+ _viv_asm(CONV, valC, sum1);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ coord_b.y++;\n\
+ _viv_asm(CONV, valC, sum2);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ coord_b.y++;\n\
+ _viv_asm(CONV, valC, sum3);\n\
+ _viv_asm(COPY, outC, valC, 16);\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+ }\n\
+}\n\
+#endif\n\
+"; /* end of matrixmul_merge_vx*/
+
static const char matrixmul_transA_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform int input0_ZP;\n\
@@ -27977,6 +30044,791 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_BF1
}\n\
}"; /* end of moments_u8_axis012_vx*/
+static const char nearest_grid_sample_BF16_to_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform float2 half_input0_wh;\n\
+_viv_uniform float2 add_float_value;\n\
+_viv_uniform int depth;\n\
+\n\
+_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8;\n\
+_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8;\n\
+\n\
+#define GRID_SAMPLE_BF16_PROCESS() \\\n\
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\
+ int4 x_idx = convert_int4(in_x); \\\n\
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\
+ int4 y_idx = convert_int4(in_y); \\\n\
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\
+ int8 input_desc; \\\n\
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\
+ int baseAddr = input_desc.s0; \\\n\
+ _viv_asm(MOV, coord_in.w, baseAddr); \\\n\
+ vxc_short8 src; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ int8 output_desc; \\\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\
+ _viv_asm(MOV, coord_out.w, baseAddr); \\\n\
+ int loop = depth - 1; \\\n\
+ while (coord_in.z < loop) \\\n\
+ { \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\
+ coord_in.x = x_idx.x; \\\n\
+ coord_in.y = y_idx.x; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+\n\
+\n\
+\n\
+\n\
+__kernel void nearest_grid_sample_BF16_BF16toBF16(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+\n\
+ vxc_short8 read_val;\n\
+ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\
+ VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+\n\
+ float4 fxy0;\n\
+ float4 fxy1;\n\
+\n\
+ vxc_short8 read_src;\n\
+ VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8);\n\
+ _viv_asm(COPY, fxy0, read_src, 16);\n\
+ VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8);\n\
+ _viv_asm(COPY, fxy1, read_src, 16);\n\
+\n\
+\n\
+\n\
+ GRID_SAMPLE_BF16_PROCESS();\n\
+\n\
+}\n\
+"; /* end of nearest_grid_sample_BF16_to_BF16_vx*/
+
+static const char nearest_grid_sample_F16_to_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform float2 half_input0_wh;\n\
+_viv_uniform float2 add_float_value;\n\
+_viv_uniform int depth;\n\
+_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;\n\
+_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;\n\
+_viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\
+\n\
+#define GRID_SAMPLE_F16_PROCESS() \\\n\
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\
+ int4 x_idx = convert_int4(in_x); \\\n\
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\
+ int4 y_idx = convert_int4(in_y); \\\n\
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\
+ int8 input_desc; \\\n\
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\
+ int baseAddr = input_desc.s0; \\\n\
+ _viv_asm(MOV, coord_in.w, baseAddr); \\\n\
+ vxc_short8 src; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ int8 output_desc; \\\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\
+ _viv_asm(MOV, coord_out.w, baseAddr); \\\n\
+ int loop = depth - 1; \\\n\
+ while (coord_in.z < loop) \\\n\
+ { \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\
+ coord_in.x = x_idx.x; \\\n\
+ coord_in.y = y_idx.x; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+\n\
+\n\
+\n\
+__kernel void nearest_grid_sample_F16_F32toF16(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+ coord_in1.z = coord_in1.z + 4;\n\
+\n\
+ float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\
+ float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\
+\n\
+ GRID_SAMPLE_F16_PROCESS();\n\
+\n\
+}\n\
+\n\
+_viv_uniform int input1_ZP;\n\
+_viv_uniform float input1Scale;\n\
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\
+\n\
+__kernel void nearest_grid_sample_F16_U8toF16(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+ vxc_uchar16 read_coord;\n\
+ VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+ float4 fxy0;\n\
+ float4 fxy1;\n\
+ unsigned char input1ZP;\n\
+ _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\
+ VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\
+ VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\
+ fxy0 = fxy0 * input1Scale;\n\
+ fxy1 = fxy1 * input1Scale;\n\
+\n\
+ GRID_SAMPLE_F16_PROCESS();\n\
+\n\
+}\n\
+\n\
+\n\
+_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\
+_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\
+\n\
+__kernel void nearest_grid_sample_F16_F16toF16(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+\n\
+ vxc_short8 read_val;\n\
+ vxc_half8 read_coord;\n\
+\n\
+ VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ _viv_asm(COPY, read_coord, read_val, 16);\n\
+\n\
+ float4 fxy0;\n\
+ float4 fxy1;\n\
+\n\
+ VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\
+ VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\
+\n\
+ GRID_SAMPLE_F16_PROCESS();\n\
+\n\
+}\n\
+"; /* end of nearest_grid_sample_F16_to_F16_vx*/
+
+static const char nearest_grid_sample_F16_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform float2 half_input0_wh;\n\
+_viv_uniform float2 add_float_value;\n\
+_viv_uniform int depth;\n\
+\n\
+_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\
+_viv_uniform float uint8Scale;\n\
+_viv_uniform float output_ZP;\n\
+\n\
+_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\
+_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\
+\n\
+#define GRID_SAMPLE_F16_to_U8_PROCESS() \\\n\
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\
+ int4 x_idx = convert_int4(in_x); \\\n\
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\
+ int4 y_idx = convert_int4(in_y); \\\n\
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\
+ int8 input_desc; \\\n\
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\
+ int baseAddr = input_desc.s0; \\\n\
+ _viv_asm(MOV, coord_in.w, baseAddr); \\\n\
+ vxc_short8 s0; \\\n\
+ vxc_uchar16 result; \\\n\
+ vxc_half8 src; \\\n\
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src, s0, 16); \\\n\
+ int8 output_desc; \\\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\
+ _viv_asm(MOV, coord_out.w, baseAddr); \\\n\
+ int loop = depth - 1; \\\n\
+ float4 dst4; \\\n\
+ int4 dst; \\\n\
+ while (coord_in.z < loop) \\\n\
+ { \\\n\
+ VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \\\n\
+ dst4 = dst4 * uint8Scale + output_ZP; \\\n\
+ dst = convert_int4_rte(dst4); \\\n\
+ VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\
+ result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\
+ coord_in.x = x_idx.x; \\\n\
+ coord_in.y = y_idx.x; \\\n\
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src, s0, 16); \\\n\
+ } \\\n\
+ VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \\\n\
+ dst4 = dst4 * uint8Scale + output_ZP; \\\n\
+ dst = convert_int4_rte(dst4); \\\n\
+ VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+\n\
+\n\
+__kernel void nearest_grid_sample_F16_F32toU8(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+ coord_in1.z = coord_in1.z + 4;\n\
+\n\
+ float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\
+ float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\
+ GRID_SAMPLE_F16_to_U8_PROCESS();\n\
+\n\
+}\n\
+\n\
+_viv_uniform int input1_ZP;\n\
+_viv_uniform float input1Scale;\n\
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\
+\n\
+\n\
+__kernel void nearest_grid_sample_F16_U8toU8(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+\n\
+ vxc_uchar16 read_coord;\n\
+\n\
+ VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ float4 fxy0;\n\
+ float4 fxy1;\n\
+\n\
+ unsigned char input1ZP;\n\
+ _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\
+\n\
+ VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\
+ VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\
+\n\
+ fxy0 = fxy0 * input1Scale;\n\
+ fxy1 = fxy1 * input1Scale;\n\
+\n\
+ GRID_SAMPLE_F16_to_U8_PROCESS();\n\
+\n\
+}\n\
+\n\
+\n\
+__kernel void nearest_grid_sample_F16_F16toU8(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+\n\
+ vxc_short8 read_val;\n\
+ vxc_half8 read_coord;\n\
+\n\
+ VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ _viv_asm(COPY, read_coord, read_val, 16);\n\
+\n\
+ float4 fxy0;\n\
+ float4 fxy1;\n\
+\n\
+ VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\
+ VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\
+\n\
+ GRID_SAMPLE_F16_to_U8_PROCESS();\n\
+\n\
+}\n\
+\n\
+"; /* end of nearest_grid_sample_F16_to_U8_vx*/
+
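For reference, the per-pixel arithmetic of GRID_SAMPLE_F16_to_U8_PROCESS reduces to a nearest-neighbour gather followed by an affine requantisation. The C sketch below mirrors that math under the assumption that half_input0_wh/add_float_value encode the usual normalised-grid-to-pixel mapping (including the rounding bias) and that uint8Scale/output_ZP are the output tensor's affine quantisation parameters; the helper name and the coordinate clamp are illustrative, not part of the kernel.

    #include <math.h>
    #include <stdint.h>

    /* Sketch of one output sample (assumption: the uniforms carry the usual
     * grid_sample mapping and affine U8 quantisation parameters). */
    static uint8_t nearest_sample_f16_to_u8(const float *plane, int W, int H,
                                            float gx, float gy,         /* grid value in [-1, 1] */
                                            float half_w, float half_h, /* half_input0_wh        */
                                            float add_x, float add_y,   /* add_float_value       */
                                            float uint8Scale, float output_ZP)
    {
        /* fxy = fxy * half_input0_wh + add_float_value; convert_int4() truncates. */
        int x = (int)(gx * half_w + add_x);
        int y = (int)(gy * half_h + add_y);
        if (x < 0) x = 0; else if (x > W - 1) x = W - 1;  /* clamp: illustrative only */
        if (y < 0) y = 0; else if (y > H - 1) y = H - 1;
        float v = plane[(size_t)y * W + x];
        /* dst4 = dst4 * uint8Scale + output_ZP; convert_int4_rte() rounds to nearest. */
        long r = lrintf(v * uint8Scale + output_ZP);
        return (uint8_t)(r < 0 ? 0 : (r > 255 ? 255 : r));
    }

The kernel repeats this gather for every slice along depth, reusing the same x/y indices while stepping coord_in.z and coord_out.z through the loop.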
+static const char nearest_grid_sample_I16_to_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform float2 half_input0_wh;\n\
+_viv_uniform float2 add_float_value;\n\
+_viv_uniform int depth;\n\
+\n\
+_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\
+_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\
+_viv_uniform float input1_scale;\n\
+_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;\n\
+\n\
+\n\
+#define GRID_SAMPLE_I16_PROCESS() \\\n\
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\
+ int4 x_idx = convert_int4(in_x); \\\n\
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\
+ int4 y_idx = convert_int4(in_y); \\\n\
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\
+ int8 input_desc; \\\n\
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\
+ int baseAddr = input_desc.s0; \\\n\
+ _viv_asm(MOV, coord_in.w, baseAddr); \\\n\
+ vxc_short8 src, dst; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ int8 output_desc; \\\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\
+ _viv_asm(MOV, coord_out.w, baseAddr); \\\n\
+ int loop = depth - 1; \\\n\
+ while (coord_in.z < loop) \\\n\
+ { \\\n\
+ VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\
+ coord_in.x = x_idx.x; \\\n\
+ coord_in.y = y_idx.x; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+\n\
+\n\
+\n\
+__kernel void nearest_grid_sample_I16_I16toI16(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+ vxc_short8 read_coord;\n\
+ VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ float4 fxy0;\n\
+ float4 fxy1;\n\
+\n\
+ VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\
+ VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\
+\n\
+ fxy0 = fxy0 * input1_scale;\n\
+ fxy1 = fxy1 * input1_scale;\n\
+\n\
+ GRID_SAMPLE_I16_PROCESS();\n\
+\n\
+}\n\
+"; /* end of nearest_grid_sample_I16_to_I16_vx*/
+
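The I16 variant differs from the F16 paths only in how the grid is decoded: the int16 grid values are treated as dynamic fixed point and rescaled to float with input1_scale before the shared coordinate mapping. A minimal sketch, assuming input1_scale = 2^-fl for a host-chosen fractional length fl:

    #include <stdint.h>

    /* fxy = fxy * input1_scale in the kernel; fl is the fixed-point fractional
     * length of the int16 grid tensor (host-side assumption). */
    static float dfp16_to_float(int16_t stored, float input1_scale)
    {
        return (float)stored * input1_scale;
    }

With fl = 15, for example, a stored value of 16384 decodes to 0.5 and -32768 to -1.0.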
+static const char nearest_grid_sample_I8_to_I8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform float2 half_input0_wh;\n\
+_viv_uniform float2 add_float_value;\n\
+_viv_uniform int depth;\n\
+\n\
+\n\
+_viv_uniform float input1_scale;\n\
+_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\
+_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\
+_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;\n\
+\n\
+#define GRID_SAMPLE_I8_PROCESS() \\\n\
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\
+ int4 x_idx = convert_int4(in_x); \\\n\
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\
+ int4 y_idx = convert_int4(in_y); \\\n\
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\
+ int8 input_desc; \\\n\
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\
+ int baseAddr = input_desc.s0; \\\n\
+ _viv_asm(MOV, coord_in.w, baseAddr); \\\n\
+ vxc_char16 src, dst; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ int8 output_desc; \\\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\
+ _viv_asm(MOV, coord_out.w, baseAddr); \\\n\
+ int loop = depth - 1; \\\n\
+ while (coord_in.z < loop) \\\n\
+ { \\\n\
+ VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\
+ coord_in.x = x_idx.x; \\\n\
+ coord_in.y = y_idx.x; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+\n\
+\n\
+__kernel void nearest_grid_sample_I8_I8toI8(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+ vxc_char16 read_coord;\n\
+ VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ float4 fxy0;\n\
+ float4 fxy1;\n\
+\n\
+ VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\
+ VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\
+\n\
+ fxy0 = fxy0 * input1_scale;\n\
+ fxy1 = fxy1 * input1_scale;\n\
+\n\
+ GRID_SAMPLE_I8_PROCESS();\n\
+\n\
+}\n\
+"; /* end of nearest_grid_sample_I8_to_I8_vx*/
+
+static const char nearest_grid_sample_U8_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform float2 half_input0_wh;\n\
+_viv_uniform float2 add_float_value;\n\
+_viv_uniform int depth;\n\
+\n\
+_viv_uniform int input1_ZP;\n\
+_viv_uniform float input1Scale;\n\
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\
+_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\
+\n\
+_viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;\n\
+_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\
+\n\
+#define GRID_SAMPLE_U8_PROCESS() \\\n\
+ fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\
+ float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\
+ int4 x_idx = convert_int4(in_x); \\\n\
+ float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\
+ int4 y_idx = convert_int4(in_y); \\\n\
+ int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\
+ int8 input_desc; \\\n\
+ _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\
+ int baseAddr = input_desc.s0; \\\n\
+ _viv_asm(MOV, coord_in.w, baseAddr); \\\n\
+ vxc_uchar16 src, dst; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ int8 output_desc; \\\n\
+ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\
+ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\
+ _viv_asm(MOV, coord_out.w, baseAddr); \\\n\
+ int loop = depth - 1; \\\n\
+ vxc_ushort8 multiplier; \\\n\
+ _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\
+ while (coord_in.z < loop) \\\n\
+ { \\\n\
+ VXC_DP2x8(dst, src, multiplier, \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\
+ coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\
+ coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\
+ coord_in.x = x_idx.x; \\\n\
+ coord_in.y = y_idx.x; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.y; \\\n\
+ coord_in.y = y_idx.y; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.z; \\\n\
+ coord_in.y = y_idx.z; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = x_idx.w; \\\n\
+ coord_in.y = y_idx.w; \\\n\
+ VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\
+ VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ } \\\n\
+ VXC_DP2x8(dst, src, multiplier, \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \\\n\
+ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\
+\n\
+\n\
+__kernel void nearest_grid_sample_U8_F32toU8(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+ coord_in1.z = coord_in1.z + 4;\n\
+\n\
+ float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\
+ float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\
+ GRID_SAMPLE_U8_PROCESS();\n\
+\n\
+}\n\
+\n\
+\n\
+__kernel void nearest_grid_sample_U8_U8toU8(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+\n\
+ vxc_uchar16 read_coord;\n\
+\n\
+ VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ float4 fxy0;\n\
+ float4 fxy1;\n\
+\n\
+ unsigned char input1ZP;\n\
+ _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\
+\n\
+ VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\
+ VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\
+\n\
+ fxy0 = fxy0 * input1Scale;\n\
+ fxy1 = fxy1 * input1Scale;\n\
+\n\
+ GRID_SAMPLE_U8_PROCESS();\n\
+\n\
+}\n\
+\n\
+_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\
+_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\
+\n\
+__kernel void nearest_grid_sample_U8_F16toU8(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ int align_corners)\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in1 = coord_out.xyxy;\n\
+\n\
+ coord_in1.xz = coord_in1.xz * 2;\n\
+\n\
+ vxc_short8 read_val;\n\
+ vxc_half8 read_coord;\n\
+\n\
+ VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ _viv_asm(COPY, read_coord, read_val, 16);\n\
+\n\
+ float4 fxy0;\n\
+ float4 fxy1;\n\
+\n\
+ VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\
+ VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\
+\n\
+ GRID_SAMPLE_U8_PROCESS();\n\
+\n\
+}\n\
+\n\
+"; /* end of nearest_grid_sample_U8_to_U8_vx*/
+
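In the U8-to-U8 path the requantisation uses an integer multiplier plus output zero-point packed into multAndoutZP (per the comment above: multiplier in the low bits, output zero-point in the high word) together with a post-shift baked into uniMultiplyAndPostShift_2x8. The scalar sketch below shows the multiply-and-post-shift scheme this is assumed to implement; the rounding convention, the clamp, and the postShift/out_zp parameter names are assumptions, and any input zero-point handling folded into the dot-product uniform is omitted.

    #include <stdint.h>

    /* Assumed reference for the multiply-and-post-shift requantisation:
     * dst = clamp(((src * M) rounded and shifted right by postShift) + out_zp, 0, 255). */
    static uint8_t requant_u8(uint8_t src, uint16_t M, int postShift, int out_zp)
    {
        int32_t acc = (int32_t)src * (int32_t)M;
        if (postShift > 0)
            acc = (acc + (1 << (postShift - 1))) >> postShift;  /* round to nearest */
        acc += out_zp;
        if (acc < 0)   acc = 0;
        if (acc > 255) acc = 255;
        return (uint8_t)acc;
    }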
static const char one_hot_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform VXC_512Bits uniDataConvert_0_4x4;\n\
@@ -29077,8 +31929,8 @@ __kernel void pow_##name \\\n\
\\\n\
src0_type src0; \\\n\
copy0_type data0; \\\n\
- src0_type src1; \\\n\
- copy0_type data1; \\\n\
+ src1_type src1; \\\n\
+ copy1_type data1; \\\n\
VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
_viv_asm(COPY, data0, src0, 16); \\\n\
VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
@@ -29149,8 +32001,8 @@ __kernel void pow_##name##_2D \\\n\
\\\n\
src0_type src0; \\\n\
copy0_type data0; \\\n\
- src0_type src1; \\\n\
- copy0_type data1; \\\n\
+ src1_type src1; \\\n\
+ copy1_type data1; \\\n\
VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
_viv_asm(COPY, data0, src0, 16); \\\n\
VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
@@ -29331,9 +32183,21 @@ _viv_uniform int zp;\n\
_viv_uniform float outputScale;\n\
\n\
__kernel void pre_process_bgra_scale_U8toU8(\n\
- __read_only image2d_array_t input, __write_only image2d_array_t output,\n\
- global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\
- float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ global int * xRatio,\n\
+ global int * yRatio,\n\
+ global int * xOffset,\n\
+ global int * yOffset,\n\
+ float rMean,\n\
+ float gMean,\n\
+ float bMean,\n\
+ float r_scale,\n\
+ int reverse_channel,\n\
+ int trans,\n\
+ float g_scale,\n\
+ float b_scale\n\
+ )\n\
{\n\
int4 gidx = get_global_id(0);\n\
int gidy = get_global_id(1);\n\
@@ -29389,6 +32253,7 @@ __kernel void pre_process_bgra_scale_U8toU8(\n\
int4 tmp1, tmp2, result1, result2;\n\
float4 tmpDst, tmp0;\n\
float4 mean = (float4)(bMean, gMean, rMean, 0);\n\
+ float4 var = (float4)(b_scale, g_scale, r_scale, 0);\n\
//tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x);\n\
int tmpV = 1 << 19;\n\
vxc_short8 tmpFx;\n\
@@ -29451,9 +32316,21 @@ __kernel void pre_process_bgra_scale_U8toU8(\n\
}\n\
\n\
__kernel void pre_process_bgra_copy_U8toU8(\n\
- __read_only image2d_array_t input, __write_only image2d_array_t output,\n\
- global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\
- float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ global int * xRatio,\n\
+ global int * yRatio,\n\
+ global int * xOffset,\n\
+ global int * yOffset,\n\
+ float rMean,\n\
+ float gMean,\n\
+ float bMean,\n\
+ float r_scale,\n\
+ int reverse_channel,\n\
+ int trans,\n\
+ float g_scale,\n\
+ float b_scale\n\
+)\n\
{\n\
int2 pos = (int2)((get_global_id(0) + (*xOffset)) << 2, get_global_id(1) + (*yOffset));\n\
\n\
@@ -29468,10 +32345,10 @@ __kernel void pre_process_bgra_copy_U8toU8(\n\
VXC_DP4x4(tmpG, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGfromBgra_4x4);\n\
VXC_DP4x4(tmpR, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRfromBgra_4x4);\n\
\n\
- tmpDst = (tmpB - bMean) * var;\n\
+ tmpDst = (tmpB - bMean) * b_scale;\n\
result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\
\n\
- tmpDst = (tmpG - gMean) * var;\n\
+ tmpDst = (tmpG - gMean) * g_scale;\n\
result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\
VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\
\n\
@@ -29481,7 +32358,7 @@ __kernel void pre_process_bgra_copy_U8toU8(\n\
dstPos.z = 1;\n\
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
\n\
- tmpDst = (tmpR - rMean) * var;\n\
+ tmpDst = (tmpR - rMean) * r_scale;\n\
result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\
VXC_DP2x8(dst, result1, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\
\n\
@@ -30016,7 +32893,10 @@ static const char pre_process_nv12_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
_viv_uniform int bOrder;\n\
_viv_uniform int rOrder;\n\
\n\
-_viv_uniform float outputScaleVar;\n\
+_viv_uniform float outputScaleVar_b;\n\
+_viv_uniform float outputScaleVar_g;\n\
+_viv_uniform float outputScaleVar_r;\n\
+\n\
_viv_uniform float bMeanScaleVarZp;\n\
_viv_uniform float gMeanScaleVarZp;\n\
_viv_uniform float rMeanScaleVarZp;\n\
@@ -30041,10 +32921,12 @@ __kernel void pre_process_nv12_copy_##name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float var, \\\n\
+ float r_scale, \\\n\
int reverse_channel, \\\n\
int trans, \\\n\
- int nv_type \\\n\
+ int nv_type, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int gidx = get_global_id(0); \\\n\
@@ -30078,21 +32960,21 @@ __kernel void pre_process_nv12_copy_##name \\\n\
dst_type dst0; \\\n\
save_type dst; \\\n\
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\
- tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\
+ tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstB); \\\n\
dstPos.z = bOrder; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
_viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\
+ tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstG); \\\n\
dstPos.z = 1; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
_viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\
+ tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstR); \\\n\
dstPos.z = rOrder; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
@@ -30110,7 +32992,10 @@ static const char pre_process_nv12_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
_viv_uniform int bOrder;\n\
_viv_uniform int rOrder;\n\
\n\
-_viv_uniform float outputScaleVar;\n\
+_viv_uniform float outputScaleVar_b;\n\
+_viv_uniform float outputScaleVar_g;\n\
+_viv_uniform float outputScaleVar_r;\n\
+\n\
_viv_uniform float bMeanScaleVarZp;\n\
_viv_uniform float gMeanScaleVarZp;\n\
_viv_uniform float rMeanScaleVarZp;\n\
@@ -30143,10 +33028,12 @@ __kernel void pre_process_nv12_scale_##name##_gq \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float var, \\\n\
+ float r_scale, \\\n\
int reverse_channel, \\\n\
int trans, \\\n\
- int nv_type \\\n\
+ int nv_type, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
uint4 gidx = get_global_id(0); \\\n\
@@ -30200,21 +33087,21 @@ __kernel void pre_process_nv12_scale_##name##_gq \\\n\
dst_type dst0; \\\n\
save_type dst; \\\n\
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\
- tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\
+ tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstB); \\\n\
dstPos.z = bOrder; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\
_viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\
+ tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstG); \\\n\
dstPos.z = 1; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\
_viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\
+ tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstR); \\\n\
dstPos.z = rOrder; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\
@@ -30239,10 +33126,12 @@ __kernel void pre_process_nv12_scale_##name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float var, \\\n\
+ float r_scale, \\\n\
int reverse_channel, \\\n\
int trans, \\\n\
- int nv_type \\\n\
+ int nv_type, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
uint4 gidx = get_global_id(0); \\\n\
@@ -30268,102 +33157,445 @@ __kernel void pre_process_nv12_scale_##name \\\n\
coord.x = sx.w; \\\n\
VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
- coord_uv.x = uvX.y; \\\n\
- VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
- coord_uv.x = uvX.z; \\\n\
- VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
- coord_uv.x = uvX.w; \\\n\
- VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_uv.x = uvX.y; \\\n\
+ VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_uv.x = uvX.z; \\\n\
+ VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_uv.x = uvX.w; \\\n\
+ VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ if (nv_type == 1) \\\n\
+ { \\\n\
+ UV.s01234567 = UV.s10325476; \\\n\
+ } \\\n\
+ \\\n\
+ vxc_char16 tmpUV; \\\n\
+ short tmpVal = 128; \\\n\
+ VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\
+ \\\n\
+ float4 tmpDstB, tmpDstG, tmpDstR; \\\n\
+ VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\
+ VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\
+ VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\
+ \\\n\
+ conv_type result; \\\n\
+ dst_type dst0; \\\n\
+ save_type dst; \\\n\
+ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\
+ tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\
+ _viv_asm(CONV_RTE, result, tmpDstB); \\\n\
+ dstPos.z = bOrder; \\\n\
+ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
+ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\
+ _viv_asm(CONV_RTE, result, tmpDstG); \\\n\
+ dstPos.z = 1; \\\n\
+ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
+ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\
+ _viv_asm(CONV_RTE, result, tmpDstR); \\\n\
+ dstPos.z = rOrder; \\\n\
+ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
+ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+NV12_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\
+NV12_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\
+NV12_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\
+NV12_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\
+"; /* end of pre_process_nv12_scale_vx*/
+
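The split of outputScaleVar into outputScaleVar_b/_g/_r keeps the kernel body a single multiply-add per channel: tmpDst * outputScaleVar_c + cMeanScaleVarZp. That works because the per-channel normalisation and the output quantisation can be folded ahead of time; a sketch of the host-side fold (my naming, the patch only shows the kernel side) is:

    /* Fold ((x - mean_c) * scale_c) * outputScale + outputZP into one
     * multiply-add: x * scale_var + mean_scale_var_zp (host-side sketch). */
    typedef struct {
        float scale_var;          /* plays the role of outputScaleVar_c */
        float mean_scale_var_zp;  /* plays the role of cMeanScaleVarZp  */
    } channel_fold_t;

    static channel_fold_t fold_channel(float mean_c, float scale_c,
                                       float outputScale, float outputZP)
    {
        channel_fold_t f;
        f.scale_var         = scale_c * outputScale;
        f.mean_scale_var_zp = outputZP - mean_c * scale_c * outputScale;
        return f;
    }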
+static const char pre_process_rgb_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniVecShift10;\n\
+_viv_uniform VXC_512Bits uniAddRShift;\n\
+_viv_uniform VXC_512Bits uniGetTempVal;\n\
+_viv_uniform VXC_512Bits uniExtractBytes;\n\
+_viv_uniform VXC_512Bits uniUnpackToR;\n\
+_viv_uniform VXC_512Bits uniUnpackToG;\n\
+_viv_uniform VXC_512Bits uniUnpackToB;\n\
+\n\
+_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\
+_viv_uniform float outputScale;\n\
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
+_viv_uniform float outputZP;\n\
+_viv_uniform int r_order;\n\
+_viv_uniform int b_order;\n\
+\n\
+#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\
+\n\
+#define IMAGE_PRE_PROCESS(dst_name, conv_type, dst_type, copy_type) \\\n\
+__kernel void pre_process_rgb_scale_U8to##dst_name \\\n\
+ ( \\\n\
+__read_only image2d_array_t input, \\\n\
+__write_only image2d_array_t output, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float r_scale, \\\n\
+ int reverse_channel, \\\n\
+ int trans, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\
+ int4 xPos = get_global_id(0); \\\n\
+ int yPos = get_global_id(1); \\\n\
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\
+ xPos += (int4)(0, 1, 2, 3); \\\n\
+ \\\n\
+ /*x*/ \\\n\
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\
+ int4 sx = fx0 & 0xffff8000; \\\n\
+ fx0 -= sx; \\\n\
+ sx = sx >> 15; \\\n\
+ \\\n\
+ vxc_short4 fx; \\\n\
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\
+ /*y*/ \\\n\
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\
+ int sy = fy & 0xffff8000; \\\n\
+ \\\n\
+ fy -= sy; \\\n\
+ sy = sy >> 15; \\\n\
+ \\\n\
+ fy = (fy + (1<< 4)) >> 5; \\\n\
+ \\\n\
+ vxc_uchar16 line0RGB1, line0RGB2; \\\n\
+ vxc_uchar16 line1RGB3, line1RGB4; \\\n\
+ int4 coord; \\\n\
+ sx = (sx + (*xOffset)) * 3; \\\n\
+ coord.xyz = sx.xyz; \\\n\
+ coord.w = sy + *yOffset; \\\n\
+ int2 coord1 = (int2)(sx.w, coord.w); \\\n\
+ VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \\\n\
+ \\\n\
+ bgrMean *= (float4)(b_scale, g_scale, r_scale, 0); \\\n\
+ \\\n\
+ int4 test01, temp1; \\\n\
+ int4 test02, temp2; \\\n\
+ int4 tt; \\\n\
+ vxc_uchar4 val; \\\n\
+ int4 coord_out = (int4)(xPos.x, yPos, r_order, 0); \\\n\
+ \\\n\
+ vxc_uchar8 line1, line2; \\\n\
+ \\\n\
+ /*R*/ \\\n\
+ VXC_DP2x8(line1, line0RGB1, line0RGB2, \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\
+ VXC_DP2x8(line2, line1RGB3, line1RGB4, \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ \\\n\
+ vxc_float4 tmp_dst; \\\n\
+ vxc_uchar4 u8_dst; \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ \\\n\
+ /*convert U8 to dst*/ \\\n\
+ dst_type dst; \\\n\
+ tmp_dst = tmp_dst * r_scale - bgrMean.zzzz; \\\n\
+ tmp_dst = tmp_dst * outputScale + outputZP; \\\n\
+ conv_type dst0; \\\n\
+ _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
+ copy_type result; \\\n\
+ _viv_asm(COPY, result, dst, 16); \\\n\
+ VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ /*G*/ \\\n\
+ VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\
+ VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\
+ \\\n\
+ coord_out.z = 1; \\\n\
+ VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ \\\n\
+ tmp_dst = tmp_dst * g_scale - bgrMean.y; \\\n\
+ tmp_dst = tmp_dst * outputScale + outputZP; \\\n\
+ _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, result, dst, 16); \\\n\
+ VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ /*B*/ \\\n\
+ VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\
+ VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\
+ \\\n\
+ coord_out.z = b_order; \\\n\
+ VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ \\\n\
+ tmp_dst = tmp_dst * b_scale - bgrMean.x; \\\n\
+ tmp_dst = tmp_dst * outputScale + outputZP; \\\n\
+ _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, result, dst, 16); \\\n\
+ VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+IMAGE_PRE_PROCESS(U8, uint4, vxc_uchar16, vxc_uchar16)\n\
+IMAGE_PRE_PROCESS(I8, int4, vxc_char16, vxc_char16)\n\
+IMAGE_PRE_PROCESS(I16, int4, vxc_short8, vxc_short8)\n\
+IMAGE_PRE_PROCESS(F16, half4, vxc_half8, vxc_short8)\n\
+"; /* end of pre_process_rgb_vx*/
+
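The pre_process_rgb scale kernels interpolate in fixed point: source coordinates are split into an integer part (sx/sy) and a Q15 fraction, the fraction is reduced to 10 bits (uniAddRShift and the (fy + (1 << 4)) >> 5 step), the horizontal and vertical lerps accumulate to a Q20 value, and DESCALE rounds it back to 8 bits. Assuming uniVecShift10/uniGetTempVal compute the standard lerp terms, the per-channel arithmetic for one output pixel is equivalent to:

    #include <stdint.h>

    #define DESCALE(x) (((x) + (1 << 19)) >> 20)   /* same macro as the kernel uses */

    /* p00/p01: left/right source pixels on the top line, p10/p11 on the next
     * line; fx, fy are 10-bit fractional weights (0..1023). */
    static uint8_t bilinear_u8(int p00, int p01, int p10, int p11, int fx, int fy)
    {
        int top    = (p00 << 10) + fx * (p01 - p00);    /* horizontal lerp, Q10 */
        int bottom = (p10 << 10) + fx * (p11 - p10);
        int acc    = (bottom - top) * fy + (top << 10); /* vertical lerp, Q20   */
        int v      = DESCALE(acc);
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

The interpolated value is then normalised and quantised as in the copy path: (v * c_scale - mean_c * c_scale) * outputScale + outputZP for each of the R, G and B planes.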
+static const char pre_process_rgb888_planar_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniVecShift10;\n\
+_viv_uniform VXC_512Bits uniAddRShift;\n\
+_viv_uniform VXC_512Bits uniGetTempVal;\n\
+_viv_uniform VXC_512Bits uniExtractBytes;\n\
+\n\
+_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
+\n\
+_viv_uniform float output_scale;\n\
+_viv_uniform float output_zp;\n\
+_viv_uniform int4 rgb_order;\n\
+\n\
+#define RESIZE_BILINEAR_4X1(scale, mean, output, _coord) \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.y; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.z; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.w; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.z ++; \\\n\
+ coord_in.x = coord.x; \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ \\\n\
+ tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \\\n\
+ _viv_asm(CONV, dst0, tmp_dst); \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, dst, dst1, 8); \\\n\
+ VXC_WriteImage(output, _coord, dst, \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\
+__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ int height, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\
+ \\\n\
+ int4 xPos = get_global_id(0); \\\n\
+ int yPos = get_global_id(1); \\\n\
+ \\\n\
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\
+ xPos += (int4)(0, 1, 2, 3); \\\n\
+ \\\n\
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\
+ int4 sx = fx0 & 0xffff8000; \\\n\
+ fx0 -= sx; \\\n\
+ sx = sx >> 15; \\\n\
+ \\\n\
+ vxc_short4 fx; \\\n\
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniAddRShift); \\\n\
+ \\\n\
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\
+ int sy = fy & 0xffff8000; \\\n\
+ \\\n\
+ fy -= sy; \\\n\
+ sy = sy >> 15; \\\n\
+ \\\n\
+ fy = (fy + (1<< 4)) >> 5; \\\n\
+ \\\n\
+ vxc_uchar16 line0Y; \\\n\
+ vxc_uchar16 line1Y; \\\n\
+ int4 coord; \\\n\
+ int4 coord_in = (int4)(0, 0, 0, 0); \\\n\
+ sx = sx + *xOffset; \\\n\
+ coord = sx.xyzw; \\\n\
+ coord_in.y = sy + *yOffset; \\\n\
+ coord_in.x = coord.x; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.y; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.z; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.w; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.z ++; \\\n\
+ coord_in.x = coord.x; \\\n\
+ \\\n\
+ int4 test01, temp1; \\\n\
+ int4 test02, temp2; \\\n\
+ int4 tt; \\\n\
+ vxc_uchar4 val; \\\n\
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\
+ coord_out.yzw += rgb_order.xyz; \\\n\
\\\n\
- if (nv_type == 1) \\\n\
- { \\\n\
- UV.s01234567 = UV.s10325476; \\\n\
- } \\\n\
+ vxc_uchar8 line1, line2; \\\n\
\\\n\
- vxc_char16 tmpUV; \\\n\
- short tmpVal = 128; \\\n\
- VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
\\\n\
- float4 tmpDstB, tmpDstG, tmpDstR; \\\n\
- VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\
- VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\
- VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
\\\n\
- conv_type result; \\\n\
- dst_type dst0; \\\n\
- save_type dst; \\\n\
- int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\
- tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\
- _viv_asm(CONV_RTE, result, tmpDstB); \\\n\
- dstPos.z = bOrder; \\\n\
- VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\
- _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
- VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ vxc_float4 tmp_dst; \\\n\
+ vxc_uchar4 u8_dst; \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
\\\n\
- tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\
- _viv_asm(CONV_RTE, result, tmpDstG); \\\n\
- dstPos.z = 1; \\\n\
- VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\
- _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
- VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ conv_type dst0; \\\n\
+ dst_type dst1; \\\n\
+ copy_type dst; \\\n\
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\
+ _viv_asm(CONV, dst0, tmp_dst); \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, dst, dst1, 8); \\\n\
+ VXC_WriteImage(output, coord_out.xy, dst, \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\
- _viv_asm(CONV_RTE, result, tmpDstR); \\\n\
- dstPos.z = rOrder; \\\n\
- VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\
- _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
- VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ RESIZE_BILINEAR_4X1(g_scale, gMean, output, coord_out.xz) \\\n\
+ RESIZE_BILINEAR_4X1(b_scale, bMean, output, coord_out.xw) \\\n\
}\n\
-NV12_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\
-NV12_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\
-NV12_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\
-NV12_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\
-"; /* end of pre_process_nv12_scale_vx*/
-
-static const char pre_process_rgb_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
-\n\
-_viv_uniform VXC_512Bits uniVecShift10;\n\
-_viv_uniform VXC_512Bits uniAddRShift;\n\
-_viv_uniform VXC_512Bits uniGetTempVal;\n\
-_viv_uniform VXC_512Bits uniExtractBytes;\n\
-_viv_uniform VXC_512Bits uniUnpackToR;\n\
-_viv_uniform VXC_512Bits uniUnpackToG;\n\
-_viv_uniform VXC_512Bits uniUnpackToB;\n\
-\n\
-_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\
-_viv_uniform float outputScale;\n\
-_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
-_viv_uniform float outputZP;\n\
-_viv_uniform int r_order;\n\
-_viv_uniform int b_order;\n\
-\n\
-#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\
+PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)\n\
+PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)\n\
\n\
-#define IMAGE_PRE_PROCESS(dst_name, conv_type, dst_type, copy_type) \\\n\
-__kernel void pre_process_rgb_scale_U8to##dst_name \\\n\
+#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \\\n\
+__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
( \\\n\
-__read_only image2d_array_t input, \\\n\
-__write_only image2d_array_t output, \\\n\
- global int *xRatio, \\\n\
- global int *yRatio, \\\n\
- global int *xOffset, \\\n\
- global int *yOffset, \\\n\
- float rMean, \\\n\
- float gMean, \\\n\
- float bMean, \\\n\
- float f32Var, \\\n\
- int reverse_channel, \\\n\
- int trans \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ int height, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\
- int4 xPos = get_global_id(0); \\\n\
- int yPos = get_global_id(1); \\\n\
+ int4 xPos = get_global_id(0); \\\n\
+ int yPos = get_global_id(1); \\\n\
+ \\\n\
int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\
xPos += (int4)(0, 1, 2, 3); \\\n\
\\\n\
- /*x*/ \\\n\
int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\
int4 sx = fx0 & 0xffff8000; \\\n\
fx0 -= sx; \\\n\
@@ -30371,137 +33603,485 @@ __write_only image2d_array_t output, \\\n\
\\\n\
vxc_short4 fx; \\\n\
VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\
- /*y*/ \\\n\
+ \\\n\
int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\
int sy = fy & 0xffff8000; \\\n\
\\\n\
fy -= sy; \\\n\
sy = sy >> 15; \\\n\
- \\\n\
fy = (fy + (1<< 4)) >> 5; \\\n\
\\\n\
- vxc_uchar16 line0RGB1, line0RGB2; \\\n\
- vxc_uchar16 line1RGB3, line1RGB4; \\\n\
+ vxc_uchar16 line0Y; \\\n\
+ vxc_uchar16 line1Y; \\\n\
int4 coord; \\\n\
- sx = (sx + (*xOffset)) * 3; \\\n\
- coord.xyz = sx.xyz; \\\n\
- coord.w = sy + *yOffset; \\\n\
+ sx = sx + *xOffset; \\\n\
+ coord.xyz = sx.xyz; \\\n\
+ coord.w = sy + *yOffset; \\\n\
int2 coord1 = (int2)(sx.w, coord.w); \\\n\
- VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
- VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
- VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
- VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
- \\\n\
- VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
- VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
- VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
- VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
- VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
- VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
- VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
- VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
- \\\n\
- float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \\\n\
- \\\n\
- bgrMean *= f32Var; \\\n\
+ int4 coord_in = (int4)(coord.xw, 0, 0); \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.y; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.z; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord1.x; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
int4 test01, temp1; \\\n\
int4 test02, temp2; \\\n\
- int4 tt; \\\n\
- vxc_uchar4 val; \\\n\
- int4 coord_out = (int4)(xPos.x, yPos, r_order, 0); \\\n\
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\
+ coord_out.yzw += rgb_order.xyz; \\\n\
\\\n\
- vxc_uchar8 line1, line2; \\\n\
- \\\n\
- /*R*/ \\\n\
- VXC_DP2x8(line1, line0RGB1, line0RGB2, \\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\
- VXC_DP2x8(line2, line1RGB3, line1RGB4, \\\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\
- \\\n\
- VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
- VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
temp1 = temp1 + test01; \\\n\
\\\n\
- VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
- VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
temp2 = temp2 + test02; \\\n\
temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
\\\n\
vxc_float4 tmp_dst; \\\n\
vxc_uchar4 u8_dst; \\\n\
- VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
uniConvertIntergetoF32_4x4); \\\n\
\\\n\
- /*convert U8 to dst*/ \\\n\
- dst_type dst; \\\n\
- tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \\\n\
- tmp_dst = tmp_dst * outputScale + outputZP; \\\n\
- conv_type dst0; \\\n\
- _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\
- VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
- copy_type result; \\\n\
- _viv_asm(COPY, result, dst, 16); \\\n\
- VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ int4 dst0; \\\n\
+ write_type dst; \\\n\
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\
+ dst0 = convert_int4_rte(tmp_dst); \\\n\
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
\\\n\
- /*G*/ \\\n\
- VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\
- VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\
+ VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- coord_out.z = 1; \\\n\
- VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
- VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ coord_in.x = coord.x; \\\n\
+ coord_in.z = 1; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.y; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.z; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord1.x; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
temp1 = temp1 + test01; \\\n\
\\\n\
- VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
- VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
temp2 = temp2 + test02; \\\n\
temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
- \\\n\
- VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
- uniConvertIntergetoF32_4x4); \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\
+ dst0 = convert_int4_rte(tmp_dst); \\\n\
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
\\\n\
- tmp_dst = tmp_dst * f32Var - bgrMean.y; \\\n\
- tmp_dst = tmp_dst * outputScale + outputZP; \\\n\
- _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\
- VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
- _viv_asm(COPY, result, dst, 16); \\\n\
- VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.xz, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- /*B*/ \\\n\
- VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\
- VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\
+ coord_in.x = coord.x; \\\n\
+ coord_in.z = 2; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.y; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord.z; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.x = coord1.x; \\\n\
+ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- coord_out.z = b_order; \\\n\
- VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
- VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
temp1 = temp1 + test01; \\\n\
\\\n\
- VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\
- VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
temp2 = temp2 + test02; \\\n\
temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
- \\\n\
- VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
uniConvertIntergetoF32_4x4); \\\n\
+ tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\
+ dst0 = convert_int4_rte(tmp_dst); \\\n\
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
\\\n\
- tmp_dst = tmp_dst * f32Var - bgrMean.x; \\\n\
- tmp_dst = tmp_dst * outputScale + outputZP; \\\n\
- _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\
- VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
- _viv_asm(COPY, result, dst, 16); \\\n\
- VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\
+PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_0_vx*/
+
+static const char pre_process_rgb888_planar_1_vx[] = "\n\
+#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\
+_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\
+\n\
+_viv_uniform float output_scale;\n\
+_viv_uniform float output_zp;\n\
+_viv_uniform int4 rgb_order;\n\
+\n\
+#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\
+__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ int height, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ coord.xy += (int2)(*xOffset, *yOffset); \\\n\
+ vxc_uchar16 src0, src1, src2; \\\n\
+ dst_type dst0, dst1; \\\n\
+ \\\n\
+ int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\
+ VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.z ++; \\\n\
+ VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.z ++; \\\n\
+ VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ coord.x = coord.z + 8; \\\n\
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\\\n\
+ rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\
+ \\\n\
+ half4 paramData_f16; \\\n\
+ copy_type tmp_dst; \\\n\
+ _viv_asm(CONV, paramData_f16, paramData0); \\\n\
+ VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
+ int4 coord_out = coord; \\\n\
+ coord_out.yw = coord_out.ww + rgb_order.xy; \\\n\
+ VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
+ VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData1); \\\n\
+ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
+ VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
+ VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData2); \\\n\
+ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
+ coord_out.w = coord.w + rgb_order.z; \\\n\
+ VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
+ VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\
+PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\
+\n\
+#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\
+__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ int height, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ coord.xy += (int2) (*xOffset, *yOffset); \\\n\
+ vxc_uchar16 src0, src1, src2; \\\n\
+ write_type dst; \\\n\
+ \\\n\
+ int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\
+ VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.z ++; \\\n\
+ VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.z ++; \\\n\
+ VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ int4 coord_out = coord; \\\n\
+ coord_out.xyw = coord.www + rgb_order.xyz; \\\n\
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\
+ \\\n\
+ half4 paramData_f16; \\\n\
+ _viv_asm(CONV, paramData_f16, paramData0); \\\n\
+ \\\n\
+ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData1); \\\n\
+ \\\n\
+ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData2); \\\n\
+ \\\n\
+ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevHi_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\
+PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\
+"; /* end of pre_process_rgb888_planar_1_vx*/
+
+static const char pre_process_rgb888_planar_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\
+_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\
+_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\
+_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\
+_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\
+_viv_uniform int4 rgb_order;\n\
+\n\
+__kernel void pre_process_rgb888_planar_4over3_U8toU8\n\
+ (\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ global int *xRatio,\n\
+ global int *yRatio,\n\
+ global int *xOffset,\n\
+ global int *yOffset,\n\
+ float rMean,\n\
+ float gMean,\n\
+ float bMean,\n\
+ float r_scale,\n\
+ int reverse,\n\
+ int height,\n\
+ float g_scale,\n\
+ float b_scale\n\
+ )\n\
+{\n\
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_out;\n\
+\n\
+ vxc_uchar16 src0, src1, src2, src3;\n\
+ vxc_uchar16 dst0, dst1, dst2;\n\
+\n\
+ VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ coord_in.z ++;\n\
+ coord_out.xy = (coord_in.xy >> 2) * 3;\n\
+ coord_out.zw = coord_in.yy + (int2)(1, 2);\n\
+\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+\n\
+ int4 coord_r = coord_out;\n\
+ coord_r.yzw += rgb_order.xxx;\n\
+ VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ coord_in.z ++;\n\
+\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+\n\
+ int4 coord_g = coord_out;\n\
+ coord_g.yzw += rgb_order.yyy;\n\
+ VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\
+ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
+ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
+\n\
+ int4 coord_b = coord_out;\n\
+ coord_b.yzw += rgb_order.zzz;\n\
+ VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
}\n\
-IMAGE_PRE_PROCESS(U8, uint4, vxc_uchar16, vxc_uchar16)\n\
-IMAGE_PRE_PROCESS(I8, int4, vxc_char16, vxc_char16)\n\
-IMAGE_PRE_PROCESS(I16, int4, vxc_short8, vxc_short8)\n\
-IMAGE_PRE_PROCESS(F16, half4, vxc_half8, vxc_short8)\n\
-"; /* end of pre_process_rgb_vx*/
+\n\
+__kernel void pre_process_rgb888_planar_half_U8toU8\n\
+ (\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ global int *xRatio,\n\
+ global int *yRatio,\n\
+ global int *xOffset,\n\
+ global int *yOffset,\n\
+ float rMean,\n\
+ float gMean,\n\
+ float bMean,\n\
+ float r_scale,\n\
+ int reverse,\n\
+ int height,\n\
+ float g_scale,\n\
+ float b_scale\n\
+ )\n\
+{\n\
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+\n\
+ vxc_uchar16 src0, src1, src2;\n\
+\n\
+ VXC_ReadImage2DArray(src0, input, coord_in, 0,\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ coord_in.z ++;\n\
+ VXC_ReadImage2DArray(src1, input, coord_in, 0,\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ coord_in.z ++;\n\
+ VXC_ReadImage2DArray(src2, input, coord_in, 0,\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ int2 coord = coord_in.xy >> 1;\n\
+\n\
+ int4 coord_rgb = coord.xyyy;\n\
+ coord_rgb.yzw += rgb_order.xyz;\n\
+ VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_rgb.xz, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+"; /* end of pre_process_rgb888_planar_2_vx*/
-static const char pre_process_rgb888_planar_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+static const char pre_process_rgb888_planar_nhwc_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform VXC_512Bits uniVecShift10;\n\
_viv_uniform VXC_512Bits uniAddRShift;\n\
@@ -30510,11 +34090,15 @@ _viv_uniform VXC_512Bits uniExtractBytes;\n\
\n\
_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\
_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\
\n\
_viv_uniform float output_scale;\n\
_viv_uniform float output_zp;\n\
\n\
-#define RESIZE_BILINEAR_4X1(mean, output) \\\n\
+#define RESIZE_BILINEAR_4X1(scale, mean) \\\n\
VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\
@@ -30552,21 +34136,13 @@ _viv_uniform float output_zp;\n\
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
uniConvertIntergetoF32_4x4); \\\n\
\\\n\
- tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \\\n\
- _viv_asm(CONV, dst0, tmp_dst); \\\n\
- VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
- uniExtract8Data_2x8); \\\n\
- _viv_asm(COPY, dst, dst1, 8); \\\n\
- VXC_WriteImage(output, coord_out, dst, \\\n\
- VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
-\n\
+ tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \\\n\
+ _viv_asm(CONV, dst0, tmp_dst);\n\
#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\
-__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
+__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \\\n\
( \\\n\
__read_only image2d_array_t input, \\\n\
- __write_only image2d_array_t output0, \\\n\
- __write_only image2d_array_t output1, \\\n\
- __write_only image2d_array_t output2, \\\n\
+ __write_only image2d_array_t output, \\\n\
global int *xRatio, \\\n\
global int *yRatio, \\\n\
global int *xOffset, \\\n\
@@ -30574,7 +34150,10 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float f32Var \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\
@@ -30636,7 +34215,9 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
int4 test02, temp2; \\\n\
int4 tt; \\\n\
vxc_uchar4 val; \\\n\
- int2 coord_out = (int2)(xPos.x, yPos); \\\n\
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\
+ coord_out.x = coord_out.x * 3; \\\n\
+ coord_out.z = coord_out.x + 8; \\\n\
\\\n\
vxc_uchar8 line1, line2; \\\n\
\\\n\
@@ -30659,29 +34240,36 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
uniConvertIntergetoF32_4x4); \\\n\
\\\n\
conv_type dst0; \\\n\
- dst_type dst1; \\\n\
- copy_type dst; \\\n\
- tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\
+ dst_type dst1, dst2; \\\n\
+ copy_type data0, data1, dst; \\\n\
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\
_viv_asm(CONV, dst0, tmp_dst); \\\n\
VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniExtract8Data_2x8); \\\n\
- _viv_asm(COPY, dst, dst1, 8); \\\n\
- VXC_WriteImage(output0, coord_out, dst, \\\n\
- VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- RESIZE_BILINEAR_4X1(gMean, output1) \\\n\
- RESIZE_BILINEAR_4X1(bMean, output2) \\\n\
+ RESIZE_BILINEAR_4X1(g_scale, gMean) \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ RESIZE_BILINEAR_4X1(b_scale, bMean) \\\n\
+ VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, data0, dst1, 16); \\\n\
+ _viv_asm(COPY, data1, dst2, 16); \\\n\
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni16BitsDataInterleave_0_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni16BitsDataInterleave_1_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)\n\
PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)\n\
\n\
#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \\\n\
-__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
+__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \\\n\
( \\\n\
__read_only image2d_array_t input, \\\n\
- __write_only image2d_array_t output0, \\\n\
- __write_only image2d_array_t output1, \\\n\
- __write_only image2d_array_t output2, \\\n\
+ __write_only image2d_array_t output, \\\n\
global int *xRatio, \\\n\
global int *yRatio, \\\n\
global int *xOffset, \\\n\
@@ -30689,7 +34277,10 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float f32Var \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\
@@ -30745,6 +34336,7 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
int4 test01, temp1; \\\n\
int4 test02, temp2; \\\n\
int2 coord_out = (int2)(xPos.x, yPos); \\\n\
+ coord_out.x = coord_out.x * 3; \\\n\
\\\n\
VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
uniVecShift10); \\\n\
@@ -30767,13 +34359,11 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
uniConvertIntergetoF32_4x4); \\\n\
\\\n\
int4 dst0; \\\n\
- write_type dst; \\\n\
- tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\
+ write_type dst1, dst; \\\n\
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\
dst0 = convert_int4_rte(tmp_dst); \\\n\
- VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniExtract8Data_2x8); \\\n\
- \\\n\
- VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
coord_in.x = coord.x; \\\n\
coord_in.z = 1; \\\n\
@@ -30813,12 +34403,10 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
uniExtractBytes); \\\n\
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
uniConvertIntergetoF32_4x4); \\\n\
- tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \\\n\
+ tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\
dst0 = convert_int4_rte(tmp_dst); \\\n\
- VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniExtract8Data_2x8); \\\n\
- \\\n\
- VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
coord_in.x = coord.x; \\\n\
coord_in.z = 2; \\\n\
@@ -30858,32 +34446,591 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\
uniExtractBytes); \\\n\
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
uniConvertIntergetoF32_4x4); \\\n\
- tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \\\n\
+ tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\
+ dst0 = convert_int4_rte(tmp_dst); \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni8BitsDataInterleave_0_2x8); \\\n\
+ VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni16BitsDataInterleave_1_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\
+PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_nhwc_0_vx*/
+
+static const char pre_process_rgb888_planar_nhwc_1_vx[] = "\n\
+#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\
+_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\
+\n\
+_viv_uniform float output_scale;\n\
+_viv_uniform float output_zp;\n\
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\
+\n\
+#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\
+__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ coord.xy += (int2)(*xOffset, *yOffset); \\\n\
+ vxc_uchar16 src0, src1, src2; \\\n\
+ dst_type dst0, dst1; \\\n\
+ \\\n\
+ int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\
+ VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.z ++; \\\n\
+ VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.z ++; \\\n\
+ VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ int4 coord_out = coord; \\\n\
+ coord_out.z = coord_out.z * 3; \\\n\
+ coord_out.x = coord_out.z + 8; \\\n\
+ float4 paramData0 = (float4)(rMean * output_scale * r_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\
+ \\\n\
+ half4 paramData_f16; \\\n\
+ copy_type data0, data1, data2, dst; \\\n\
+ _viv_asm(CONV, paramData_f16, paramData0); \\\n\
+ VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ float4 paramData1 = (float4)(gMean * output_scale * g_scale - output_zp,\\\n\
+ gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData1); \\\n\
+ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ _viv_asm(COPY, data0, dst0, 16); \\\n\
+ \\\n\
+ float4 paramData2 = (float4)(bMean * output_scale * b_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData2); \\\n\
+ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ _viv_asm(COPY, data1, dst1, 16); \\\n\
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni16BitsDataInterleave_0_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni16BitsDataInterleave_1_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\
+PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\
+\n\
+#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\
+__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ int height, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ coord.xy += (int2) (*xOffset, *yOffset); \\\n\
+ vxc_uchar16 src0, src1, src2; \\\n\
+ write_type dst0, dst1, dst2, dst3; \\\n\
+ \\\n\
+ int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\
+ VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.z ++; \\\n\
+ VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_in.z ++; \\\n\
+ VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ int4 coord_out = coord; \\\n\
+ coord_out.z = coord_out.z * 3; \\\n\
+ coord_out.x = coord_out.z + 16; \\\n\
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\
+ \\\n\
+ half4 paramData_f16; \\\n\
+ _viv_asm(CONV, paramData_f16, paramData0); \\\n\
+ \\\n\
+ VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ \\\n\
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData1); \\\n\
+ \\\n\
+ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ \\\n\
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\
+ _viv_asm(CONV, paramData_f16, paramData2); \\\n\
+ \\\n\
+ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniDataMeanStddevLo_2x8); \\\n\
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni8BitsDataInterleave_0_2x8); \\\n\
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni8BitsDataInterleave_1_2x8); \\\n\
+ VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni8BitsDataInterleave_2_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\
+PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\
+"; /* end of pre_process_rgb888_planar_nhwc_1_vx*/
+
+static const char pre_process_rgb888_planar_nhwc_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\
+\n\
+__kernel void pre_process_rgb888_planar_half_U8toU8_nhwc\n\
+ (\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ global int *xRatio,\n\
+ global int *yRatio,\n\
+ global int *xOffset,\n\
+ global int *yOffset,\n\
+ float rMean,\n\
+ float gMean,\n\
+ float bMean,\n\
+ float r_scale,\n\
+ int reverse,\n\
+ float g_scale,\n\
+ float b_scale\n\
+ )\n\
+{\n\
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+\n\
+ vxc_uchar16 src0, src1, src2;\n\
+\n\
+ VXC_ReadImage2DArray(src0, input, coord_in, 0,\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ coord_in.z ++;\n\
+ VXC_ReadImage2DArray(src1, input, coord_in, 0,\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ coord_in.z ++;\n\
+ VXC_ReadImage2DArray(src2, input, coord_in, 0,\n\
+ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ int4 coord;\n\
+ coord.xy = coord_in.xy >> 1;\n\
+\n\
+ coord.x = coord.x * 3;\n\
+ coord.z = coord.x + 16;\n\
+\n\
+ vxc_uchar16 dst0, dst1;\n\
+ src0.lo = src0.s02468ace;\n\
+ src0.hi = src1.s02468ace;\n\
+ src1.lo = src2.s02468ace;\n\
+\n\
+ VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\
+ uni8BitsDataInterleave_0_2x8);\n\
+ VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\
+ uni8BitsDataInterleave_1_2x8);\n\
+ VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\
+ uni8BitsDataInterleave_2_2x8);\n\
+\n\
+ VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+"; /* end of pre_process_rgb888_planar_nhwc_2_vx*/
+
+static const char pre_process_rgb888_planar_sep_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniVecShift10;\n\
+_viv_uniform VXC_512Bits uniAddRShift;\n\
+_viv_uniform VXC_512Bits uniGetTempVal;\n\
+_viv_uniform VXC_512Bits uniExtractBytes;\n\
+\n\
+_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
+\n\
+_viv_uniform float output_scale;\n\
+_viv_uniform float output_zp;\n\
+_viv_uniform int4 rgb_order;\n\
+\n\
+#define RESIZE_BILINEAR_4X1(input, scale, mean, output, _coord) \\\n\
+ VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ \\\n\
+ tmp_dst = tmp_dst * scale * output_scale - scale * mean * output_scale + output_zp; \\\n\
+ _viv_asm(CONV, dst0, tmp_dst); \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, dst, dst1, 8); \\\n\
+ VXC_WriteImage(output, _coord, dst, \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\
+__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __read_only image2d_array_t input2, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ int height, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\
+ \\\n\
+ int4 xPos = get_global_id(0); \\\n\
+ int yPos = get_global_id(1); \\\n\
+ \\\n\
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\
+ xPos += (int4)(0, 1, 2, 3); \\\n\
+ \\\n\
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\
+ int4 sx = fx0 & 0xffff8000; \\\n\
+ fx0 -= sx; \\\n\
+ sx = sx >> 15; \\\n\
+ \\\n\
+ vxc_short4 fx; \\\n\
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniAddRShift); \\\n\
+ \\\n\
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\
+ int sy = fy & 0xffff8000; \\\n\
+ \\\n\
+ fy -= sy; \\\n\
+ sy = sy >> 15; \\\n\
+ \\\n\
+ fy = (fy + (1<< 4)) >> 5; \\\n\
+ \\\n\
+ vxc_uchar16 line0Y; \\\n\
+ vxc_uchar16 line1Y; \\\n\
+ int4 coord; \\\n\
+ sx = sx + *xOffset; \\\n\
+ coord.xyz = sx.xyz; \\\n\
+ coord.w = sy + *yOffset; \\\n\
+ int2 coord1 = (int2)(sx.w, coord.w); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ int4 test01, temp1; \\\n\
+ int4 test02, temp2; \\\n\
+ int4 tt; \\\n\
+ vxc_uchar4 val; \\\n\
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\
+ coord_out.yzw += rgb_order.xyz; \\\n\
+ \\\n\
+ vxc_uchar8 line1, line2; \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ \\\n\
+ vxc_float4 tmp_dst; \\\n\
+ vxc_uchar4 u8_dst; \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ \\\n\
+ conv_type dst0; \\\n\
+ dst_type dst1; \\\n\
+ copy_type dst; \\\n\
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\
+ _viv_asm(CONV, dst0, tmp_dst); \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, dst, dst1, 8); \\\n\
+ VXC_WriteImage(output, coord_out.xy, dst, \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ RESIZE_BILINEAR_4X1(input1, g_scale, gMean, output, coord_out.xz) \\\n\
+ RESIZE_BILINEAR_4X1(input2, b_scale, bMean, output, coord_out.xw) \\\n\
+}\n\
+RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8)\n\
+RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8)\n\
+\n\
+#define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \\\n\
+__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __read_only image2d_array_t input2, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ global int *xRatio, \\\n\
+ global int *yRatio, \\\n\
+ global int *xOffset, \\\n\
+ global int *yOffset, \\\n\
+ float rMean, \\\n\
+ float gMean, \\\n\
+ float bMean, \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ int height, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\
+ int4 xPos = get_global_id(0); \\\n\
+ int yPos = get_global_id(1); \\\n\
+ \\\n\
+ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\
+ xPos += (int4)(0, 1, 2, 3); \\\n\
+ \\\n\
+ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\
+ int4 sx = fx0 & 0xffff8000; \\\n\
+ fx0 -= sx; \\\n\
+ sx = sx >> 15; \\\n\
+ \\\n\
+ vxc_short4 fx; \\\n\
+ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\
+ \\\n\
+ int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\
+ int sy = fy & 0xffff8000; \\\n\
+ \\\n\
+ fy -= sy; \\\n\
+ sy = sy >> 15; \\\n\
+ fy = (fy + (1<< 4)) >> 5; \\\n\
+ \\\n\
+ vxc_uchar16 line0Y; \\\n\
+ vxc_uchar16 line1Y; \\\n\
+ int4 coord; \\\n\
+ sx = sx + *xOffset; \\\n\
+ coord.xyz = sx.xyz; \\\n\
+ coord.w = sy + *yOffset; \\\n\
+ int2 coord1 = (int2)(sx.w, coord.w); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ int4 test01, temp1; \\\n\
+ int4 test02, temp2; \\\n\
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\
+ coord_out.yzw += rgb_order.xyz; \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ \\\n\
+ vxc_float4 tmp_dst; \\\n\
+ vxc_uchar4 u8_dst; \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ \\\n\
+ int4 dst0; \\\n\
+ write_type dst; \\\n\
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\
+ dst0 = convert_int4_rte(tmp_dst); \\\n\
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord_out.xy, dst, \\\n\
+ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\
+ dst0 = convert_int4_rte(tmp_dst); \\\n\
+ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ \\\n\
+ VXC_WriteImage(output, coord_out.xz, \\\n\
+ dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\
+ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp1 = temp1 + test01; \\\n\
+ \\\n\
+ VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniVecShift10); \\\n\
+ VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
+ uniGetTempVal); \\\n\
+ temp2 = temp2 + test02; \\\n\
+ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\
+ VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniExtractBytes); \\\n\
+ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertIntergetoF32_4x4); \\\n\
+ tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\
dst0 = convert_int4_rte(tmp_dst); \\\n\
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniExtract8Data_2x8); \\\n\
\\\n\
- VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.xw, \\\n\
+ dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
-PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\
-PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_0_vx*/
+RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16)\n\
+RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)\n\
+"; /* end of pre_process_rgb888_planar_sep_0_vx*/
-static const char pre_process_rgb888_planar_1_vx[] = "\n\
-#include \"cl_viv_vx_ext.h\"\n\
+static const char pre_process_rgb888_planar_sep_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\
_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\
\n\
_viv_uniform float output_scale;\n\
_viv_uniform float output_zp;\n\
+_viv_uniform int4 rgb_order;\n\
\n\
-#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\
-__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
+#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\
+__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\
( \\\n\
- __read_only image2d_array_t input, \\\n\
- __write_only image2d_array_t output0, \\\n\
- __write_only image2d_array_t output1, \\\n\
- __write_only image2d_array_t output2, \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __read_only image2d_array_t input2, \\\n\
+ __write_only image2d_array_t output, \\\n\
global int *xRatio, \\\n\
global int *yRatio, \\\n\
global int *xOffset, \\\n\
@@ -30891,7 +35038,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float f32Var \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ int height, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\
@@ -30900,16 +35051,14 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
vxc_uchar16 src0, src1, src2; \\\n\
dst_type dst0, dst1; \\\n\
\\\n\
- int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\
- VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
- coord_in.z ++; \\\n\
- VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
- coord_in.z ++; \\\n\
- VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
coord.x = coord.z + 8; \\\n\
- float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\
- rMean * output_scale - output_zp, output_scale); \\\n\
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\
\\\n\
half4 paramData_f16; \\\n\
copy_type tmp_dst; \\\n\
@@ -30919,44 +35068,49 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
uniDataMeanStddevHi_2x8); \\\n\
_viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
- VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ int4 coord_out = coord; \\\n\
+ coord_out.yw = coord_out.ww + rgb_order.xy; \\\n\
+ VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
_viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
- VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\
- gMean * output_scale - output_zp, output_scale); \\\n\
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\
_viv_asm(CONV, paramData_f16, paramData1); \\\n\
VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
uniDataMeanStddevLo_2x8); \\\n\
VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
uniDataMeanStddevHi_2x8); \\\n\
_viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
- VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
_viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
- VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\
- bMean * output_scale - output_zp, output_scale); \\\n\
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\
_viv_asm(CONV, paramData_f16, paramData2); \\\n\
VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
uniDataMeanStddevLo_2x8); \\\n\
VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
uniDataMeanStddevHi_2x8); \\\n\
_viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
- VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord_out.w = coord.w + rgb_order.z; \\\n\
+ VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
_viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
- VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
-PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\
-PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\
+RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\
+RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\
\n\
#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\
-__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
+__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\
( \\\n\
- __read_only image2d_array_t input, \\\n\
- __write_only image2d_array_t output0, \\\n\
- __write_only image2d_array_t output1, \\\n\
- __write_only image2d_array_t output2, \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __read_only image2d_array_t input2, \\\n\
+ __write_only image2d_array_t output, \\\n\
global int *xRatio, \\\n\
global int *yRatio, \\\n\
global int *xOffset, \\\n\
@@ -30964,7 +35118,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float f32Var \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ int height, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\
@@ -30973,15 +35131,15 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
vxc_uchar16 src0, src1, src2; \\\n\
write_type dst; \\\n\
\\\n\
- int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\
- VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
- coord_in.z ++; \\\n\
- VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
- coord_in.z ++; \\\n\
- VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\
- rMean * output_scale - output_zp, output_scale); \\\n\
+ int4 coord_out = coord; \\\n\
+ coord_out.xyw += rgb_order.xyz; \\\n\
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\
\\\n\
half4 paramData_f16; \\\n\
_viv_asm(CONV, paramData_f16, paramData0); \\\n\
@@ -30990,46 +35148,49 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\
uniDataMeanStddevLo_2x8); \\\n\
VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniDataMeanStddevHi_2x8); \\\n\
- VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\
- gMean * output_scale - output_zp, output_scale); \\\n\
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\
_viv_asm(CONV, paramData_f16, paramData1); \\\n\
\\\n\
VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniDataMeanStddevLo_2x8); \\\n\
VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniDataMeanStddevHi_2x8); \\\n\
- VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\
- bMean * output_scale - output_zp, output_scale); \\\n\
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\
_viv_asm(CONV, paramData_f16, paramData2); \\\n\
\\\n\
VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniDataMeanStddevLo_2x8); \\\n\
VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniDataMeanStddevHi_2x8); \\\n\
- VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\
PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\
-"; /* end of pre_process_rgb888_planar_1_vx*/
+"; /* end of pre_process_rgb888_planar_sep_1_vx*/
-static const char pre_process_rgb888_planar_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+static const char pre_process_rgb888_planar_sep_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\
_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\
_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\
_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\
_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\
+_viv_uniform int4 rgb_order;\n\
\n\
-__kernel void pre_process_rgb888_planar_4over3_U8toU8\n\
+__kernel void pre_process_rgb888_planar_sep_4over3_U8toU8\n\
(\n\
- __read_only image2d_array_t input,\n\
- __write_only image2d_array_t output0,\n\
- __write_only image2d_array_t output1,\n\
- __write_only image2d_array_t output2,\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_array_t input1,\n\
+ __read_only image2d_array_t input2,\n\
+ __write_only image2d_array_t output,\n\
global int *xRatio,\n\
global int *yRatio,\n\
global int *xOffset,\n\
@@ -31037,24 +35198,24 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\
float rMean,\n\
float gMean,\n\
float bMean,\n\
- float f32Var\n\
+ float r_scale,\n\
+ int reverse,\n\
+ int height,\n\
+ float g_scale,\n\
+ float b_scale\n\
)\n\
{\n\
- int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
int4 coord_out;\n\
\n\
vxc_uchar16 src0, src1, src2, src3;\n\
vxc_uchar16 dst0, dst1, dst2;\n\
\n\
- VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- coord_in.z ++;\n\
+ VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
coord_out.xy = (coord_in.xy >> 2) * 3;\n\
coord_out.zw = coord_in.yy + (int2)(1, 2);\n\
\n\
@@ -31067,19 +35228,16 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
\n\
- VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ int4 coord_r = coord_out;\n\
+ coord_r.yzw += rgb_order.xxx;\n\
+ VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
\n\
- VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- coord_in.z ++;\n\
+ VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
\n\
VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\
VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\
@@ -31090,18 +35248,16 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
\n\
- VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ int4 coord_g = coord_out;\n\
+ coord_g.yzw += rgb_order.yyy;\n\
+ VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
\n\
- VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
\n\
VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\
VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\
@@ -31112,17 +35268,19 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
\n\
- VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ int4 coord_b = coord_out;\n\
+ coord_b.yzw += rgb_order.zzz;\n\
+ VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
}\n\
\n\
-__kernel void pre_process_rgb888_planar_half_U8toU8\n\
+__kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\
(\n\
- __read_only image2d_array_t input,\n\
- __write_only image2d_array_t output0,\n\
- __write_only image2d_array_t output1,\n\
- __write_only image2d_array_t output2,\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_array_t input1,\n\
+ __read_only image2d_array_t input2,\n\
+ __write_only image2d_array_t output,\n\
global int *xRatio,\n\
global int *yRatio,\n\
global int *xOffset,\n\
@@ -31130,31 +35288,32 @@ __kernel void pre_process_rgb888_planar_half_U8toU8\n\
float rMean,\n\
float gMean,\n\
float bMean,\n\
- float f32Var\n\
+ float r_scale,\n\
+ int reverse,\n\
+ int height,\n\
+ float g_scale,\n\
+ float b_scale\n\
)\n\
{\n\
- int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
\n\
vxc_uchar16 src0, src1, src2;\n\
\n\
- VXC_ReadImage2DArray(src0, input, coord_in, 0,\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- coord_in.z ++;\n\
- VXC_ReadImage2DArray(src1, input, coord_in, 0,\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- coord_in.z ++;\n\
- VXC_ReadImage2DArray(src2, input, coord_in, 0,\n\
- VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
\n\
- int2 coord = coord_in.xy >> 1;\n\
+ coord_in.zw = coord_in.xy >> 1;\n\
\n\
- VXC_WriteImage(output0, coord, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output1, coord, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output2, coord, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ int4 coord_rgb = coord_in.zwww;\n\
+ coord_rgb.yzw += rgb_order.xyz;\n\
+ VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_rgb.xz, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
}\n\
-"; /* end of pre_process_rgb888_planar_2_vx*/
+"; /* end of pre_process_rgb888_planar_sep_2_vx*/
-static const char pre_process_rgb888_planar_sep_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+static const char pre_process_rgb888_planar_sep_nhwc_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform VXC_512Bits uniVecShift10;\n\
_viv_uniform VXC_512Bits uniAddRShift;\n\
@@ -31163,11 +35322,15 @@ _viv_uniform VXC_512Bits uniExtractBytes;\n\
\n\
_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\
_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\
\n\
_viv_uniform float output_scale;\n\
_viv_uniform float output_zp;\n\
\n\
-#define RESIZE_BILINEAR_4X1(input, mean, output) \\\n\
+#define RESIZE_BILINEAR_4X1(input, scale, mean) \\\n\
VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\
@@ -31197,23 +35360,16 @@ _viv_uniform float output_zp;\n\
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
uniConvertIntergetoF32_4x4); \\\n\
\\\n\
- tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \\\n\
- _viv_asm(CONV, dst0, tmp_dst); \\\n\
- VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
- uniExtract8Data_2x8); \\\n\
- _viv_asm(COPY, dst, dst1, 8); \\\n\
- VXC_WriteImage(output, coord_out, dst, \\\n\
- VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \\\n\
+ _viv_asm(CONV, dst0, tmp_dst);\n\
\n\
#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\
-__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
+__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \\\n\
( \\\n\
__read_only image2d_array_t input0, \\\n\
__read_only image2d_array_t input1, \\\n\
__read_only image2d_array_t input2, \\\n\
- __write_only image2d_array_t output0, \\\n\
- __write_only image2d_array_t output1, \\\n\
- __write_only image2d_array_t output2, \\\n\
+ __write_only image2d_array_t output, \\\n\
global int *xRatio, \\\n\
global int *yRatio, \\\n\
global int *xOffset, \\\n\
@@ -31221,7 +35377,10 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float f32Var \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\
@@ -31274,7 +35433,9 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
int4 test02, temp2; \\\n\
int4 tt; \\\n\
vxc_uchar4 val; \\\n\
- int2 coord_out = (int2)(xPos.x, yPos); \\\n\
+ int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\
+ coord_out.x = coord_out.x * 3; \\\n\
+ coord_out.z = coord_out.x + 8; \\\n\
\\\n\
vxc_uchar8 line1, line2; \\\n\
\\\n\
@@ -31297,31 +35458,38 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
uniConvertIntergetoF32_4x4); \\\n\
\\\n\
conv_type dst0; \\\n\
- dst_type dst1; \\\n\
- copy_type dst; \\\n\
- tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\
+ dst_type dst1, dst2; \\\n\
+ copy_type data0, data1, dst; \\\n\
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\
_viv_asm(CONV, dst0, tmp_dst); \\\n\
VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniExtract8Data_2x8); \\\n\
- _viv_asm(COPY, dst, dst1, 8); \\\n\
- VXC_WriteImage(output0, coord_out, dst, \\\n\
- VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ RESIZE_BILINEAR_4X1(input1, g_scale, gMean) \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
\\\n\
- RESIZE_BILINEAR_4X1(input1, gMean, output1) \\\n\
- RESIZE_BILINEAR_4X1(input2, bMean, output2) \\\n\
+ RESIZE_BILINEAR_4X1(input2, b_scale, bMean) \\\n\
+ VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, data0, dst1, 16); \\\n\
+ _viv_asm(COPY, data1, dst2, 16); \\\n\
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni16BitsDataInterleave_0_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni16BitsDataInterleave_1_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8)\n\
RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8)\n\
\n\
#define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \\\n\
-__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
+__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \\\n\
( \\\n\
__read_only image2d_array_t input0, \\\n\
__read_only image2d_array_t input1, \\\n\
__read_only image2d_array_t input2, \\\n\
- __write_only image2d_array_t output0, \\\n\
- __write_only image2d_array_t output1, \\\n\
- __write_only image2d_array_t output2, \\\n\
+ __write_only image2d_array_t output, \\\n\
global int *xRatio, \\\n\
global int *yRatio, \\\n\
global int *xOffset, \\\n\
@@ -31329,7 +35497,10 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float f32Var \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\
@@ -31378,6 +35549,7 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
int4 test01, temp1; \\\n\
int4 test02, temp2; \\\n\
int2 coord_out = (int2)(xPos.x, yPos); \\\n\
+ coord_out.x = coord_out.x * 3; \\\n\
\\\n\
VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
uniVecShift10); \\\n\
@@ -31400,13 +35572,11 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
uniConvertIntergetoF32_4x4); \\\n\
\\\n\
int4 dst0; \\\n\
- write_type dst; \\\n\
- tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\
+ write_type dst1, dst; \\\n\
+ tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\
dst0 = convert_int4_rte(tmp_dst); \\\n\
- VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniExtract8Data_2x8); \\\n\
- \\\n\
- VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
@@ -31438,12 +35608,10 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
uniExtractBytes); \\\n\
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
uniConvertIntergetoF32_4x4); \\\n\
- tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \\\n\
+ tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\
dst0 = convert_int4_rte(tmp_dst); \\\n\
- VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniExtract8Data_2x8); \\\n\
- \\\n\
- VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
@@ -31475,33 +35643,39 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\
uniExtractBytes); \\\n\
VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
uniConvertIntergetoF32_4x4); \\\n\
- tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \\\n\
+ tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\
dst0 = convert_int4_rte(tmp_dst); \\\n\
- VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniExtract8Data_2x8); \\\n\
- \\\n\
- VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni8BitsDataInterleave_0_2x8); \\\n\
+ VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni16BitsDataInterleave_1_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16)\n\
-RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_sep_0_vx*/
+RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)\n\
+"; /* end of pre_process_rgb888_planar_sep_nhwc_0_vx*/
-static const char pre_process_rgb888_planar_sep_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+static const char pre_process_rgb888_planar_sep_nhwc_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\
-_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\
\n\
_viv_uniform float output_scale;\n\
_viv_uniform float output_zp;\n\
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\
+_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\
\n\
#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\
-__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\
+__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \\\n\
( \\\n\
__read_only image2d_array_t input0, \\\n\
__read_only image2d_array_t input1, \\\n\
__read_only image2d_array_t input2, \\\n\
- __write_only image2d_array_t output0, \\\n\
- __write_only image2d_array_t output1, \\\n\
- __write_only image2d_array_t output2, \\\n\
+ __write_only image2d_array_t output, \\\n\
global int *xRatio, \\\n\
global int *yRatio, \\\n\
global int *xOffset, \\\n\
@@ -31509,7 +35683,10 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float f32Var \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\
@@ -31522,58 +35699,50 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\
VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- coord.x = coord.z + 8; \\\n\
- float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\
- rMean * output_scale - output_zp, output_scale); \\\n\
+ int4 coord_out = coord; \\\n\
+ coord_out.z = coord_out.z * 3; \\\n\
+ coord_out.x = coord_out.z + 8; \\\n\
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\\\n\
+ rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\
\\\n\
half4 paramData_f16; \\\n\
- copy_type tmp_dst; \\\n\
+ copy_type data0, data1, data2, dst; \\\n\
_viv_asm(CONV, paramData_f16, paramData0); \\\n\
- VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
uniDataMeanStddevLo_2x8); \\\n\
- VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
- uniDataMeanStddevHi_2x8); \\\n\
- _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
- VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
- _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
- VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
- \\\n\
- float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\
- gMean * output_scale - output_zp, output_scale); \\\n\
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\\\n\
+ gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\
_viv_asm(CONV, paramData_f16, paramData1); \\\n\
- VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
+ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \\\n\
uniDataMeanStddevLo_2x8); \\\n\
- VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
- uniDataMeanStddevHi_2x8); \\\n\
- _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
- VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
- _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
- VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, data0, dst0, 16); \\\n\
\\\n\
- float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\
- bMean * output_scale - output_zp, output_scale); \\\n\
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\\\n\
+ bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\
_viv_asm(CONV, paramData_f16, paramData2); \\\n\
VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
uniDataMeanStddevLo_2x8); \\\n\
- VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\
- uniDataMeanStddevHi_2x8); \\\n\
- _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\
- VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
- _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\
- VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, data1, dst0, 16); \\\n\
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni16BitsDataInterleave_0_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni16BitsDataInterleave_1_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\
RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\
\n\
#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\
-__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\
+__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \\\n\
( \\\n\
__read_only image2d_array_t input0, \\\n\
__read_only image2d_array_t input1, \\\n\
__read_only image2d_array_t input2, \\\n\
- __write_only image2d_array_t output0, \\\n\
- __write_only image2d_array_t output1, \\\n\
- __write_only image2d_array_t output2, \\\n\
+ __write_only image2d_array_t output, \\\n\
global int *xRatio, \\\n\
global int *yRatio, \\\n\
global int *xOffset, \\\n\
@@ -31581,153 +35750,75 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float f32Var \\\n\
+ float r_scale, \\\n\
+ int reverse, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\
\\\n\
coord.xy += (int2) (*xOffset, *yOffset); \\\n\
vxc_uchar16 src0, src1, src2; \\\n\
- write_type dst; \\\n\
+ write_type dst0, dst1, dst2, dst3; \\\n\
\\\n\
VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\
- rMean * output_scale - output_zp, output_scale); \\\n\
+ int4 coord_out = coord; \\\n\
+ coord_out.z = coord_out.z * 3; \\\n\
+ coord_out.x = coord_out.z + 16; \\\n\
+ float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\\\n\
+ rMean * r_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\
\\\n\
half4 paramData_f16; \\\n\
_viv_asm(CONV, paramData_f16, paramData0); \\\n\
\\\n\
- VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniDataMeanStddevLo_2x8); \\\n\
- VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
- uniDataMeanStddevHi_2x8); \\\n\
- VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\
- gMean * output_scale - output_zp, output_scale); \\\n\
+ float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\\\n\
+ gMean * g_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\
_viv_asm(CONV, paramData_f16, paramData1); \\\n\
\\\n\
- VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniDataMeanStddevLo_2x8); \\\n\
- VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
- uniDataMeanStddevHi_2x8); \\\n\
- VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\
- bMean * output_scale - output_zp, output_scale); \\\n\
+ float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\\\n\
+ bMean * b_scale * output_scale - output_zp, \\\n\
+ bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\
_viv_asm(CONV, paramData_f16, paramData2); \\\n\
\\\n\
- VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
uniDataMeanStddevLo_2x8); \\\n\
- VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
- uniDataMeanStddevHi_2x8); \\\n\
- VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni8BitsDataInterleave_0_2x8); \\\n\
+ VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni8BitsDataInterleave_1_2x8); \\\n\
+ VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\
+ uni8BitsDataInterleave_2_2x8); \\\n\
+ VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
}\n\
PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\
PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\
-"; /* end of pre_process_rgb888_planar_sep_1_vx*/
+"; /* end of pre_process_rgb888_planar_sep_nhwc_1_vx*/
-static const char pre_process_rgb888_planar_sep_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+static const char pre_process_rgb888_planar_sep_nhwc_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
-_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\
-_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\
-_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\
-_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\
-_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\
+_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\
\n\
-__kernel void pre_process_rgb888_planar_sep_4over3_U8toU8\n\
+__kernel void pre_process_rgb888_planar_sep_half_U8toU8_nhwc\n\
(\n\
__read_only image2d_array_t input0,\n\
__read_only image2d_array_t input1,\n\
__read_only image2d_array_t input2,\n\
- __write_only image2d_array_t output0,\n\
- __write_only image2d_array_t output1,\n\
- __write_only image2d_array_t output2,\n\
- global int *xRatio,\n\
- global int *yRatio,\n\
- global int *xOffset,\n\
- global int *yOffset,\n\
- float rMean,\n\
- float gMean,\n\
- float bMean,\n\
- float f32Var\n\
- )\n\
-{\n\
- int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
- int4 coord_out;\n\
-\n\
- vxc_uchar16 src0, src1, src2, src3;\n\
- vxc_uchar16 dst0, dst1, dst2;\n\
-\n\
- VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- coord_out.xy = (coord_in.xy >> 2) * 3;\n\
- coord_out.zw = coord_in.yy + (int2)(1, 2);\n\
-\n\
- VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\
- VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\
- VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
- VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
- VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
- VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
- VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
- VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
-\n\
- VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\
- VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\
- VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
- VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
- VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
- VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
- VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
- VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
-\n\
- VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
- VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
-\n\
- VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\
- VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\
- VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
- VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
- VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
- VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\
- VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\
- VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\
-\n\
- VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\
- (\n\
- __read_only image2d_array_t input0,\n\
- __read_only image2d_array_t input1,\n\
- __read_only image2d_array_t input2,\n\
- __write_only image2d_array_t output0,\n\
- __write_only image2d_array_t output1,\n\
- __write_only image2d_array_t output2,\n\
+ __write_only image2d_array_t output,\n\
global int *xRatio,\n\
global int *yRatio,\n\
global int *xOffset,\n\
@@ -31735,7 +35826,10 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\
float rMean,\n\
float gMean,\n\
float bMean,\n\
- float f32Var\n\
+ float r_scale,\n\
+ int reverse,\n\
+ float g_scale,\n\
+ float b_scale\n\
)\n\
{\n\
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\
@@ -31746,13 +35840,28 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\
VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
\n\
- coord_in.zw = coord_in.xy >> 1;\n\
+ int4 coord;\n\
+ coord.xy = coord_in.xy >> 1;\n\
+\n\
+ coord.x = coord.x * 3;\n\
+ coord.z = coord.x + 16;\n\
+\n\
+ vxc_uchar16 dst0, dst1;\n\
+ src0.lo = src0.s02468ace;\n\
+ src0.hi = src1.s02468ace;\n\
+ src1.lo = src2.s02468ace;\n\
\n\
- VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
- VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\
+ uni8BitsDataInterleave_0_2x8);\n\
+ VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\
+ uni8BitsDataInterleave_1_2x8);\n\
+ VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\
+ uni8BitsDataInterleave_2_2x8);\n\
+\n\
+ VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
}\n\
-"; /* end of pre_process_rgb888_planar_sep_2_vx*/
+"; /* end of pre_process_rgb888_planar_sep_nhwc_2_vx*/
static const char pre_process_rgb_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
@@ -31773,6 +35882,8 @@ _viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4;\n\
_viv_uniform VXC_512Bits uniExtractBtoF32_part2_4x4;\n\
_viv_uniform VXC_512Bits uniExtractBtoF32_part3_4x4;\n\
_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
+_viv_uniform float4 param_data;\n\
+_viv_uniform float4 rgb_scale;\n\
\n\
#define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \\\n\
__kernel void pre_process_rgb_copy_U8to##dst_name \\\n\
@@ -31786,9 +35897,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float f32Var, \\\n\
+ float r_scale, \\\n\
int reverse_channel, \\\n\
- int trans \\\n\
+ int trans, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \\\n\
@@ -31802,10 +35915,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
- \\\n\
- f32Var *= outputScale; \\\n\
- float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\
- bMean * f32Var - outputZP, f32Var); \\\n\
\\\n\
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \\\n\
float4 tmp0, tmp1; \\\n\
@@ -31813,8 +35922,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\
\\\n\
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\
- tmp0 = tmp0 * paramData.w - paramData.x; \\\n\
- tmp1 = tmp1 * paramData.w - paramData.x; \\\n\
+ tmp0 = tmp0 * rgb_scale.x - param_data.x; \\\n\
+ tmp1 = tmp1 * rgb_scale.x - param_data.x; \\\n\
_viv_asm(CONV_RTE, result0, tmp0); \\\n\
_viv_asm(CONV_RTE, result1, tmp1); \\\n\
VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
@@ -31824,8 +35933,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\
coord_out.z = 1; \\\n\
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\
- tmp0 = tmp0 * paramData.w - paramData.y; \\\n\
- tmp1 = tmp1 * paramData.w - paramData.y; \\\n\
+ tmp0 = tmp0 * rgb_scale.y - param_data.y; \\\n\
+ tmp1 = tmp1 * rgb_scale.y - param_data.y; \\\n\
_viv_asm(CONV_RTE, result0, tmp0); \\\n\
_viv_asm(CONV_RTE, result1, tmp1); \\\n\
VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
@@ -31835,8 +35944,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\
coord_out.z = b_order; \\\n\
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\
- tmp0 = tmp0 * paramData.w - paramData.z; \\\n\
- tmp1 = tmp1 * paramData.w - paramData.z; \\\n\
+ tmp0 = tmp0 * rgb_scale.z - param_data.z; \\\n\
+ tmp1 = tmp1 * rgb_scale.z - param_data.z; \\\n\
_viv_asm(CONV_RTE, result0, tmp0); \\\n\
_viv_asm(CONV_RTE, result1, tmp1); \\\n\
VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
@@ -31858,9 +35967,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float f32Var, \\\n\
+ float r_scale, \\\n\
int reverse_channel, \\\n\
- int trans \\\n\
+ int trans, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \\\n\
@@ -31875,10 +35986,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\
coord.x += 16; \\\n\
VXC_ReadImage(src2, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
- \\\n\
- f32Var *= outputScale; \\\n\
- float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\
- bMean * f32Var - outputZP, f32Var); \\\n\
\\\n\
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \\\n\
float4 tmp0, tmp1; \\\n\
@@ -31886,15 +35993,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\
\\\n\
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\
- tmp0 = tmp0 * paramData.w - paramData.x; \\\n\
- tmp1 = tmp1 * paramData.w - paramData.x; \\\n\
+ tmp0 = tmp0 * rgb_scale.x - param_data.x; \\\n\
+ tmp1 = tmp1 * rgb_scale.x - param_data.x; \\\n\
result0 = convert_int4_rte(tmp0); \\\n\
result1 = convert_int4_rte(tmp1); \\\n\
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part2_4x4); \\\n\
VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part3_4x4); \\\n\
- tmp0 = tmp0 * paramData.w - paramData.x; \\\n\
- tmp1 = tmp1 * paramData.w - paramData.x; \\\n\
+ tmp0 = tmp0 * rgb_scale.x - param_data.x; \\\n\
+ tmp1 = tmp1 * rgb_scale.x - param_data.x; \\\n\
result0 = convert_int4_rte(tmp0); \\\n\
result1 = convert_int4_rte(tmp1); \\\n\
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
@@ -31903,15 +36010,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\
coord_out.z = 1; \\\n\
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\
- tmp0 = tmp0 * paramData.w - paramData.y; \\\n\
- tmp1 = tmp1 * paramData.w - paramData.y; \\\n\
+ tmp0 = tmp0 * rgb_scale.y - param_data.y; \\\n\
+ tmp1 = tmp1 * rgb_scale.y - param_data.y; \\\n\
result0 = convert_int4_rte(tmp0); \\\n\
result1 = convert_int4_rte(tmp1); \\\n\
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part2_4x4); \\\n\
VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part3_4x4); \\\n\
- tmp0 = tmp0 * paramData.w - paramData.y; \\\n\
- tmp1 = tmp1 * paramData.w - paramData.y; \\\n\
+ tmp0 = tmp0 * rgb_scale.y - param_data.y; \\\n\
+ tmp1 = tmp1 * rgb_scale.y - param_data.y; \\\n\
result0 = convert_int4_rte(tmp0); \\\n\
result1 = convert_int4_rte(tmp1); \\\n\
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
@@ -31920,15 +36027,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\
coord_out.z = b_order; \\\n\
VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\
VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\
- tmp0 = tmp0 * paramData.w - paramData.z; \\\n\
- tmp1 = tmp1 * paramData.w - paramData.z; \\\n\
+ tmp0 = tmp0 * rgb_scale.z - param_data.z; \\\n\
+ tmp1 = tmp1 * rgb_scale.z - param_data.z; \\\n\
result0 = convert_int4_rte(tmp0); \\\n\
result1 = convert_int4_rte(tmp1); \\\n\
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part2_4x4); \\\n\
VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part3_4x4); \\\n\
- tmp0 = tmp0 * paramData.w - paramData.z; \\\n\
- tmp1 = tmp1 * paramData.w - paramData.z; \\\n\
+ tmp0 = tmp0 * rgb_scale.z - param_data.z; \\\n\
+ tmp1 = tmp1 * rgb_scale.z - param_data.z; \\\n\
result0 = convert_int4_rte(tmp0); \\\n\
result1 = convert_int4_rte(tmp1); \\\n\
VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
@@ -31989,9 +36096,11 @@ __kernel void pre_process_yuv420_copy_##name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float var, \\\n\
+ float r_scale, \\\n\
int reverse_channel, \\\n\
- int trans \\\n\
+ int trans, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \\\n\
@@ -32050,17 +36159,23 @@ __kernel void pre_process_yuv420_copy_##name \\\n\
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\
\\\n\
- var *= output_scale; \\\n\
- float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \\\n\
- rMean * var - output_zp, var); \\\n\
+ float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp,\\\n\
+ gMean * g_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \\\n\
half4 paramData_f16; \\\n\
_viv_asm(CONV, paramData_f16, paramData); \\\n\
\\\n\
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \\\n\
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \\\n\
+ \\\n\
+ paramData.w = g_scale * output_scale; \\\n\
+ _viv_asm(CONV, paramData_f16, paramData); \\\n\
\\\n\
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \\\n\
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \\\n\
+ \\\n\
+ paramData.w = r_scale * output_scale; \\\n\
+ _viv_asm(CONV, paramData_f16, paramData); \\\n\
\\\n\
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \\\n\
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \\\n\
@@ -32090,9 +36205,11 @@ __kernel void pre_process_yuv420_copy_##name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float var, \\\n\
+ float r_scale, \\\n\
int reverse_channel, \\\n\
- int trans \\\n\
+ int trans, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \\\n\
@@ -32142,18 +36259,22 @@ __kernel void pre_process_yuv420_copy_##name \\\n\
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\
\\\n\
- var *= output_scale; \\\n\
- float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \\\n\
- rMean * var - output_zp, var); \\\n\
+ float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\
+ gMean * g_scale * output_scale - output_zp, \\\n\
+ rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \\\n\
half4 paramData_f16; \\\n\
_viv_asm(CONV, paramData_f16, paramData); \\\n\
\\\n\
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \\\n\
VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \\\n\
\\\n\
+ paramData.w = g_scale * output_scale; \\\n\
+ _viv_asm(CONV, paramData_f16, paramData); \\\n\
VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \\\n\
VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \\\n\
\\\n\
+ paramData.w = r_scale * output_scale; \\\n\
+ _viv_asm(CONV, paramData_f16, paramData); \\\n\
VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \\\n\
VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \\\n\
\\\n\
@@ -32228,9 +36349,11 @@ __kernel void pre_process_yuv420_scale_##name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float var, \\\n\
+ float r_scale, \\\n\
int reverse_channel, \\\n\
- int trans \\\n\
+ int trans, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int4 gidx = get_global_id(0); \\\n\
@@ -32379,7 +36502,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\
float4 tmpDst; \\\n\
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\
- tmpDst = (tmpDst - bMean) * var; \\\n\
+ tmpDst = (tmpDst - bMean) * b_scale; \\\n\
dstPos.z = bOrder; \\\n\
result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
@@ -32393,7 +36516,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\
temp2 = fx * tmpData0 + tmpData1; \\\n\
result = fy * temp2 + (temp1 << 10); \\\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\
- tmpDst = (tmpDst - gMean) * var; \\\n\
+ tmpDst = (tmpDst - gMean) * g_scale; \\\n\
dstPos.z = 1; \\\n\
result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
@@ -32407,7 +36530,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\
temp2 = fx * tmpData0 + tmpData1; \\\n\
result = fy * temp2 + (temp1 << 10); \\\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\
- tmpDst = (tmpDst - rMean) * var; \\\n\
+ tmpDst = (tmpDst - rMean) * r_scale; \\\n\
dstPos.z = rOrder; \\\n\
result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\
@@ -32467,9 +36590,11 @@ __kernel void pre_process_yuv420_scale_##name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float var, \\\n\
+ float r_scale, \\\n\
int reverse_channel, \\\n\
- int trans \\\n\
+ int trans, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int4 gidx = get_global_id(0); \\\n\
@@ -32620,7 +36745,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\
float4 tmpDst; \\\n\
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\
- tmpDst = (tmpDst - bMean) * var; \\\n\
+ tmpDst = (tmpDst - bMean) * b_scale; \\\n\
dstPos.z = bOrder; \\\n\
tmpDst = tmpDst * output_scale + output_zp; \\\n\
_viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\
@@ -32636,7 +36761,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\
temp2 = fx * tmpData0 + tmpData1; \\\n\
result = fy * temp2 + (temp1 << 10); \\\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\
- tmpDst = (tmpDst - gMean) * var; \\\n\
+ tmpDst = (tmpDst - gMean) * g_scale; \\\n\
dstPos.z = 1; \\\n\
tmpDst = tmpDst * output_scale + output_zp; \\\n\
_viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\
@@ -32652,7 +36777,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\
temp2 = fx * tmpData0 + tmpData1; \\\n\
result = fy * temp2 + (temp1 << 10); \\\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\
- tmpDst = (tmpDst - rMean) * var; \\\n\
+ tmpDst = (tmpDst - rMean) * r_scale; \\\n\
dstPos.z = rOrder; \\\n\
tmpDst = tmpDst * output_scale + output_zp; \\\n\
_viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\
@@ -32669,7 +36794,9 @@ static const char pre_process_yuv422_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n
_viv_uniform int bOrder;\n\
_viv_uniform int rOrder;\n\
\n\
-_viv_uniform float outputScaleVar;\n\
+_viv_uniform float outputScaleVar_b;\n\
+_viv_uniform float outputScaleVar_g;\n\
+_viv_uniform float outputScaleVar_r;\n\
_viv_uniform float bMeanScaleVarZp;\n\
_viv_uniform float gMeanScaleVarZp;\n\
_viv_uniform float rMeanScaleVarZp;\n\
@@ -32693,10 +36820,12 @@ __kernel void pre_process_yuv422_copy_##name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float var, \\\n\
+ float r_scale, \\\n\
int reverse_channel, \\\n\
int trans, \\\n\
- int yuv422_type \\\n\
+ int yuv422_type, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int gidx = get_global_id(0); \\\n\
@@ -32726,21 +36855,21 @@ __kernel void pre_process_yuv422_copy_##name \\\n\
dst_type dst0; \\\n\
save_type dst; \\\n\
int4 dstPos = (int4)(gidx, gidy, 0, 0); \\\n\
- tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\
+ tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstB); \\\n\
dstPos.z = bOrder; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\
_viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\
+ tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstG); \\\n\
dstPos.z = 1; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\
_viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\
+ tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstR); \\\n\
dstPos.z = rOrder; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\
@@ -32758,7 +36887,10 @@ static const char pre_process_yuv422_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\
_viv_uniform int bOrder;\n\
_viv_uniform int rOrder;\n\
\n\
-_viv_uniform float outputScaleVar;\n\
+_viv_uniform float outputScaleVar_b;\n\
+_viv_uniform float outputScaleVar_g;\n\
+_viv_uniform float outputScaleVar_r;\n\
+\n\
_viv_uniform float bMeanScaleVarZp;\n\
_viv_uniform float gMeanScaleVarZp;\n\
_viv_uniform float rMeanScaleVarZp;\n\
@@ -32788,10 +36920,12 @@ __kernel void pre_process_yuv422_scale_##name \\\n\
float rMean, \\\n\
float gMean, \\\n\
float bMean, \\\n\
- float var, \\\n\
+ float r_scale, \\\n\
int reverse_channel, \\\n\
int trans, \\\n\
- int yuv422_type \\\n\
+ int yuv422_type, \\\n\
+ float g_scale, \\\n\
+ float b_scale \\\n\
) \\\n\
{ \\\n\
int4 gidx = get_global_id(0); \\\n\
@@ -32863,21 +36997,21 @@ __kernel void pre_process_yuv422_scale_##name \\\n\
dst_type dst0; \\\n\
save_type dst; \\\n\
int4 dstPos = (int4)(gidx.x, gidy, 0, 0); \\\n\
- tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\
+ tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstB); \\\n\
dstPos.z = bOrder; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\
_viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\
+ tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstG); \\\n\
dstPos.z = 1; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\
_viv_asm(COPY, dst, dst0, copy_bytes); \\\n\
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\
\\\n\
- tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\
+ tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\
_viv_asm(CONV_RTE, result, tmpDstR); \\\n\
dstPos.z = rOrder; \\\n\
VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\
@@ -32939,9 +37073,11 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\
float rMean,\n\
float gMean,\n\
float bMean,\n\
- float var,\n\
+ float r_scale,\n\
int reverse_channel,\n\
- int trans\n\
+ int trans,\n\
+ float g_scale,\n\
+ float b_scale\n\
)\n\
{\n\
int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));\n\
@@ -33000,17 +37136,22 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\
\n\
- var *= outputScale;\n\
- float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\
- rMean * var - zp, var);\n\
+ float4 paramData = (float4)(bMean * b_scale * outputScale - zp, gMean * g_scale * outputScale - zp,\\\n\
+ rMean * r_scale * outputScale - zp, b_scale * outputScale);\n\
half4 paramData_f16;\n\
_viv_asm(CONV, paramData_f16, paramData);\n\
\n\
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\
+\n\
+ paramData.w = g_scale * outputScale;\n\
+ _viv_asm(CONV, paramData_f16, paramData);\n\
\n\
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\
VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\
+\n\
+ paramData.w = r_scale * outputScale;\n\
+ _viv_asm(CONV, paramData_f16, paramData);\n\
\n\
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\
VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\
@@ -33035,9 +37176,11 @@ __kernel void pre_process_yuv444_copy_U8toF16(\n\
float rMean,\n\
float gMean,\n\
float bMean,\n\
- float var,\n\
+ float r_scale,\n\
int reverse_channel,\n\
- int trans\n\
+ int trans,\n\
+ float g_scale,\n\
+ float b_scale\n\
)\n\
{\n\
int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));\n\
@@ -33097,16 +37240,22 @@ __kernel void pre_process_yuv444_copy_U8toF16(\n\
VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\
\n\
- float4 paramData = (float4)(bMean * var, gMean * var,\\\n\
- rMean * var, var);\n\
+ float4 paramData = (float4)(bMean * b_scale * outputScale, gMean * g_scale * outputScale,\\\n\
+ rMean * r_scale * outputScale, b_scale * outputScale);\n\
half4 paramData_f16;\n\
_viv_asm(CONV, paramData_f16, paramData);\n\
\n\
VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\
VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\
+\n\
+ paramData.w = g_scale * outputScale;\n\
+ _viv_asm(CONV, paramData_f16, paramData);\n\
\n\
VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\
VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\
+\n\
+ paramData.w = r_scale * outputScale;\n\
+ _viv_asm(CONV, paramData_f16, paramData);\n\
\n\
VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\
VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\
@@ -33171,7 +37320,8 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\
__read_only image2d_t y_img, __read_only image2d_t u_img, \\\n\
__read_only image2d_t v_img, __write_only image2d_array_t output, \\\n\
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \\\n\
- float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \\\n\
+ float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans, \\\n\
+ float g_scale, float b_scale) \\\n\
{ \\\n\
int4 gidx = get_global_id(0); \\\n\
int gidy = get_global_id(1); \\\n\
@@ -33283,7 +37433,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\
float4 tmpDst; \\\n\
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\
- tmpDst = (tmpDst - bMean) * var; \\\n\
+ tmpDst = (tmpDst - bMean) * b_scale; \\\n\
dstPos.z = bOrder; \\\n\
result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\
@@ -33297,7 +37447,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\
temp2 = fx * tmpData0 + tmpData1; \\\n\
result = fy * temp2 + (temp1 << 10); \\\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\
- tmpDst = (tmpDst - gMean) * var; \\\n\
+ tmpDst = (tmpDst - gMean) * g_scale; \\\n\
dstPos.z = 1; \\\n\
result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\
@@ -33311,7 +37461,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\
temp2 = fx * tmpData0 + tmpData1; \\\n\
result = fy * temp2 + (temp1 << 10); \\\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\
- tmpDst = (tmpDst - rMean) * var; \\\n\
+ tmpDst = (tmpDst - rMean) * r_scale; \\\n\
dstPos.z = rOrder; \\\n\
result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\
VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\
@@ -33360,7 +37510,8 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\
__read_only image2d_t y_img, __read_only image2d_t u_img,\n\
__read_only image2d_t v_img, __write_only image2d_array_t output,\n\
global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\
- float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\
+ float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans,\n\
+ float g_scale, float b_scale)\n\
{\n\
int4 gidx = get_global_id(0);\n\
int gidy = get_global_id(1);\n\
@@ -33480,7 +37631,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\
float4 tmpDst;\n\
int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\
- tmpDst = (tmpDst - bMean) * var;\n\
+ tmpDst = (tmpDst - bMean) * b_scale;\n\
dstPos.z = bOrder;\n\
_viv_asm(CONV, hDst, tmpDst);\n\
VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\
@@ -33495,7 +37646,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\
temp2 = fx * tmpData0 + tmpData1;\n\
result = fy * temp2 + (temp1 << 10);\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\
- tmpDst = (tmpDst - gMean) * var;\n\
+ tmpDst = (tmpDst - gMean) * g_scale;\n\
dstPos.z = 1;\n\
_viv_asm(CONV, hDst, tmpDst);\n\
VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\
@@ -33510,7 +37661,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\
temp2 = fx * tmpData0 + tmpData1;\n\
result = fy * temp2 + (temp1 << 10);\n\
VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\
- tmpDst = (tmpDst - rMean) * var;\n\
+ tmpDst = (tmpDst - rMean) * r_scale;\n\
dstPos.z = rOrder;\n\
_viv_asm(CONV, hDst, tmpDst);\n\
VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\
@@ -37154,7 +41305,6 @@ static const char resize_1d_bilinear_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\
_viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\
-_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\
_viv_uniform float scale_x;\n\
_viv_uniform int out_height;\n\
@@ -37215,8 +41365,10 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN\n\
\n\
_viv_asm(COPY, src_half, src, 16);\n\
\n\
- VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);\n\
- VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);\n\
+ VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniConvertFp2FP32_left_4x4);\n\
+ VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniConvertFp2FP32_right_4x4);\n\
right4 -= left4;\n\
float4 dst4 = right4 * x_lerp + left4;\n\
\n\
@@ -37281,8 +41433,10 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN\n\
\n\
_viv_asm(COPY, src_half, src, 16);\n\
\n\
- VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);\n\
- VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);\n\
+ VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniConvertFp2FP32_left_4x4);\n\
+ VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\
+ uniConvertFp2FP32_right_4x4);\n\
right4 -= left4;\n\
float4 dst4 = right4 * x_lerp + left4;\n\
\n\
@@ -41782,6 +45936,580 @@ __kernel void scatter_nd_update_F16F16toU8_big(\n\
}\n\
"; /* end of scatter_nd_update_big_vx*/
+static const char scatter_nd_update_fp_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform int update_width;\n\
+_viv_uniform int output_width;\n\
+_viv_uniform int ref_stride;\n\
+_viv_uniform int output_stride;\n\
+\n\
+_viv_uniform int4 coord_stride;\n\
+_viv_uniform int4 coord_stride1;\n\
+_viv_uniform float inout_scale;\n\
+_viv_uniform float output_zp;\n\
+\n\
+_viv_uniform VXC_512Bits uniConvertFp16ToFp32_4x4;\n\
+\n\
+_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\
+_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\
+\n\
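+/* Emulate an atomic float add with an atomic_cmpxchg retry loop. */\n\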
+inline void AtomicAdd_float(volatile __global float *source, const float operand)\n\
+{\n\
+ union\n\
+ {\n\
+ unsigned int intVal;\n\
+ float floatVal;\n\
+ } newVal;\n\
+ union\n\
+ {\n\
+ unsigned int intVal;\n\
+ float floatVal;\n\
+ } prevVal;\n\
+ do\n\
+ {\n\
+ prevVal.floatVal = *source;\n\
+ newVal.floatVal = prevVal.floatVal + operand;\n\
+ } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\
+ prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\
+}\n\
+\n\
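+/* Scatter one F16 update element per work-item into the float accumulation buffer;\n\
+ * the flat destination index is built from the index tensor via coord_stride/coord_stride1. */\n\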
+__kernel void scatter_nd_update_update_F16(\n\
+ __read_only image2d_t index,\n\
+ __read_only image2d_t update,\n\
+ image2d_t temp_buf_float,\n\
+ image2d_t link_buffer0,\n\
+ int width, int area, int vol, int val4,\n\
+ int val5, int val6, int val7, int coord_dim)\n\
+{\n\
+ int gidx = get_global_id(0);\n\
+ int gidy = get_global_id(1);\n\
+ Image img1 = create_image_from_image2d(index, 4);\n\
+ Image img2 = create_image_from_image2d(update, 2);\n\
+ Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\
+ __global int* index_ptr = (__global int*)img1.ptr;\n\
+ __global short* update_ptr = (__global short*)img2.ptr;\n\
+ __global float* output_ptr = (__global float*)img3.ptr;\n\
+ half src;\n\
+\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\
+ short tmpData = update_ptr[gidy * update_width + gidx];\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\
+ int loc = idx * output_width + gidx;\n\
+ _viv_asm(COPY, src, tmpData, 4);\n\
+ float data;\n\
+ _viv_asm(CONV, data, src);\n\
+ AtomicAdd_float(output_ptr + loc, data);\n\
+}\n\
+\n\
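+/* Vector variant: each work-item accumulates four consecutive F16 update elements. */\n\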
+__kernel void scatter_nd_update_update_F16_4X(\n\
+ __read_only image2d_t index,\n\
+ __read_only image2d_t update,\n\
+ image2d_t temp_buf_float,\n\
+ image2d_t link_buffer0,\n\
+ int width, int area, int vol, int val4,\n\
+ int val5, int val6, int val7, int coord_dim)\n\
+{\n\
+ int gidx = get_global_id(0);\n\
+ int gidy = get_global_id(1);\n\
+ Image img1 = create_image_from_image2d(index, 4);\n\
+ Image img2 = create_image_from_image2d(update, 2);\n\
+ Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\
+ __global int* index_ptr = (__global int*)img1.ptr;\n\
+ __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr;\n\
+ __global float* output_ptr = (__global float*)img3.ptr;\n\
+ vxc_half4 src;\n\
+\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\
+ vxc_short4 tmpData = update_ptr[gidy * update_width + gidx];\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\
+ int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3);\n\
+\n\
+ _viv_asm(COPY, src, tmpData, 8);\n\
+ float4 data;\n\
+ VXC_DP4x4(data, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1),\n\
+ uniConvertFp16ToFp32_4x4);\n\
+ AtomicAdd_float(output_ptr + loc.x, data.x);\n\
+ AtomicAdd_float(output_ptr + loc.y, data.y);\n\
+ AtomicAdd_float(output_ptr + loc.z, data.z);\n\
+ AtomicAdd_float(output_ptr + loc.w, data.w);\n\
+}\n\
+\n\
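+/* BF16 variant: widen each 16-bit update to float by moving it into the upper half\n\
+ * of a 32-bit word (uniConvBF16toF32_Part0_2x8) before accumulation. */\n\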
+__kernel void scatter_nd_update_update_BF16(\n\
+ __read_only image2d_t index,\n\
+ __read_only image2d_t update,\n\
+ image2d_t temp_buf_float,\n\
+ image2d_t link_buffer0,\n\
+ int width, int area, int vol, int val4,\n\
+ int val5, int val6, int val7, int coord_dim)\n\
+{\n\
+ int gidx = get_global_id(0);\n\
+ int gidy = get_global_id(1);\n\
+ Image img1 = create_image_from_image2d(index, 4);\n\
+ Image img2 = create_image_from_image2d(update, 2);\n\
+ Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\
+ __global int* index_ptr = (__global int*)img1.ptr;\n\
+ __global short* update_ptr = (__global short*)img2.ptr;\n\
+ __global float* output_ptr = (__global float*)img3.ptr;\n\
+ float data;\n\
+\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\
+ short tmpData = update_ptr[gidy * update_width + gidx];\n\
+ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\
+ vxc_short8 src0, src1;\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\
+ int loc = idx * output_width + gidx;\n\
+ _viv_asm(COPY, src0, tmpData, 4);\n\
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\
+ uniConvBF16toF32_Part0_2x8);\n\
+ _viv_asm(COPY, data, src1, 4);\n\
+ AtomicAdd_float(output_ptr + loc, data);\n\
+}\n\
+\n\
+__kernel void scatter_nd_update_update_BF16_4X(\n\
+ __read_only image2d_t index,\n\
+ __read_only image2d_t update,\n\
+ image2d_t temp_buf_float,\n\
+ image2d_t link_buffer0,\n\
+ int width, int area, int vol, int val4,\n\
+ int val5, int val6, int val7, int coord_dim)\n\
+{\n\
+ int gidx = get_global_id(0);\n\
+ int gidy = get_global_id(1);\n\
+ Image img1 = create_image_from_image2d(index, 4);\n\
+ Image img2 = create_image_from_image2d(update, 2);\n\
+ Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\
+ __global int* index_ptr = (__global int*)img1.ptr;\n\
+ __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr;\n\
+ __global float* output_ptr = (__global float*)img3.ptr;\n\
+\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\
+ vxc_short4 tmpData = update_ptr[gidy * update_width + gidx];\n\
+ vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\
+ vxc_short8 src0, src1;\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\
+ int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3);\n\
+\n\
+ _viv_asm(COPY, src0, tmpData, 8);\n\
+ float4 data;\n\
+ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\
+ uniConvBF16toF32_Part0_2x8);\n\
+ _viv_asm(COPY, data, src1, 16);\n\
+ AtomicAdd_float(output_ptr + loc.x, data.x);\n\
+ AtomicAdd_float(output_ptr + loc.y, data.y);\n\
+ AtomicAdd_float(output_ptr + loc.z, data.z);\n\
+ AtomicAdd_float(output_ptr + loc.w, data.w);\n\
+}\n\
+\n\
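+/* Requantize the accumulated I32/F32 buffer back to F16 and store it into temp_ref. */\n\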
+#define SCATTER_ND_UPDATE_REF_FP16(type0, type1, ptr_type) \\\n\
+__kernel void scatter_nd_update_ref_##type0##to##type1( \\\n\
+ __read_only image2d_t index, \\\n\
+ __read_only image2d_t update, \\\n\
+ __read_only image2d_t temp_buf_int, \\\n\
+ image2d_t temp_ref, \\\n\
+ image2d_t link_buffer0, \\\n\
+ image2d_t link_buffer1, \\\n\
+ int width, int area, int vol, int val4, \\\n\
+ int val5, int val6, int val7, int coord_dim) \\\n\
+{ \\\n\
+ int gidx = get_global_id(0); \\\n\
+ int gidy = get_global_id(1); \\\n\
+ Image img1 = create_image_from_image2d(index, 4); \\\n\
+ Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\
+ Image img3 = create_image_from_image2d(temp_ref, 2); \\\n\
+ __global int* index_ptr = (__global int*)img1.ptr; \\\n\
+ __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \\\n\
+ __global short* ref_ptr = (__global short*)img3.ptr; \\\n\
+ \\\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\
+ int loc = idx * output_stride + gidx; \\\n\
+ float4 tmpData; \\\n\
+ tmpData.x = convert_float(acc_ptr[loc]) * inout_scale + output_zp; \\\n\
+ half4 data; \\\n\
+ short tmpDst; \\\n\
+ _viv_asm(CONV, data, tmpData); \\\n\
+ _viv_asm(COPY, tmpDst, data, 4); \\\n\
+ ref_ptr[loc] = tmpDst; \\\n\
+}\n\
+SCATTER_ND_UPDATE_REF_FP16(I32, F16, int)\n\
+SCATTER_ND_UPDATE_REF_FP16(F32, F16, float)\n\
+\n\
+#define SCATTER_ND_UPDATE_REF_FP16_4X(type0, type1, ptr_type) \\\n\
+__kernel void scatter_nd_update_ref_##type0##to##type1##_4X( \\\n\
+ __read_only image2d_t index, \\\n\
+ __read_only image2d_t update, \\\n\
+ __read_only image2d_t temp_buf_int, \\\n\
+ image2d_t temp_ref, \\\n\
+ image2d_t link_buffer0, \\\n\
+ image2d_t link_buffer1, \\\n\
+ int width, int area, int vol, int val4, \\\n\
+ int val5, int val6, int val7, int coord_dim) \\\n\
+{ \\\n\
+ int gidx = get_global_id(0); \\\n\
+ int gidy = get_global_id(1); \\\n\
+ Image img1 = create_image_from_image2d(index, 4); \\\n\
+ Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\
+ Image img3 = create_image_from_image2d(temp_ref, 2); \\\n\
+ __global int* index_ptr = (__global int*)img1.ptr; \\\n\
+ __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \\\n\
+ __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr; \\\n\
+ \\\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\
+ float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \\\n\
+ int loc = idx * output_stride + gidx; \\\n\
+ float4 tmpVal = tmpData * inout_scale + output_zp; \\\n\
+ half4 data; \\\n\
+ vxc_short8 tmpDst; \\\n\
+ _viv_asm(CONV, data, tmpVal); \\\n\
+ _viv_asm(COPY, tmpDst, data, 16); \\\n\
+ ref_ptr[loc] = tmpDst.s0246; \\\n\
+}\n\
+SCATTER_ND_UPDATE_REF_FP16_4X(I32, F16, int)\n\
+SCATTER_ND_UPDATE_REF_FP16_4X(F32, F16, float)\n\
+\n\
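+/* Convert the accumulated float back to BF16 by keeping the upper 16 bits of each\n\
+ * 32-bit word (uniExtractOddData_2x8). */\n\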
+__kernel void scatter_nd_update_ref_F32toBF16(\n\
+ __read_only image2d_t index,\n\
+ __read_only image2d_t update,\n\
+ __read_only image2d_t temp_buf_int,\n\
+ image2d_t temp_ref,\n\
+ image2d_t link_buffer0,\n\
+ image2d_t link_buffer1,\n\
+ int width, int area, int vol, int val4,\n\
+ int val5, int val6, int val7, int coord_dim)\n\
+{\n\
+ int gidx = get_global_id(0);\n\
+ int gidy = get_global_id(1);\n\
+ Image img1 = create_image_from_image2d(index, 4);\n\
+ Image img2 = create_image_from_image2d(temp_buf_int, 4);\n\
+ Image img3 = create_image_from_image2d(temp_ref, 2);\n\
+ __global int* index_ptr = (__global int*)img1.ptr;\n\
+ __global float* acc_ptr = (__global float*)img2.ptr;\n\
+ __global short* ref_ptr = (__global short*)img3.ptr;\n\
+\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\
+ int loc = idx * output_stride + gidx;\n\
+ float tmpData;\n\
+ tmpData = acc_ptr[loc];\n\
+ vxc_ushort8 src0, src2;\n\
+ _viv_asm(COPY, src0, tmpData, 4);\n\
+ VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\
+\n\
+ ref_ptr[loc] = src2.x;\n\
+}\n\
+\n\
+__kernel void scatter_nd_update_ref_F32toBF16_4X(\n\
+ __read_only image2d_t index,\n\
+ __read_only image2d_t update,\n\
+ __read_only image2d_t temp_buf_int,\n\
+ image2d_t temp_ref,\n\
+ image2d_t link_buffer0,\n\
+ image2d_t link_buffer1,\n\
+ int width, int area, int vol, int val4,\n\
+ int val5, int val6, int val7, int coord_dim)\n\
+{\n\
+ int gidx = get_global_id(0);\n\
+ int gidy = get_global_id(1);\n\
+ Image img1 = create_image_from_image2d(index, 4);\n\
+ Image img2 = create_image_from_image2d(temp_buf_int, 4);\n\
+ Image img3 = create_image_from_image2d(temp_ref, 2);\n\
+ __global int* index_ptr = (__global int*)img1.ptr;\n\
+ __global float* acc_ptr = (__global float*)img2.ptr;\n\
+ __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr;\n\
+\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\
+ float4 tmpData = vload4(gidx, acc_ptr + idx * ref_stride);\n\
+ int loc = idx * output_stride + gidx;\n\
+ vxc_short8 src0, src2;\n\
+ _viv_asm(COPY, src0, tmpData, 16);\n\
+ VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\
+ ref_ptr[loc] = src2.s0123;\n\
+}\n\
+"; /* end of scatter_nd_update_fp_vx*/
+
+static const char scatter_nd_update_qint_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\
+_viv_uniform int update_width;\n\
+_viv_uniform int output_width;\n\
+_viv_uniform int ref_stride;\n\
+_viv_uniform int output_stride;\n\
+_viv_uniform int2 multAndoutZP0;\n\
+\n\
+_viv_uniform int4 coord_stride;\n\
+_viv_uniform int4 coord_stride1;\n\
+\n\
+_viv_uniform float output_zp;\n\
+_viv_uniform int input_zp;\n\
+_viv_uniform float input_scale;\n\
+_viv_uniform float inout_scale;\n\
+_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\
+_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\
+\n\
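+/* Requantize the original reference tensor into temp_ref and zero the accumulation\n\
+ * buffer; the gidx < res branch covers the trailing scalar elements. */\n\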
+#define SCATTER_RESET(name0, name1, ptr0, ptr1, type0, type1, len0, len1, size0, size1, ptr2, ptr3, len3) \\\n\
+__kernel void scatter_nd_update_reset_##name0##to##name1( \\\n\
+ __read_only image2d_t input_ref, \\\n\
+ image2d_t temp_ref, \\\n\
+ image2d_t temp_buf_int, \\\n\
+ int length, int res) \\\n\
+{ \\\n\
+ int gidx = get_global_id(0); \\\n\
+ Image img1 = create_image_from_image2d(input_ref, size0); \\\n\
+ Image img2 = create_image_from_image2d(temp_ref, size1); \\\n\
+ Image img3 = create_image_from_image2d(temp_buf_int, 4); \\\n\
+ __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \\\n\
+ __global ptr1* output_ptr = (__global ptr1*)img2.ptr; \\\n\
+ __global int* tmp_update_ptr = (__global int*)img3.ptr; \\\n\
+ ptr0 tmpData = input_ptr[gidx]; \\\n\
+ int4 zeros = (int4)(0); \\\n\
+ int loc2 = gidx * 8; \\\n\
+ type0 src; \\\n\
+ type1 tmpDst; \\\n\
+ ptr1 dst; \\\n\
+ vxc_ushort8 ms0; \\\n\
+ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\
+ _viv_asm(COPY, src, tmpData, len0); \\\n\
+ VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ _viv_asm(COPY, dst, tmpDst, len1); \\\n\
+ output_ptr[gidx] = dst; \\\n\
+ vstore4(zeros, 0, tmp_update_ptr + loc2); \\\n\
+ vstore4(zeros, 1, tmp_update_ptr + loc2); \\\n\
+ if(gidx < res) \\\n\
+ { \\\n\
+ __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \\\n\
+ __global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \\\n\
+ ptr2 tmpData1 = input_ptr1[length + gidx]; \\\n\
+ ptr3 dst1; \\\n\
+ dst1 ^= dst1; \\\n\
+ tmp_update_ptr[length + gidx] = 0; \\\n\
+ _viv_asm(COPY, src, tmpData1, 4); \\\n\
+ VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniU8MulAndPostShift_0_Lo_2x8); \\\n\
+ _viv_asm(COPY, dst1, tmpDst, len3); \\\n\
+ output_ptr1[length + gidx] = dst1; \\\n\
+ } \\\n\
+}\n\
+SCATTER_RESET(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, 8, 8, 1, 1, uchar, uchar, 1)\n\
+SCATTER_RESET(I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, 8, 8, 1, 1, char, char, 1)\n\
+SCATTER_RESET(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, 16, 16, 2, 2, short, short, 2)\n\
+SCATTER_RESET(F16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_half8, 16, 16, 2, 2, short, short, 2)\n\
+SCATTER_RESET(U8, F16, vxc_uchar8, vxc_short8, vxc_uchar8, vxc_half8, 8, 16, 1, 2, uchar, short, 2)\n\
+SCATTER_RESET(I8, F16, vxc_char8, vxc_short8, vxc_char8, vxc_half8, 8, 16, 1, 2, char, short, 2)\n\
+SCATTER_RESET(I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, 16, 8, 2, 1, short, short, 2)\n\
+SCATTER_RESET(F16, U8, vxc_short8, vxc_uchar8, vxc_half8, vxc_uchar8, 16, 8, 2, 1, short, uchar, 1)\n\
+\n\
+__kernel void scatter_nd_update_reset_BF16toBF16(\n\
+ __read_only image2d_t input_ref,\n\
+ image2d_t temp_ref,\n\
+ image2d_t temp_buf_int)\n\
+{\n\
+ int gidx = get_global_id(0);\n\
+ Image img1 = create_image_from_image2d(input_ref, 2);\n\
+ Image img2 = create_image_from_image2d(temp_ref, 2);\n\
+ Image img3 = create_image_from_image2d(temp_buf_int, 4);\n\
+ __global vxc_short8* input_ptr = (__global vxc_short8*)img1.ptr;\n\
+ __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr;\n\
+ __global float* tmp_update_ptr = (__global float*)img3.ptr;\n\
+ vxc_short8 src = input_ptr[gidx];\n\
+ float4 zeros = (float4)(0, 0, 0, 0);\n\
+ int loc2 = gidx * 8;\n\
+ output_ptr[gidx] = src;\n\
+ vstore4(zeros, 0, tmp_update_ptr + loc2);\n\
+ vstore4(zeros, 1, tmp_update_ptr + loc2);\n\
+}\n\
+\n\
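+/* Scatter quantized updates: subtract the input zero-point, then atomically add the\n\
+ * integer result into the accumulation buffer. */\n\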
+#define SCATTER_ND_UPDATE_QINT(src0_type, data_type, ptr_type, element_size) \\\n\
+__kernel void scatter_nd_update_update_##src0_type( \\\n\
+ __read_only image2d_t index, \\\n\
+ __read_only image2d_t update, \\\n\
+ image2d_t temp_buf_int, \\\n\
+ image2d_t link_buffer0, \\\n\
+ int width, int area, int vol, int val4, \\\n\
+ int val5, int val6, int val7, int coord_dim) \\\n\
+{ \\\n\
+ int gidx = get_global_id(0); \\\n\
+ int gidy = get_global_id(1); \\\n\
+ Image img1 = create_image_from_image2d(index, 4); \\\n\
+ Image img2 = create_image_from_image2d(update, element_size); \\\n\
+ Image img3 = create_image_from_image2d(temp_buf_int, 4); \\\n\
+ __global int* index_ptr = (__global int*)img1.ptr; \\\n\
+ __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \\\n\
+ __global int* output_ptr = (__global int*)img3.ptr; \\\n\
+ data_type src; \\\n\
+ \\\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\
+ ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \\\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\
+ int loc = idx * output_width + gidx; \\\n\
+ _viv_asm(COPY, src, tmpData, 4); \\\n\
+ vxc_int4 data; \\\n\
+ short zp = input_zp; \\\n\
+ VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvert1stUint8SubZpToFp32_4x4); \\\n\
+ atomic_add(output_ptr + loc, data.x); \\\n\
+}\n\
+SCATTER_ND_UPDATE_QINT(U8, vxc_uchar8, uchar, 1)\n\
+SCATTER_ND_UPDATE_QINT(I8, vxc_char8, char, 1)\n\
+SCATTER_ND_UPDATE_QINT(I16, vxc_short8, short, 2)\n\
+\n\
+#define SCATTER_ND_UPDATE_QINT_4X(src0_type, data_type, ptr_type, element_size) \\\n\
+__kernel void scatter_nd_update_update_##src0_type##_4X( \\\n\
+ __read_only image2d_t index, \\\n\
+ __read_only image2d_t update, \\\n\
+ image2d_t temp_buf_int, \\\n\
+ image2d_t link_buffer0, \\\n\
+ int width, int area, int vol, int val4, \\\n\
+ int val5, int val6, int val7, int coord_dim) \\\n\
+{ \\\n\
+ int gidx = get_global_id(0); \\\n\
+ int gidy = get_global_id(1); \\\n\
+ Image img1 = create_image_from_image2d(index, 4); \\\n\
+ Image img2 = create_image_from_image2d(update, element_size); \\\n\
+ Image img3 = create_image_from_image2d(temp_buf_int, 4); \\\n\
+ __global int* index_ptr = (__global int*)img1.ptr; \\\n\
+ __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \\\n\
+ __global int* output_ptr = (__global int*)img3.ptr; \\\n\
+ \\\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\
+ ptr_type src = update_ptr[gidy * update_width + gidx]; \\\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\
+ int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3); \\\n\
+ vxc_int4 data; \\\n\
+ short zp = input_zp; \\\n\
+ VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvert1stUint8SubZpToFp32_4x4); \\\n\
+ atomic_add(output_ptr + loc.x, data.x); \\\n\
+ atomic_add(output_ptr + loc.y, data.y); \\\n\
+ atomic_add(output_ptr + loc.z, data.z); \\\n\
+ atomic_add(output_ptr + loc.w, data.w); \\\n\
+}\n\
+SCATTER_ND_UPDATE_QINT_4X(U8, vxc_uchar8, vxc_uchar4, 1)\n\
+SCATTER_ND_UPDATE_QINT_4X(I8, vxc_char8, vxc_char4, 1)\n\
+SCATTER_ND_UPDATE_QINT_4X(I16, vxc_short8, vxc_short4, 2)\n\
+\n\
+#define SCATTER_ND_UPDATE_REF(src0_type, dst_type, data_type, ptr_type, element_size) \\\n\
+__kernel void scatter_nd_update_ref_##src0_type##to##dst_type( \\\n\
+ __read_only image2d_t index, \\\n\
+ __read_only image2d_t update, \\\n\
+ __read_only image2d_t temp_buf_int, \\\n\
+ image2d_t temp_ref, \\\n\
+ image2d_t link_buffer0, \\\n\
+ image2d_t link_buffer1, \\\n\
+ int width, int area, int vol, int val4, \\\n\
+ int val5, int val6, int val7, int coord_dim) \\\n\
+{ \\\n\
+ int gidx = get_global_id(0); \\\n\
+ int gidy = get_global_id(1); \\\n\
+ Image img1 = create_image_from_image2d(index, 4); \\\n\
+ Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\
+ Image img3 = create_image_from_image2d(temp_ref, element_size); \\\n\
+ __global int* index_ptr = (__global int*)img1.ptr; \\\n\
+ __global int* acc_ptr = (__global int*)img2.ptr; \\\n\
+ __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \\\n\
+ data_type dst; \\\n\
+ \\\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\
+ int loc = idx * output_stride + gidx; \\\n\
+ int tmpData = acc_ptr[loc]; \\\n\
+ int4 data; \\\n\
+ data.x = convert_int_rte(tmpData * inout_scale + output_zp); \\\n\
+ VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ ref_ptr[loc] = dst.x; \\\n\
+}\n\
+SCATTER_ND_UPDATE_REF(I32, U8, vxc_uchar8, uchar, 1)\n\
+SCATTER_ND_UPDATE_REF(I32, I8, vxc_char8, char, 1)\n\
+SCATTER_ND_UPDATE_REF(I32, I16, vxc_short8, short, 2)\n\
+\n\
+#define SCATTER_ND_UPDATE_REF_4X(src0_type, dst_type, data_type, ptr_type, element_size) \\\n\
+__kernel void scatter_nd_update_ref_##src0_type##to##dst_type##_4X( \\\n\
+ __read_only image2d_t index, \\\n\
+ __read_only image2d_t update, \\\n\
+ __read_only image2d_t temp_buf_int, \\\n\
+ image2d_t temp_ref, \\\n\
+ image2d_t link_buffer0, \\\n\
+ image2d_t link_buffer1, \\\n\
+ int width, int area, int vol, int val4, \\\n\
+ int val5, int val6, int val7, int coord_dim) \\\n\
+{ \\\n\
+ int gidx = get_global_id(0); \\\n\
+ int gidy = get_global_id(1); \\\n\
+ Image img1 = create_image_from_image2d(index, 4); \\\n\
+ Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\
+ Image img3 = create_image_from_image2d(temp_ref, element_size); \\\n\
+ __global int* index_ptr = (__global int*)img1.ptr; \\\n\
+ __global int* acc_ptr = (__global int*)img2.ptr; \\\n\
+ __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \\\n\
+ data_type dst; \\\n\
+ \\\n\
+ int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\
+ int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\
+ int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\
+ int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\
+ float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \\\n\
+ int loc = idx * output_stride + gidx; \\\n\
+ int4 data = convert_int4_rte(tmpData * inout_scale + output_zp); \\\n\
+ VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\
+ uniConvertInt32toUint8_2x8); \\\n\
+ ref_ptr[loc] = dst.xyzw; \\\n\
+}\n\
+SCATTER_ND_UPDATE_REF_4X(I32, U8, vxc_uchar8, vxc_uchar4, 1)\n\
+SCATTER_ND_UPDATE_REF_4X(I32, I8, vxc_char8, vxc_char4, 1)\n\
+SCATTER_ND_UPDATE_REF_4X(I32, I16, vxc_short8, vxc_short4, 2)\n\
+\n\
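+/* Final pass: copy temp_ref into the output image, with a scalar tail for the last res elements. */\n\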
+#define SCATTER_ND_UPDATE_COPY(src0_type, ptr_type, element_size, ptr_type1) \\\n\
+__kernel void scatter_nd_update_copy_##src0_type( \\\n\
+ __read_only image2d_t temp_ref, \\\n\
+ __read_only image2d_t link_buffer1, \\\n\
+ image2d_t output, \\\n\
+ int length, int res) \\\n\
+{ \\\n\
+ int gidx = get_global_id(0); \\\n\
+ Image img1 = create_image_from_image2d(temp_ref, element_size); \\\n\
+ Image img2 = create_image_from_image2d(output, element_size); \\\n\
+ __global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \\\n\
+ __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \\\n\
+ output_ptr[gidx] = input_ptr[gidx]; \\\n\
+ if(gidx < res) \\\n\
+ { \\\n\
+ __global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \\\n\
+ __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \\\n\
+ output_ptr1[length + gidx] = input_ptr1[length + gidx]; \\\n\
+ } \\\n\
+}\n\
+SCATTER_ND_UPDATE_COPY(U8, vxc_uchar8, 1, uchar)\n\
+SCATTER_ND_UPDATE_COPY(I8, vxc_char8, 1, char)\n\
+SCATTER_ND_UPDATE_COPY(I16, vxc_short8, 2, short)\n\
+SCATTER_ND_UPDATE_COPY(F16, vxc_short8, 2, short)\n\
+SCATTER_ND_UPDATE_COPY(BF16, vxc_short8, 2, short)\n\
+"; /* end of scatter_nd_update_qint_vx*/
+
static const char scatter_nd_update_special_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\
@@ -42120,7 +46848,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name##_2D( \\\n\
short zp = inputZP; \\\n\
VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
uniConvert1stUint8SubZpToFp32_4x4); \\\n\
- int index = convert_int_rte(tmpData.s0 * input_scale); \\\n\
+ int index = convert_int_rtz(tmpData.s0 * input_scale); \\\n\
int4 data; \\\n\
data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \\\n\
write_type dst; \\\n\
@@ -42146,7 +46874,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name( \\\n\
short zp = inputZP; \\\n\
VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\
uniConvert1stUint8SubZpToFp32_4x4); \\\n\
- int index = convert_int_rte(tmpData.s0 * input_scale); \\\n\
+ int index = convert_int_rtz(tmpData.s0 * input_scale); \\\n\
int4 data; \\\n\
data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \\\n\
write_type dst; \\\n\
@@ -42172,7 +46900,7 @@ __kernel void sequence_mask_F16toF16_2D(\n\
float4 tmpData;\n\
VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
UniFP16toFP32Lo4_dp4x4);\n\
- int index = convert_int_rte(tmpData.x);\n\
+ int index = convert_int_rtz(tmpData.x);\n\
float4 data;\n\
data = outIdx < index? outputVal1 : convert_float(output_ZP);\n\
vxc_short8 dst;\n\
@@ -42195,7 +46923,7 @@ __kernel void sequence_mask_F16toF16(\n\
float4 tmpData;\n\
VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
UniFP16toFP32Lo4_dp4x4);\n\
- int index = convert_int_rte(tmpData.x);\n\
+ int index = convert_int_rtz(tmpData.x);\n\
float4 data;\n\
data = outIdx < index? outputVal1 : convert_float(output_ZP);\n\
vxc_short8 dst;\n\
@@ -42218,7 +46946,7 @@ __kernel void sequence_mask_F16toU8_2D(\n\
float4 tmpData;\n\
VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
UniFP16toFP32Lo4_dp4x4);\n\
- int index = convert_int_rte(tmpData.x);\n\
+ int index = convert_int_rtz(tmpData.x);\n\
int4 data;\n\
data = outIdx < index? convert_int_rte(outputVal1) : output_ZP;\n\
vxc_uchar16 dst;\n\
@@ -42239,7 +46967,7 @@ __kernel void sequence_mask_F16toU8(\n\
float4 tmpData;\n\
VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\
UniFP16toFP32Lo4_dp4x4);\n\
- int index = convert_int_rte(tmpData.x);\n\
+ int index = convert_int_rtz(tmpData.x);\n\
int4 data;\n\
data = outIdx < index? convert_int_rte(outputVal1) : output_ZP;\n\
vxc_uchar16 dst;\n\
@@ -43211,6 +47939,167 @@ TILE_2D_MIX(U8, F16, 7, 6, vxc_uchar8, vxc_short8)\n\
TILE_2D_MIX(U8, F16, 0, 7, vxc_uchar8, vxc_short8)\n\
"; /* end of tile_mix_vx*/
+static const char tiny_yolov4_postprocess_box_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\
+#include \"cl_viv_vx_ext.h\"\n\
+\n\
+#define logE (1.44269502f)\n\
+\n\
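+/* sigmoid4/exp4 are built on exp2 by rescaling the argument with log2(e). */\n\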
+float4 sigmoid4(float4 x)\n\
+{\n\
+ x *= -logE;\n\
+ x = 1 + exp2(x);\n\
+ return 1 / x;\n\
+}\n\
+\n\
+float4 exp4(float4 x)\n\
+{\n\
+ x *= logE;\n\
+ return exp2(x);\n\
+}\n\
+\n\
+#define CONST0 (1.0499999523162842f)\n\
+#define CONST1 (0.0250000003725290f)\n\
+\n\
+_viv_uniform VXC_512Bits uniDatatoFloat32_0_4x4;\n\
+_viv_uniform VXC_512Bits uniDatatoFloat32_1_4x4;\n\
+_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\
+_viv_uniform VXC_512Bits uniDataTranspose_0_2x8;\n\
+_viv_uniform VXC_512Bits uniDataTranspose_1_2x8;\n\
+_viv_uniform float input0_scale;\n\
+_viv_uniform float input0_tail;\n\
+_viv_uniform float input1_scale;\n\
+_viv_uniform float input1_tail;\n\
+_viv_uniform float output_scale;\n\
+_viv_uniform float output_zp;\n\
+_viv_uniform float CONST2;\n\
+__kernel void tiny_yolov4_postprocess_box_U8_U8toU8\n\
+ (\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ float bias_0,\n\
+ float bias_1\n\
+ )\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0));\n\
+\n\
+ vxc_uchar16 src0, src1, src2, src3;\n\
+ VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ VXC_ReadImage(src2, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src3, input1, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ coord.zw += (int2)(2, 3);\n\
+\n\
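+    /* data0/data1: sigmoid-decoded offsets combined with input1 and scaled by CONST2;\n\
+       data2/data3: exp-decoded extents scaled by bias_0 and bias_1. */\n\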
+ float4 data0, data1, data2, data3, data;\n\
+ VXC_DP4x4(data0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\
+ data0 = data0 * input0_scale + input0_tail;\n\
+ data0 = sigmoid4(data0);\n\
+ data0 = data0 * CONST0 - CONST1;\n\
+\n\
+ VXC_DP4x4(data, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\
+ data = data * input1_scale + input1_tail;\n\
+ data0 = data0 * CONST2 + data * CONST2;\n\
+\n\
+ VXC_DP4x4(data1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4);\n\
+ data1 = data1 * input0_scale + input0_tail;\n\
+ data1 = sigmoid4(data1);\n\
+ data1 = data1 * CONST0 - CONST1;\n\
+\n\
+ VXC_DP4x4(data, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\
+ data = data * input1_scale + input1_tail;\n\
+ data1 = data1 * CONST2 + data * CONST2;\n\
+\n\
+ VXC_DP4x4(data2, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\
+ data2 = data2 * input0_scale + input0_tail;\n\
+ data2 = exp4(data2) * bias_0;\n\
+\n\
+ VXC_DP4x4(data3, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4);\n\
+ data3 = data3 * input0_scale + input0_tail;\n\
+ data3 = exp4(data3) * bias_1;\n\
+\n\
+ data0 = data0 * output_scale + output_zp;\n\
+ data1 = data1 * output_scale + output_zp;\n\
+\n\
+ int4 dst0 = convert_int4_rte(data0);\n\
+ int4 dst1 = convert_int4_rte(data1);\n\
+ VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\
+ data2 = data2 * output_scale + output_zp;\n\
+ data3 = data3 * output_scale + output_zp;\n\
+ dst0 = convert_int4_rte(data2);\n\
+ dst1 = convert_int4_rte(data3);\n\
+ VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\
+\n\
+ VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniDataTranspose_0_2x8);\n\
+ VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniDataTranspose_1_2x8);\n\
+\n\
+ VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ coord.x ++;\n\
+ VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+"; /* end of tiny_yolov4_postprocess_box_vx*/
+
+static const char tiny_yolov4_postprocess_confidence_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\
+#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniU8TimesU8_0_4x4;\n\
+_viv_uniform VXC_512Bits uniU8PlusU8_trans_0_2x8;\n\
+_viv_uniform VXC_512Bits uniU8PlusU8_trans_1_2x8;\n\
+_viv_uniform VXC_512Bits uniU16TimesMultiplier_PostShift_2x8;\n\
+_viv_uniform int output_zp;\n\
+\n\
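+/* Multiply src0 by src1..src4, requantize with a multiplier and post-shift, add the\n\
+ * output zero-point and transpose before writing. */\n\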
+__kernel void tiny_yolov4_postprocess_conf_U8toU8\n\
+(\n\
+ __read_only image2d_t input,\n\
+ __write_only image2d_t output\n\
+)\n\
+{\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, get_global_id(0));\n\
+\n\
+ vxc_uchar16 src0, src1, src2, src3, src4;\n\
+\n\
+ VXC_ReadImage(src0, input, coord.wz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ vxc_ushort8 data0, data1;\n\
+\n\
+ VXC_ReadImage(src1, input, coord.wy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src2, input, coord.wy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src3, input, coord.wy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_ReadImage(src4, input, coord.wy, VXC_5BITOFFSET_XY(0, 4), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ coord.zw = coord.xx + (int2)(2, 3);\n\
+\n\
+ VXC_DP4x4(data0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\
+ VXC_DP4x4(data0, src0, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\
+ VXC_DP4x4(data1, src0, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\
+ VXC_DP4x4(data1, src0, src4, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\
+\n\
+ VXC_DP2x8(src1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\
+ uniU16TimesMultiplier_PostShift_2x8);\n\
+ VXC_DP2x8(src1, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\
+ uniU16TimesMultiplier_PostShift_2x8);\n\
+\n\
+ uchar zp;\n\
+ _viv_asm(COPY, zp, output_zp, 2);\n\
+\n\
+ VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\
+ uniU8PlusU8_trans_0_2x8);\n\
+ VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\
+ uniU8PlusU8_trans_1_2x8);\n\
+\n\
+ VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\
+ coord.x ++;\n\
+ VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+"; /* end of tiny_yolov4_postprocess_confidence_vx*/
+
static const char upsample_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
\n\
_viv_uniform VXC_512Bits uniF16MulMultipiler_PostShft_2x8;\n\
@@ -49204,6 +54093,8 @@ static const char gather_cl[] = "__kernel void gather_U8toU8(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
uint4 data = read_imageui(input0, coord_in.zw);\n\
@@ -49229,6 +54120,8 @@ __kernel void gather_F16toF16(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
float4 data = read_imagef(input0, coord_in.zw);\n\
@@ -49254,6 +54147,8 @@ __kernel void gather_I32toI32(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
int4 data = read_imagei(input0, coord_in.zw);\n\
@@ -49279,6 +54174,8 @@ __kernel void gather_F32toF32(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
float4 data = read_imagef(input0, coord_in.zw);\n\
@@ -49305,6 +54202,7 @@ static const char gather_array_cl[] = "__kernel void gather_array_U8toU8(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
Image img1 = create_image_from_image2d(input0, 1);\n\
@@ -49333,6 +54231,7 @@ __kernel void gather_array_F16toF16(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
Image img1 = create_image_from_image2d(input0, 2);\n\
@@ -49361,6 +54260,7 @@ __kernel void gather_array_I32toI32(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
Image img1 = create_image_from_image2d(input0, 4);\n\
@@ -49389,6 +54289,7 @@ __kernel void gather_array_F32toF32(\n\
\n\
int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\
int4 indice = read_imagei(input1, coord_in.xy);\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.w = gidz * axis_num + indice.x;\n\
\n\
Image img1 = create_image_from_image2d(input0, 4);\n\
@@ -49423,6 +54324,7 @@ static const char gather_batch_cl[] = "__kernel void gather_batch_U8toU8(\n\
{\n\
int4 indice = read_imagei(input1, coord_idx);\n\
coord_idx.y++;\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.y = gidz * axis_num + indice.x;\n\
\n\
uint4 data = read_imageui(input0, coord_in);\n\
@@ -49454,6 +54356,7 @@ __kernel void gather_batch_F16toF16(\n\
{\n\
int4 indice = read_imagei(input1, coord_idx);\n\
coord_idx.y++;\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.y = gidz * axis_num + indice.x;\n\
\n\
float4 data = read_imagef(input0, coord_in);\n\
@@ -49485,6 +54388,7 @@ __kernel void gather_batch_I32toI32(\n\
{\n\
int4 indice = read_imagei(input1, coord_idx);\n\
coord_idx.y++;\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.y = gidz * axis_num + indice.x;\n\
\n\
int4 data = read_imagei(input0, coord_in);\n\
@@ -49516,6 +54420,7 @@ __kernel void gather_batch_F32toF32(\n\
{\n\
int4 indice = read_imagei(input1, coord_idx);\n\
coord_idx.y++;\n\
+ indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\
coord_in.y = gidz * axis_num + indice.x;\n\
\n\
float4 data = read_imagef(input0, coord_in);\n\
@@ -49526,7 +54431,15 @@ __kernel void gather_batch_F32toF32(\n\
}\n\
"; /* end of gather_batch_cl*/
-static const char gather_elements_cl[] = "\n\
+static const char gather_elements_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\
+\n\
+_viv_uniform uint width0;\n\
+_viv_uniform uint height0;\n\
+_viv_uniform uint width1;\n\
+_viv_uniform uint height1;\n\
+_viv_uniform uint width_out;\n\
+_viv_uniform uint height_out;\n\
+\n\
#define GATHER_ELEMENTS_AXIS0_2D(name, data_type, read_func, write_func, conv_func) \\\n\
__kernel void gather_elements_axis0_##name##_I32to##name##_2D \\\n\
( \\\n\
@@ -49661,6 +54574,162 @@ __kernel void gather_elements_axis2_##name##_I32to##name \\\n\
GATHER_ELEMENTS_AXIS2(F32, float4, read_imagef, write_imagef, convert_float4)\n\
GATHER_ELEMENTS_AXIS2(I32, int4, read_imagei, write_imagei, convert_int4_rte)\n\
GATHER_ELEMENTS_AXIS2(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)\n\
+\n\
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \\\n\
+__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float input_scale, \\\n\
+ float input_tail, \\\n\
+ int axis_size \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\
+ int* index_ptr = (int*)index_tensor.ptr; \\\n\
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\
+ \\\n\
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\
+ data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \\\n\
+ \\\n\
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\
+}\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F32, float, float*, 4)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I32, int, int*, 4)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1)\n\
+\n\
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \\\n\
+__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float input_scale, \\\n\
+ float input_tail, \\\n\
+ int axis_size \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\
+ int* index_ptr = (int*)index_tensor.ptr; \\\n\
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\
+ \\\n\
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\
+ data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \\\n\
+ \\\n\
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\
+}\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F32, float, float*, 4)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I32, int, int*, 4)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1)\n\
+\n\
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \\\n\
+__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float input_scale, \\\n\
+ float input_tail, \\\n\
+ int axis_size \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\
+ Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\
+ int* index_ptr = (int*)index_tensor.ptr; \\\n\
+ int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\
+ \\\n\
+ Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\
+ data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\
+ data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \\\n\
+ \\\n\
+ Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\
+ data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\
+ output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\
+}\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F32, float, float*, 4)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I32, int, int*, 4)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1)\n\
+\n\
+\n\
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \\\n\
+__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_t input0, \\\n\
+ __read_only image2d_t input1, \\\n\
+ __write_only image2d_t output, \\\n\
+ float input_scale, \\\n\
+ float input_tail, \\\n\
+ int axis_size \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ Image index_img = create_image_from_image2d(input1, 4); \\\n\
+ int* index_ptr = (int*)index_img.ptr; \\\n\
+ int index = index_ptr[coord.x + coord.y * width1]; \\\n\
+ \\\n\
+ Image input_img = create_image_from_image2d(input0, stride); \\\n\
+ data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\
+ data_type data = input_ptr[index + coord.y * width0]; \\\n\
+ \\\n\
+ Image output_img = create_image_from_image2d(output, stride); \\\n\
+ data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\
+ output_ptr[coord.x + coord.y * width_out] = data; \\\n\
+}\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F32, float, float*, 4)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I32, int, int*, 4)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, uchar, uchar*, 1)\n\
+\n\
+#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \\\n\
+__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_t input0, \\\n\
+ __read_only image2d_t input1, \\\n\
+ __write_only image2d_t output, \\\n\
+ float input_scale, \\\n\
+ float input_tail, \\\n\
+ int axis_size \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ Image index_img = create_image_from_image2d(input1, 4); \\\n\
+ int* index_ptr = (int*)index_img.ptr; \\\n\
+ int index = index_ptr[coord.x + coord.y * width1]; \\\n\
+ \\\n\
+ Image input_img = create_image_from_image2d(input0, stride); \\\n\
+ data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\
+ data_type data = input_ptr[coord.x + index * width0]; \\\n\
+ \\\n\
+ Image output_img = create_image_from_image2d(output, stride); \\\n\
+ data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\
+ output_ptr[coord.x + coord.y * width_out] = data; \\\n\
+}\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F32, float, float*, 4)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I32, int, int*, 4)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1)\n\
+GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1)\n\
"; /* end of gather_elements_cl*/
static const char gather_nd_cl[] = "__kernel void gather_nd_U8toU8_1D(\n\
@@ -49919,127 +54988,136 @@ __kernel void gather_nd_F32toF32_3D(\n\
static const char gather_nd_batch_cl[] = "__kernel void gather_nd_batch_U8toU8_1D(\n\
__read_only image2d_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch_num\n\
+ int gidy = get_global_id(1); // index_num\n\
+ int gidz = get_global_id(2); // batch_num\n\
\n\
- int4 coord = (int4)(gidx, gidy, 0, 0);\n\
- int4 indice = read_imagei(input1, coord.wy);\n\
- coord.z = indice.x * block_size + gidx;\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0);\n\
+ int4 indice = read_imagei(input1, coord.wyzw);\n\
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\
\n\
- uint4 data = read_imageui(input0, coord.zy);\n\
- write_imageui(output, coord.xy, data);\n\
+ uint4 data = read_imageui(input0, coord0);\n\
+ write_imageui(output, coord, data);\n\
}\n\
\n\
__kernel void gather_nd_batch_F16toF16_1D(\n\
__read_only image2d_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch_num\n\
+ int gidy = get_global_id(1); // index_num\n\
+ int gidz = get_global_id(2); // batch_num\n\
\n\
- int4 coord = (int4)(gidx, gidy, 0, 0);\n\
- int4 indice = read_imagei(input1, coord.wy);\n\
- coord.z = indice.x * block_size + gidx;\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0);\n\
+ int4 indice = read_imagei(input1, coord.wyzw);\n\
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\
\n\
- float4 data = read_imagef(input0, coord.zy);\n\
- write_imagef(output, coord.xy, data);\n\
+ float4 data = read_imagef(input0, coord0);\n\
+ write_imagef(output, coord, data);\n\
}\n\
\n\
__kernel void gather_nd_batch_I8toI8_1D(\n\
__read_only image2d_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch_num\n\
+ int gidy = get_global_id(1); // index_num\n\
+ int gidz = get_global_id(2); // batch_num\n\
\n\
- int4 coord = (int4)(gidx, gidy, 0, 0);\n\
- int4 indice = read_imagei(input1, coord.wy);\n\
- coord.z = indice.x * block_size + gidx;\n\
+ int4 coord = (int4)(gidx, gidy, gidz, 0);\n\
+ int4 indice = read_imagei(input1, coord.wyzw);\n\
+ int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\
\n\
- int4 data = read_imagei(input0, coord.zy);\n\
- write_imagei(output, coord.xy, data);\n\
+ int4 data = read_imagei(input0, coord0);\n\
+ write_imagei(output, coord, data);\n\
}\n\
\n\
//2D\n\
__kernel void gather_nd_batch_U8toU8_2D(\n\
__read_only image2d_array_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch_num\n\
+ int gidy = get_global_id(1); // index_num\n\
+ int gidz = get_global_id(2); // batch_num\n\
\n\
- int4 coord = (int4)(0, gidy, gidx, 1);\n\
- int4 indice = read_imagei(input1, coord.xy);\n\
- int4 indice1 = read_imagei(input1, coord.wy);\n\
+ int4 coord = (int4)(1, gidy, gidz, 0);\n\
+ int4 indice = read_imagei(input1, coord.wyzw);\n\
+ int4 indice1 = read_imagei(input1, coord.xyzw);\n\
indice.x = indice.x * block_size + gidx;\n\
indice.y = indice1.x;\n\
- indice.zw = coord.yx;\n\
+ indice.zw = coord.zw;\n\
\n\
uint4 data = read_imageui(input0, indice);\n\
- write_imageui(output, coord.zy, data);\n\
+ coord.x = gidx;\n\
+ write_imageui(output, coord, data);\n\
}\n\
\n\
__kernel void gather_nd_batch_F16toF16_2D(\n\
__read_only image2d_array_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch_num\n\
+ int gidy = get_global_id(1); // index_num\n\
+ int gidz = get_global_id(2); // batch_num\n\
\n\
- int4 coord = (int4)(0, gidy, gidx, 1);\n\
- int4 indice = read_imagei(input1, coord.xy);\n\
- int4 indice1 = read_imagei(input1, coord.wy);\n\
+ int4 coord = (int4)(1, gidy, gidz, 0);\n\
+ int4 indice = read_imagei(input1, coord.wyzw);\n\
+ int4 indice1 = read_imagei(input1, coord.xyzw);\n\
indice.x = indice.x * block_size + gidx;\n\
indice.y = indice1.x;\n\
- indice.zw = coord.yx;\n\
+ indice.zw = coord.zw;\n\
\n\
float4 data = read_imagef(input0, indice);\n\
- write_imagef(output, coord.zy, data);\n\
+ coord.x = gidx;\n\
+ write_imagef(output, coord, data);\n\
}\n\
\n\
__kernel void gather_nd_batch_I8toI8_2D(\n\
__read_only image2d_array_t input0,\n\
- __read_only image2d_t input1,\n\
- __write_only image2d_t output,\n\
+ __read_only image2d_array_t input1,\n\
+ __write_only image2d_array_t output,\n\
int block_size,\n\
int coord_dim\n\
)\n\
{\n\
int gidx = get_global_id(0); // block_size\n\
- int gidy = get_global_id(1); // batch_num\n\
+ int gidy = get_global_id(1); // index_num\n\
+ int gidz = get_global_id(2); // batch_num\n\
\n\
- int4 coord = (int4)(0, gidy, gidx, 1);\n\
- int4 indice = read_imagei(input1, coord.xy);\n\
- int4 indice1 = read_imagei(input1, coord.wy);\n\
+ int4 coord = (int4)(1, gidy, gidz, 0);\n\
+ int4 indice = read_imagei(input1, coord.wyzw);\n\
+ int4 indice1 = read_imagei(input1, coord.xyzw);\n\
indice.x = indice.x * block_size + gidx;\n\
indice.y = indice1.x;\n\
- indice.zw = coord.yx;\n\
+ indice.zw = coord.zw;\n\
\n\
int4 data = read_imagei(input0, indice);\n\
- write_imagei(output, coord.zy, data);\n\
+ coord.x = gidx;\n\
+ write_imagei(output, coord, data);\n\
}\n\
"; /* end of gather_nd_batch_cl*/
@@ -57045,6 +62123,103 @@ GEMM_TRANSB_3D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\
\n\
"; /* end of matrixmul_cl*/
+static const char matrixmul_cross_cl[] = "__kernel void gemm_F32F32toF32_merge(\n\
+ __read_only image2d_array_t inputA,\n\
+ __read_only image2d_array_t inputB,\n\
+ __write_only image2d_array_t output,\n\
+ int M,\n\
+ int K,\n\
+ int N,\n\
+ int ac2zero,\n\
+ int bc2zero,\n\
+ float scale_a,\n\
+ float zp_a,\n\
+ float scale_b,\n\
+ float zp_b,\n\
+ float scale_out,\n\
+ float zp_out,\n\
+ int outer)\n\
+{\n\
+ for(int i = 0; i < outer; i++)\n\
+ {\n\
+ int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0);\n\
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0);\n\
+\n\
+ float4 sum = (float4)(0);\n\
+\n\
+ for(; coord_a.x < K;)\n\
+ {\n\
+ float4 tempA0;\n\
+ float4 tempB0;\n\
+\n\
+ tempA0 = read_imagef(inputA, coord_a);\n\
+ tempB0 = read_imagef(inputB, coord_b);\n\
+ coord_a.x++;\n\
+ coord_b.y++;\n\
+\n\
+ sum = sum + tempA0 * tempB0;\n\
+ }\n\
+\n\
+ coord_b.y = get_global_id(1);\n\
+ coord_b.z = get_global_id(2) + i * get_global_size(2);\n\
+ write_imagef(output, coord_b, sum);\n\
+ }\n\
+}\n\
+\n\
+#define GEMM_MERGE(name, dst_type, read_image_type, convert_type, write_image_type) \\\n\
+__kernel void gemm_##name##_merge( \\\n\
+ __read_only image2d_array_t inputA, \\\n\
+ __read_only image2d_array_t inputB, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int M, \\\n\
+ int K, \\\n\
+ int N, \\\n\
+ int ac2zero, \\\n\
+ int bc2zero, \\\n\
+ float scale_a, \\\n\
+ float zp_a, \\\n\
+ float scale_b, \\\n\
+ float zp_b, \\\n\
+ float scale_out, \\\n\
+ float zp_out, \\\n\
+ int outer) \\\n\
+{ \\\n\
+ for(int i = 0; i < outer; i++) \\\n\
+ { \\\n\
+ int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \\\n\
+ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); \\\n\
+ float4 sum = (float4)(0); \\\n\
+ dst_type dst; \\\n\
+ \\\n\
+ for(; coord_a.x < K;) \\\n\
+ { \\\n\
+ float4 tempA0; \\\n\
+ float4 tempB0; \\\n\
+ \\\n\
+ tempA0 = convert_float4(read_image_type(inputA, coord_a)); \\\n\
+ tempB0 = convert_float4(read_image_type(inputB, coord_b)); \\\n\
+ tempA0.x = (tempA0.x - zp_a) * scale_a; \\\n\
+ tempB0.x = (tempB0.x - zp_b) * scale_b; \\\n\
+ \\\n\
+ coord_a.x++; \\\n\
+ coord_b.y++; \\\n\
+ \\\n\
+ sum = sum + tempA0 * tempB0; \\\n\
+ } \\\n\
+ sum.x = sum.x * scale_out + zp_out; \\\n\
+ dst = convert_type(sum); \\\n\
+ \\\n\
+ coord_b.y = get_global_id(1); \\\n\
+ coord_b.z = get_global_id(2) + i * get_global_size(2); \\\n\
+ write_image_type(output, coord_b, dst); \\\n\
+ } \\\n\
+}\n\
+GEMM_MERGE(I8I8toI8,int4,read_imagei,convert_int4,write_imagei);\n\
+GEMM_MERGE(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui);\n\
+GEMM_MERGE(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\
+\n\
+"; /* end of matrixmul_cross_cl*/
+
static const char matrixmul_transA_cl[] = "__kernel void gemm_transa_F32F32toF32_2D(\n\
__read_only image2d_t inputA,\n\
__read_only image2d_t inputB,\n\
@@ -59324,6 +64499,85 @@ __kernel void moments_axis2_BF16toF32(\n\
}\n\
"; /* end of moments_axis2_cl*/
+static const char nearest_grid_sample_cl[] = "__kernel void nearest_grid_sample_F32_F32toF32(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ float half_input0_w,\n\
+ float half_input0_h,\n\
+ float add_float_value_w,\n\
+ float add_float_value_h,\n\
+ int depth\n\
+ )\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\
+\n\
+ float fx = read_imagef(input1, coord_in1).x;\n\
+ coord_in1.x = coord_in1.x + 1;\n\
+ float fy = read_imagef(input1, coord_in1).x;\n\
+\n\
+ fx = fx * half_input0_w + add_float_value_w;\n\
+ fy = fy * half_input0_h + add_float_value_h;\n\
+ int x_index = convert_int(fx);\n\
+ int y_index = convert_int(fy);\n\
+ int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\
+\n\
+ float4 dst;\n\
+\n\
+ while (coord_in.z < depth){\n\
+ dst = read_imagef(input0, coord_in);\n\
+ write_imagef(output, coord_out, dst);\n\
+ coord_in.z++;\n\
+ coord_out.z++;\n\
+ }\n\
+}\n\
+\n\
+\n\
+__kernel void nearest_grid_sample_U8_U8toU8(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_array_t output,\n\
+ float half_input0_w,\n\
+ float half_input0_h,\n\
+ float add_float_value_w,\n\
+ float add_float_value_h,\n\
+ int depth,\n\
+ float in0_scale,\n\
+ float in0_tail,\n\
+ float in1_scale,\n\
+ float in1_tail,\n\
+ float out_scale,\n\
+ float out_tail\n\
+ )\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\
+ int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\
+\n\
+ float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\
+ coord_in1.x = coord_in1.x + 1;\n\
+ float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\
+\n\
+ fx = fx * half_input0_w + add_float_value_w;\n\
+ fy = fy * half_input0_h + add_float_value_h;\n\
+ int x_index = convert_int(fx);\n\
+ int y_index = convert_int(fy);\n\
+ int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\
+\n\
+ float4 val;\n\
+ uint4 dst;\n\
+\n\
+ while (coord_in.z < depth){\n\
+ val = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\
+ dst = convert_uint4_rte(val * out_scale + out_tail);\n\
+ write_imageui(output, coord_out, dst);\n\
+ coord_in.z++;\n\
+ coord_out.z++;\n\
+ }\n\
+\n\
+}\n\
+"; /* end of nearest_grid_sample_cl*/
+
static const char one_hot_cl[] = "__kernel void one_hot_F32toF32\n\
(\n\
__read_only image2d_t input,\n\
@@ -62168,6 +67422,290 @@ __kernel void resize_1d_nearest_U8toU8(\n\
}\n\
"; /* end of resize_1d_nearest_cl*/
+static const char resize_3d_bilinear_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\
+\n\
+#define RESIZE_3D(in_name, out_name, read_image_type, dst_type, convert_type, write_image_type) \\\n\
+__kernel void resize_3d_bilinear_##in_name##to##out_name( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float scale_x, \\\n\
+ float scale_y, \\\n\
+ float scale_z, \\\n\
+ float half_pixel_value, \\\n\
+ uint in_width, \\\n\
+ uint in_height, \\\n\
+ uint in_depth, \\\n\
+ float in_scale, \\\n\
+ float in_tail, \\\n\
+ float out_scale, \\\n\
+ float out_tail \\\n\
+ ) \\\n\
+{ \\\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\
+ float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; \\\n\
+ float left_x_f = fmax(floor(in_x), 0); \\\n\
+ float x_lerp = in_x - left_x_f; \\\n\
+ int left_x_idx = convert_int(left_x_f); \\\n\
+ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; \\\n\
+ float top_y_f = fmax(floor(in_y), 0); \\\n\
+ float y_lerp = in_y - top_y_f; \\\n\
+ int top_y_idx = convert_int(top_y_f); \\\n\
+ float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value; \\\n\
+ float front_z_f = fmax(floor(in_z), 0); \\\n\
+ float z_lerp = in_z - front_z_f; \\\n\
+ int front_z_idx = convert_int(front_z_f); \\\n\
+ int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0); \\\n\
+ float4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111; \\\n\
+ dst_type dst; \\\n\
+ \\\n\
+ int dx, dy, dz; \\\n\
+ dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0); \\\n\
+ dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0); \\\n\
+ dz = in_z < 0 ? 0 : (front_z_idx < in_depth - 1 ? 1 : 0); \\\n\
+ \\\n\
+ data_000 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\
+ coord_in.y = coord_in.y + dy; \\\n\
+ data_010 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\
+ coord_in.x = coord_in.x + dx; \\\n\
+ data_110 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\
+ coord_in.y = coord_in.y - dy; \\\n\
+ data_100 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\
+ coord_in.z = coord_in.z + dz; \\\n\
+ data_101 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\
+ coord_in.y = coord_in.y + dy; \\\n\
+ data_111 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\
+ coord_in.x = coord_in.x - dx; \\\n\
+ data_011 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\
+ coord_in.y = coord_in.y - dy; \\\n\
+ data_001 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\
+ \\\n\
+ data_000 = data_000 + (data_100 - data_000) * x_lerp; \\\n\
+ data_010 = data_010 + (data_110 - data_010) * x_lerp; \\\n\
+ data_000 = data_000 + (data_010 - data_000) * y_lerp; \\\n\
+ \\\n\
+ data_001 = data_001 + (data_101 - data_001) * x_lerp; \\\n\
+ data_011 = data_011 + (data_111 - data_011) * x_lerp; \\\n\
+ data_001 = data_001 + (data_011 - data_001) * y_lerp; \\\n\
+ data_000 = data_000 + (data_001 - data_000) * z_lerp; \\\n\
+ \\\n\
+ dst = convert_type(data_000 * out_scale + out_tail); \\\n\
+ \\\n\
+ write_image_type(output, coord_out, dst); \\\n\
+}\n\
+RESIZE_3D(F32, F32, read_imagef, float4, convert_float4, write_imagef)\n\
+RESIZE_3D(F32, U8, read_imagef, uint4, convert_uint4, write_imageui)\n\
+RESIZE_3D(U8, F32, read_imageui, float4, convert_float4, write_imagef)\n\
+RESIZE_3D(U8, U8, read_imageui, uint4, convert_uint4, write_imageui)\n\
+RESIZE_3D(I8, I8, read_imagei, int4, convert_int4, write_imagei)\n\
+\n\
+__kernel void resize_3d_bilinear_BF16toBF16(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float scale_x,\n\
+ float scale_y,\n\
+ float scale_z,\n\
+ float half_pixel_value,\n\
+ uint in_width,\n\
+ uint in_height,\n\
+ uint in_depth,\n\
+ float in_scale,\n\
+ float in_tail,\n\
+ float out_scale,\n\
+ float out_tail\n\
+ )\n\
+{\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\
+ float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;\n\
+ float left_x_f = fmax(floor(in_x), 0);\n\
+ float x_lerp = in_x - left_x_f;\n\
+ int left_x_idx = convert_int(left_x_f);\n\
+ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value;\n\
+ float top_y_f = fmax(floor(in_y), 0);\n\
+ float y_lerp = in_y - top_y_f;\n\
+ int top_y_idx = convert_int(top_y_f);\n\
+ float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value;\n\
+ float front_z_f = fmax(floor(in_z), 0);\n\
+ float z_lerp = in_z - front_z_f;\n\
+ int front_z_idx = convert_int(front_z_f);\n\
+ int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0);\n\
+ uint4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111;\n\
+ float4 data_000_f, data_100_f, data_010_f, data_110_f, data_001_f, data_011_f, data_101_f, data_111_f;\n\
+ uint4 dst;\n\
+\n\
+ int dx, dy, dz;\n\
+ dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0);\n\
+ dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0);\n\
+ dz = in_z < 0 ? 0 : (front_z_idx < in_depth - 1 ? 1 : 0);\n\
+\n\
+ data_000 = read_imageui(input, coord_in);\n\
+ data_000 = data_000 << 16;\n\
+ coord_in.y = coord_in.y + dy;\n\
+ data_010 = read_imageui(input, coord_in);\n\
+ data_010 = data_010 << 16;\n\
+ coord_in.x = coord_in.x + dx;\n\
+ data_110 = read_imageui(input, coord_in);\n\
+ data_110 = data_110 << 16;\n\
+ coord_in.y = coord_in.y - dy;\n\
+ data_100 = read_imageui(input, coord_in);\n\
+ data_100 = data_100 << 16;\n\
+ coord_in.z = coord_in.z + dz;\n\
+ data_101 = read_imageui(input, coord_in);\n\
+ data_101 = data_101 << 16;\n\
+ coord_in.y = coord_in.y + dy;\n\
+ data_111 = read_imageui(input, coord_in);\n\
+ data_111 = data_111 << 16;\n\
+ coord_in.x = coord_in.x - dx;\n\
+ data_011 = read_imageui(input, coord_in);\n\
+ data_011 = data_011 << 16;\n\
+ coord_in.y = coord_in.y - dy;\n\
+ data_001 = read_imageui(input, coord_in);\n\
+ data_001 = data_001 << 16;\n\
+\n\
+ _viv_asm(COPY, data_000_f, data_000, 16);\n\
+ _viv_asm(COPY, data_010_f, data_010, 16);\n\
+ _viv_asm(COPY, data_110_f, data_110, 16);\n\
+ _viv_asm(COPY, data_100_f, data_100, 16);\n\
+ _viv_asm(COPY, data_101_f, data_101, 16);\n\
+ _viv_asm(COPY, data_111_f, data_111, 16);\n\
+ _viv_asm(COPY, data_011_f, data_011, 16);\n\
+ _viv_asm(COPY, data_001_f, data_001, 16);\n\
+\n\
+ data_000_f = data_000_f + (data_100_f - data_000_f) * x_lerp;\n\
+ data_010_f = data_010_f + (data_110_f - data_010_f) * x_lerp;\n\
+ data_000_f = data_000_f + (data_010_f - data_000_f) * y_lerp;\n\
+\n\
+ data_001_f = data_001_f + (data_101_f - data_001_f) * x_lerp;\n\
+ data_011_f = data_011_f + (data_111_f - data_011_f) * x_lerp;\n\
+ data_001_f = data_001_f + (data_011_f - data_001_f) * y_lerp;\n\
+ data_000_f = data_000_f + (data_001_f - data_000_f) * z_lerp;\n\
+\n\
+ _viv_asm(COPY, dst, data_000_f, 16);\n\
+ dst = dst >> 16;\n\
+ write_imageui(output, coord_out, dst);\n\
+}\n\
+"; /* end of resize_3d_bilinear_cl*/
+
+static const char resize_3d_nearest_cl[] = "\n\
+#define NEAREST_INDEX_PROCESS() \\\n\
+ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\
+ float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x + round_value; \\\n\
+ int in_x_idx = convert_int(in_x); \\\n\
+ float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y + round_value; \\\n\
+ int in_y_idx = convert_int(in_y); \\\n\
+ float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z + round_value; \\\n\
+ int in_z_idx = convert_int(in_z); \\\n\
+\n\
+__kernel void resize_3d_nearest_F32toF32(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float scale_x,\n\
+ float scale_y,\n\
+ float scale_z,\n\
+ float half_pixel_value,\n\
+ float round_value,\n\
+ float output_scale,\n\
+ float output_tail)\n\
+{\n\
+ NEAREST_INDEX_PROCESS()\n\
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\
+ float4 dst;\n\
+ dst = read_imagef(input, coord_in);\n\
+ write_imagef(output, coord_out, dst);\n\
+}\n\
+\n\
+\n\
+__kernel void resize_3d_nearest_U8toU8(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float scale_x,\n\
+ float scale_y,\n\
+ float scale_z,\n\
+ float half_pixel_value,\n\
+ float round_value,\n\
+ float output_scale,\n\
+ float output_tail)\n\
+{\n\
+ NEAREST_INDEX_PROCESS()\n\
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\
+ uint4 dst;\n\
+ dst = convert_uint4(convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail);\n\
+ write_imageui(output, coord_out, dst);\n\
+}\n\
+\n\
+__kernel void resize_3d_nearest_U8toF32(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float scale_x,\n\
+ float scale_y,\n\
+ float scale_z,\n\
+ float half_pixel_value,\n\
+ float round_value,\n\
+ float output_scale,\n\
+ float output_tail)\n\
+{\n\
+ NEAREST_INDEX_PROCESS()\n\
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\
+ float4 dst;\n\
+ dst = convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail;\n\
+ write_imagef(output, coord_out, dst);\n\
+}\n\
+\n\
+__kernel void resize_3d_nearest_F32toU8(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float scale_x,\n\
+ float scale_y,\n\
+ float scale_z,\n\
+ float half_pixel_value,\n\
+ float round_value,\n\
+ float output_scale,\n\
+ float output_tail)\n\
+{\n\
+ NEAREST_INDEX_PROCESS()\n\
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\
+ uint4 dst;\n\
+ dst = convert_uint4(read_imagef(input, coord_in) * output_scale + output_tail);\n\
+ write_imageui(output, coord_out, dst);\n\
+}\n\
+\n\
+__kernel void resize_3d_nearest_I8toI8(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float scale_x,\n\
+ float scale_y,\n\
+ float scale_z,\n\
+ float half_pixel_value,\n\
+ float round_value,\n\
+ float output_scale,\n\
+ float output_tail)\n\
+{\n\
+ NEAREST_INDEX_PROCESS()\n\
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\
+ int4 dst;\n\
+ dst = convert_int4(convert_float4(read_imagei(input, coord_in)) * output_scale);\n\
+ write_imagei(output, coord_out, dst);\n\
+}\n\
+\n\
+__kernel void resize_3d_nearest_BF16toBF16(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ float scale_x,\n\
+ float scale_y,\n\
+ float scale_z,\n\
+ float half_pixel_value,\n\
+ float round_value,\n\
+ float output_scale,\n\
+ float output_tail)\n\
+{\n\
+ NEAREST_INDEX_PROCESS()\n\
+ int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\
+ uint4 dst;\n\
+ dst = read_imageui(input, coord_in);\n\
+ write_imageui(output, coord_out, dst);\n\
+}\n\
+\n\
+"; /* end of resize_3d_nearest_cl*/
+
static const char resize_bilinear_cl[] = "__kernel void resize_bilinear_F32toF32(\n\
__read_only image2d_array_t input,\n\
__write_only image2d_array_t output,\n\
@@ -64556,7 +70094,7 @@ __kernel void swish_F32toU8_2D(\n\
}"; /* end of swish_cl*/
static const char tile_cl[] = "\n\
-#define TILE_3D(name0, name1, data_type, read_image_func, write_image_func) \\\n\
+#define TILE_3D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \\\n\
__kernel void tile_##name0##to##name1 \\\n\
( \\\n\
__read_only image2d_array_t input, \\\n\
@@ -64567,7 +70105,9 @@ __kernel void tile_##name0##to##name1 \\\n\
int multiples_0, \\\n\
int multiples_1, \\\n\
int multiples_2, \\\n\
- int multiples_3 \\\n\
+ int multiples_3, \\\n\
+ float inoutscale, \\\n\
+ float inouttail \\\n\
) \\\n\
{ \\\n\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\
@@ -64575,7 +70115,9 @@ __kernel void tile_##name0##to##name1 \\\n\
int width = get_image_width(input); \\\n\
int height = get_image_height(input); \\\n\
\\\n\
- data_type src; \\\n\
+ src_type src; \\\n\
+ dst_type dst; \\\n\
+ \\\n\
read_image_func(src, input, coord); \\\n\
\\\n\
int batch_id = (short)coord.z / (short)depthIn; \\\n\
@@ -64597,17 +70139,19 @@ __kernel void tile_##name0##to##name1 \\\n\
for (int x = 0; x < multiples_0; x++) \\\n\
{ \\\n\
coord_out.x = coord.x + x * width; \\\n\
- write_image_func(output, coord_out.xyzw, src); \\\n\
+ dst = conv_type(convert_float4(src) * inoutscale + inouttail); \\\n\
+ write_image_func(output, coord_out.xyzw, dst); \\\n\
} \\\n\
} \\\n\
} \\\n\
} \\\n\
}\n\
-TILE_3D(I32, I32, int4, READ_IMAGEI_2DARRAY, write_imagei)\n\
-TILE_3D(U32, U32, uint4, READ_IMAGEUI_2DARRAY, write_imageui)\n\
-TILE_3D(F32, F32, float4, READ_IMAGEF_2DARRAY, write_imagef)\n\
+TILE_3D(I32, I32, int4, int4, convert_int4_rte, READ_IMAGEI_2DARRAY, write_imagei)\n\
+TILE_3D(U32, U32, uint4, uint4, convert_uint4_rte, READ_IMAGEUI_2DARRAY, write_imageui)\n\
+TILE_3D(F32, F32, float4, float4, convert_float4_rte, READ_IMAGEF_2DARRAY, write_imagef)\n\
+TILE_3D(F32, U32, float4, uint4, convert_uint4_rte, READ_IMAGEF_2DARRAY, write_imageui)\n\
\n\
-#define TILE_2D(name0, name1, data_type, read_image_func, write_image_func) \\\n\
+#define TILE_2D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \\\n\
__kernel void tile_##name0##to##name1##_2D \\\n\
( \\\n\
__read_only image2d_t input, \\\n\
@@ -64618,7 +70162,9 @@ __kernel void tile_##name0##to##name1##_2D \\\n\
int multiples_0, \\\n\
int multiples_1, \\\n\
int multiples_2, \\\n\
- int multiples_3 \\\n\
+ int multiples_3, \\\n\
+ float inoutscale, \\\n\
+ float inouttail \\\n\
) \\\n\
{ \\\n\
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
@@ -64627,22 +70173,25 @@ __kernel void tile_##name0##to##name1##_2D \\\n\
int output_width = get_image_width(output); \\\n\
int output_height = get_image_height(output); \\\n\
\\\n\
- data_type src = read_image_func(input, coord); \\\n\
+ src_type src = read_image_func(input, coord); \\\n\
+ dst_type dst; \\\n\
\\\n\
do \\\n\
{ \\\n\
do \\\n\
{ \\\n\
- write_image_func(output, coord, src); \\\n\
+ dst = conv_type(convert_float4(src) * inoutscale + inouttail); \\\n\
+ write_image_func(output, coord, dst); \\\n\
coord.x += width; \\\n\
} while (coord.x < output_width); \\\n\
coord.x = get_global_id(0); \\\n\
coord.y += height; \\\n\
} while (coord.y < output_height); \\\n\
}\n\
-TILE_2D(I32, I32, int4, read_imagei, write_imagei)\n\
-TILE_2D(U32, U32, uint4, read_imageui, write_imageui)\n\
-TILE_2D(F32, F32, float4, read_imagef, write_imagef)\n\
+TILE_2D(I32, I32, int4, int4, convert_int4_rte, read_imagei, write_imagei)\n\
+TILE_2D(U32, U32, uint4, uint4, convert_uint4_rte, read_imageui, write_imageui)\n\
+TILE_2D(F32, F32, float4, float4, convert_float4_rte, read_imagef, write_imagef)\n\
+TILE_2D(F32, U32, float4, uint4, convert_uint4_rte, read_imagef, write_imageui)\n\
\n\
\n\
\n\
@@ -65903,9 +71452,13 @@ static const source_map_t evis_resource[] =
{"cumsum_vx", cumsum_vx},
{"cumsum_2d_vx", cumsum_2d_vx},
{"cumsum_bf16_vx", cumsum_bf16_vx},
+ {"cumsum_ex_rev_axis0_vx", cumsum_ex_rev_axis0_vx},
+ {"cumsum_ex_rev_axis1_vx", cumsum_ex_rev_axis1_vx},
+ {"cumsum_ex_rev_axis2_vx", cumsum_ex_rev_axis2_vx},
{"cumsum_f16_u8_vx", cumsum_f16_u8_vx},
{"custom_softmax_vx", custom_softmax_vx},
{"custom_warp_affine_vx", custom_warp_affine_vx},
+ {"custom_warp_affine_rgb_vx", custom_warp_affine_rgb_vx},
{"custom_warp_perspective_vx", custom_warp_perspective_vx},
{"depth2space_crd_vx", depth2space_crd_vx},
{"depthwise_conv1d_src0_vx", depthwise_conv1d_src0_vx},
@@ -65988,12 +71541,15 @@ static const source_map_t evis_resource[] =
{"lstmunit_activation_S_F16_vx", lstmunit_activation_S_F16_vx},
{"lstmunit_activation_S_U8_vx", lstmunit_activation_S_U8_vx},
{"matrixmul_bf16_vx", matrixmul_bf16_vx},
+ {"matrixmul_cross_vx", matrixmul_cross_vx},
+ {"matrixmul_cross_i16_vx", matrixmul_cross_i16_vx},
{"matrixmul_f16_vx", matrixmul_f16_vx},
{"matrixmul_f16f16_u8_vx", matrixmul_f16f16_u8_vx},
{"matrixmul_f16i16_i16_vx", matrixmul_f16i16_i16_vx},
{"matrixmul_f16u8_f16_vx", matrixmul_f16u8_f16_vx},
{"matrixmul_f16u8_u8_vx", matrixmul_f16u8_u8_vx},
{"matrixmul_i16_vx", matrixmul_i16_vx},
+ {"matrixmul_merge_vx", matrixmul_merge_vx},
{"matrixmul_transA_vx", matrixmul_transA_vx},
{"matrixmul_transB_f16_vx", matrixmul_transB_f16_vx},
{"matrixmul_transB_f16_mix_vx", matrixmul_transB_f16_mix_vx},
@@ -66015,6 +71571,12 @@ static const source_map_t evis_resource[] =
{"moments_axis2_vx", moments_axis2_vx},
{"moments_u8_vx", moments_u8_vx},
{"moments_u8_axis012_vx", moments_u8_axis012_vx},
+ {"nearest_grid_sample_BF16_to_BF16_vx", nearest_grid_sample_BF16_to_BF16_vx},
+ {"nearest_grid_sample_F16_to_F16_vx", nearest_grid_sample_F16_to_F16_vx},
+ {"nearest_grid_sample_F16_to_U8_vx", nearest_grid_sample_F16_to_U8_vx},
+ {"nearest_grid_sample_I16_to_I16_vx", nearest_grid_sample_I16_to_I16_vx},
+ {"nearest_grid_sample_I8_to_I8_vx", nearest_grid_sample_I8_to_I8_vx},
+ {"nearest_grid_sample_U8_to_U8_vx", nearest_grid_sample_U8_to_U8_vx},
{"one_hot_vx", one_hot_vx},
{"poolwithargmax_F16_vx", poolwithargmax_F16_vx},
{"poolwithargmax_I16_vx", poolwithargmax_I16_vx},
@@ -66031,9 +71593,15 @@ static const source_map_t evis_resource[] =
{"pre_process_rgb888_planar_0_vx", pre_process_rgb888_planar_0_vx},
{"pre_process_rgb888_planar_1_vx", pre_process_rgb888_planar_1_vx},
{"pre_process_rgb888_planar_2_vx", pre_process_rgb888_planar_2_vx},
+ {"pre_process_rgb888_planar_nhwc_0_vx", pre_process_rgb888_planar_nhwc_0_vx},
+ {"pre_process_rgb888_planar_nhwc_1_vx", pre_process_rgb888_planar_nhwc_1_vx},
+ {"pre_process_rgb888_planar_nhwc_2_vx", pre_process_rgb888_planar_nhwc_2_vx},
{"pre_process_rgb888_planar_sep_0_vx", pre_process_rgb888_planar_sep_0_vx},
{"pre_process_rgb888_planar_sep_1_vx", pre_process_rgb888_planar_sep_1_vx},
{"pre_process_rgb888_planar_sep_2_vx", pre_process_rgb888_planar_sep_2_vx},
+ {"pre_process_rgb888_planar_sep_nhwc_0_vx", pre_process_rgb888_planar_sep_nhwc_0_vx},
+ {"pre_process_rgb888_planar_sep_nhwc_1_vx", pre_process_rgb888_planar_sep_nhwc_1_vx},
+ {"pre_process_rgb888_planar_sep_nhwc_2_vx", pre_process_rgb888_planar_sep_nhwc_2_vx},
{"pre_process_rgb_copy_vx", pre_process_rgb_copy_vx},
{"pre_process_yuv420_copy_vx", pre_process_yuv420_copy_vx},
{"pre_process_yuv420_scale_0_vx", pre_process_yuv420_scale_0_vx},
@@ -66092,6 +71660,8 @@ static const source_map_t evis_resource[] =
{"scatter_nd_update_vx", scatter_nd_update_vx},
{"scatter_nd_update_atom_vx", scatter_nd_update_atom_vx},
{"scatter_nd_update_big_vx", scatter_nd_update_big_vx},
+ {"scatter_nd_update_fp_vx", scatter_nd_update_fp_vx},
+ {"scatter_nd_update_qint_vx", scatter_nd_update_qint_vx},
{"scatter_nd_update_special_vx", scatter_nd_update_special_vx},
{"select_vx", select_vx},
{"sequence_mask_vx", sequence_mask_vx},
@@ -66102,6 +71672,8 @@ static const source_map_t evis_resource[] =
{"tensorstackconcat_vx", tensorstackconcat_vx},
{"tile_vx", tile_vx},
{"tile_mix_vx", tile_mix_vx},
+ {"tiny_yolov4_postprocess_box_vx", tiny_yolov4_postprocess_box_vx},
+ {"tiny_yolov4_postprocess_confidence_vx", tiny_yolov4_postprocess_confidence_vx},
{"upsample_F16_vx", upsample_F16_vx},
{"upsample_I16_vx", upsample_I16_vx},
{"upsample_I8_vx", upsample_I8_vx},
@@ -66192,6 +71764,7 @@ static const source_map_t cl_resource[] =
{"lstmunit_activation_S_F32_cl", lstmunit_activation_S_F32_cl},
{"lstmunit_activation_S_U8_cl", lstmunit_activation_S_U8_cl},
{"matrixmul_cl", matrixmul_cl},
+ {"matrixmul_cross_cl", matrixmul_cross_cl},
{"matrixmul_transA_cl", matrixmul_transA_cl},
{"maximum_cl", maximum_cl},
{"maxpoolwithargmax_cl", maxpoolwithargmax_cl},
@@ -66204,6 +71777,7 @@ static const source_map_t cl_resource[] =
{"moments_axis012_cl", moments_axis012_cl},
{"moments_axis1_cl", moments_axis1_cl},
{"moments_axis2_cl", moments_axis2_cl},
+ {"nearest_grid_sample_cl", nearest_grid_sample_cl},
{"one_hot_cl", one_hot_cl},
{"poolwithargmax_cl", poolwithargmax_cl},
{"pow_cl", pow_cl},
@@ -66229,6 +71803,8 @@ static const source_map_t cl_resource[] =
{"repeat_cl", repeat_cl},
{"resize_1d_bilinear_cl", resize_1d_bilinear_cl},
{"resize_1d_nearest_cl", resize_1d_nearest_cl},
+ {"resize_3d_bilinear_cl", resize_3d_bilinear_cl},
+ {"resize_3d_nearest_cl", resize_3d_nearest_cl},
{"resize_bilinear_cl", resize_bilinear_cl},
{"resize_nearest_cl", resize_nearest_cl},
{"reversesequence_cl", reversesequence_cl},
diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c
index 8462aad82..2c63c1e5e 100644
--- a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c
+++ b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c
@@ -33,6 +33,7 @@
#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_libnnext_resource.h"
+#include "vsi_nn_error.h"
static char s_vx_resource_path[VSI_NN_MAX_PATH] = "VX";
@@ -63,6 +64,11 @@ uint8_t * vsi_nn_LoadBinarySource
fseek( fp, 0, SEEK_SET );
buf = (uint8_t *)malloc( len + 1 );
+ if (buf == NULL)
+ {
+ fclose( fp );
+ return NULL;
+ }
n = (int32_t)fread( buf, 1, len, fp );
fclose( fp );
@@ -208,7 +214,10 @@ static vsi_status vsi_nn_RegisterVXKernel
evis = context->config.evis.ver;
program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *));
+ CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final );
program_len = (vx_size*)malloc(kernel_info->resource_num * sizeof(vx_size));
+ CHECK_PTR_FAIL_GOTO( program_len, "Create buffer fail.", final );
+
for (i = 0; i < kernel_info->resource_num; i++)
{
program_src[i] = vsi_nn_resource_load_source_code(
@@ -228,7 +237,7 @@ static vsi_status vsi_nn_RegisterVXKernel
{
VSILOGE("[%s : %d] vxCreateProgramWithSource() Error!\n", __FILE__, __LINE__);
status = VSI_FAILURE;
- goto OnError;
+ goto final;
}
if(evis == VSI_NN_HW_EVIS_NONE)
@@ -267,16 +276,17 @@ static vsi_status vsi_nn_RegisterVXKernel
{
VSILOGE( "Add kernel %s fail.", kernel->name );
}
-OnError:
+final:
for (i = 0; i < kernel_info->resource_num; i++)
{
- if (program_src[i] && load_from_file)
+ if (load_from_file && program_src[i])
{
free((char *)program_src[i]);
}
}
if(program_src) free((char**)program_src);
if(program_len) free(program_len);
+
return status;
}
@@ -286,7 +296,7 @@ static vsi_status vsi_nn_RegisterBinKernel
vsi_nn_kernel_info_t * kernel_info
)
{
- vsi_status status;
+ vsi_status status = VSI_FAILURE;
vx_kernel obj;
vx_program program = NULL;
vx_size program_len = 0;
@@ -308,6 +318,11 @@ static vsi_status vsi_nn_RegisterBinKernel
program_ptr = vsi_nn_VxBinResourceGetResource(
kernel_info->resource_name[kernel_info->resource_num - 1], &program_len);
+ if (program_ptr == NULL)
+ {
+ VSILOGE("[%s : %d] vsi_nn_VxBinResourceGetResource() Error!\n", __FILE__, __LINE__);
+ return status;
+ }
program = vxCreateProgramWithBinary(ctx, (const vx_uint8 *)program_ptr, program_len);
status = vxGetStatus((vx_reference)program);
@@ -396,10 +411,19 @@ vx_node vsi_nn_RegisterClientKernelAndNewNode
)
{
vsi_status status;
- vx_context ctx;
- vx_kernel obj;
- vx_node node;
- vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index];
+ vx_context ctx = NULL;
+ vx_kernel obj = NULL;
+ vx_node node = NULL;
+ vx_kernel_description_t * kernel = NULL;
+
+ if (kernel_info->kernel)
+ {
+ kernel = kernel_info->kernel[kernel_info->kernel_index];
+ }
+ else
+ {
+ goto final;
+ }
ctx = vxGetContext( (vx_reference)graph->g );
@@ -444,6 +468,8 @@ vx_node vsi_nn_RegisterClientKernelAndNewNode
kernel->name, status );
return NULL;
}
+
+final:
return node;
} /* vsi_nn_RegisterClientKernelAndNewNode() */
@@ -501,6 +527,10 @@ vsi_status VX_CALLBACK vsi_nn_KernelValidator
vx_meta_format metas[]
)
{
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(parameters);
+ VSI_UNREFERENCED(num);
+ VSI_UNREFERENCED(metas);
return VSI_SUCCESS;
} /* vsi_nn_KernelValidator() */
@@ -511,6 +541,9 @@ vsi_status VX_CALLBACK vsi_nn_KernelInitializer
uint32_t paraNum
)
{
+ VSI_UNREFERENCED(nodObj);
+ VSI_UNREFERENCED(paramObj);
+ VSI_UNREFERENCED(paraNum);
return VSI_SUCCESS;
} /* vsi_nn_KernelInitializer() */
@@ -521,6 +554,9 @@ vsi_status VX_CALLBACK vsi_nn_KernelDeinitializer
uint32_t paraNum
)
{
+ VSI_UNREFERENCED(nodObj);
+ VSI_UNREFERENCED(paraObj);
+ VSI_UNREFERENCED(paraNum);
return VSI_SUCCESS;
} /* vsi_nn_KernelDeinitializer() */
@@ -543,6 +579,8 @@ const uint8_t * vsi_nn_VxBinResourceGetResource
vx_size *len
)
{
+ VSI_UNREFERENCED(name);
+ VSI_UNREFERENCED(len);
return NULL;
} /* vsi_nn_VxBinResourceGetResource() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c b/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c
index 1f371d471..97da8bd51 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c
@@ -39,6 +39,7 @@
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
static vsi_status op_compute
(
@@ -78,6 +79,7 @@ static vsi_bool op_check
attr.vtl = TRUE;
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;
a_times_b[0] = vsi_nn_CreateTensor(self->graph, &attr);
+ CHECK_PTR_FAIL_GOTO(a_times_b[0], "Create tensor failed", final);
ret = vsi_nn_OpCheck(VSI_NN_OP_MULTIPLY, self, inputs, a_times_b);
if (!ret)
{
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c
index 078d708a7..b248d9054 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c
@@ -34,6 +34,7 @@
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_kernel_prv.h"
+#include "vsi_nn_error.h"
static int32_t _get_input_num
(
@@ -91,6 +92,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -101,6 +104,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
@@ -112,6 +118,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -122,7 +130,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_bool ret = TRUE;
+ vsi_bool ret = FALSE;
uint32_t i;
vsi_nn_tensor_attr_t attr;
vsi_nn_internal_node_t* curr = NULL;
@@ -134,6 +142,12 @@ static vsi_bool op_setup
input_num = _get_input_num(self, inputs);
+ if (input_num < 2)
+ {
+ VSILOGE( "Wrong input tensor number = %u.", input_num );
+ return FALSE;
+ }
+
is_sp_supported = vsi_nn_is_sp_supported_broadcast(self->graph, inputs, input_num, outputs[0]);
for(i = 0; i < input_num -1; i++)
@@ -142,6 +156,7 @@ static vsi_bool op_setup
/* setup input for each add */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
if(i == 0)
{
curr->inputs[0] = inputs[i];
@@ -174,6 +189,7 @@ static vsi_bool op_setup
}
temp_output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(temp_output_tensor, curr, "Create internal tensor failed", final);
curr->outputs[0] = temp_output_tensor->t;
}
@@ -182,8 +198,10 @@ static vsi_bool op_setup
curr->outputs[0] = outputs[0];
}
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
}
+
+final:
return ret;
} /* op_setup() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c
index 23248759e..6252e4d52 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c
@@ -70,6 +70,9 @@ static vsi_bool op_check
)
{
/*TODO: Check tensor shapes. */
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
@@ -80,6 +83,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.size[0] = inputs[1]->attr.size[0];
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c
index 56889cbed..0e6fa13e5 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c
@@ -248,6 +248,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
/* TODO: Add code to compute outputs' shape. */
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c
index a969fa6b5..7afa231b4 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c
@@ -78,6 +78,7 @@ static vsi_bool setup_op_shapes
attr.is_const = TRUE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
inputs[BI_LSTM_FW_INPUT_H_STATE] = output_tensor->t;
}
@@ -91,6 +92,7 @@ static vsi_bool setup_op_shapes
attr.is_const = TRUE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
inputs[BI_LSTM_BW_INPUT_H_STATE] = output_tensor->t;
}
@@ -119,6 +121,8 @@ static vsi_bool setup_op_shapes
}
return TRUE;
+final:
+ return FALSE;
}
static vsi_status op_compute
@@ -128,6 +132,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -139,6 +145,9 @@ static vsi_bool op_check
)
{
/*TODO: Check tensor shapes. */
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
@@ -150,6 +159,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -183,6 +194,9 @@ static vsi_bool op_setup
vsi_size_t batch_size = 0;
uint32_t time_step = 0;
vsi_size_t i = 0;
+ vsi_bool ret = FALSE;
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_tensor_t** merge_tensors = NULL;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_node_wksp( self );
@@ -207,6 +221,7 @@ static vsi_bool op_setup
/* transpose to time_major */
output_tensor = vsi_nn_rnn_transpose_time_major(self,
inputs[BI_LSTM_INPUT_INPUT], NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
input_tensor = output_tensor->t;
}
@@ -219,6 +234,7 @@ static vsi_bool op_setup
/* transpose to time_major */
output_tensor = vsi_nn_rnn_transpose_time_major(self,
inputs[BI_LSTM_AUX_INPUT], NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
aux_input_tensor = output_tensor->t;
}
}
@@ -231,10 +247,12 @@ static vsi_bool op_setup
CHECK_PTR_FAIL_GOTO( reshape_output_tensors, "Create buffer fail.", final );
memset( reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
- vsi_nn_rnn_split_input_tensor(self, input_tensor,
+ status = vsi_nn_rnn_split_input_tensor(self, input_tensor,
split_output_tensors, time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
- vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor);
+ status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
/* split aux input tensor */
if(has_aux_input)
@@ -246,10 +264,12 @@ static vsi_bool op_setup
CHECK_PTR_FAIL_GOTO( aux_reshape_output_tensors, "Create buffer fail.", final );
memset( aux_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
- vsi_nn_rnn_split_input_tensor(self, aux_input_tensor,
+ status = vsi_nn_rnn_split_input_tensor(self, aux_input_tensor,
aux_split_output_tensors, time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
- vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, time_step, use_virtual_tensor);
+ status = vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
}
/* prepare output tensor */
@@ -267,6 +287,7 @@ static vsi_bool op_setup
/* reshape for split output */
output_tensor = vsi_nn_rnn_reshape_split_output(self,
split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
reshape_output_tensors[i] = output_tensor->t;
if (has_aux_input)
@@ -274,6 +295,7 @@ static vsi_bool op_setup
/* reshape for aux split output */
output_tensor = vsi_nn_rnn_reshape_split_output(self,
aux_split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
aux_reshape_output_tensors[i] = output_tensor->t;
}
}
@@ -291,21 +313,25 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
lstmcell_out0 = output_tensor->t;
/* lstmcell output h_state */
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
lstmcell_out1 = output_tensor->t;
/* lstmcell output c_state */
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
lstmcell_out2 = output_tensor->t;
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation;
curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip;
curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias;
@@ -373,6 +399,7 @@ static vsi_bool op_setup
/* reshape output to 3-dims */
output_tensor = vsi_nn_rnn_reshape_cell_output(self,
lstmcell_out0, (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
lstmcell_reshape_output_tensors_fw[i] = output_tensor->t;
}
@@ -391,21 +418,25 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
lstmcell_out0 = output_tensor->t;
/* lstmcell output h_state */
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
lstmcell_out1 = output_tensor->t;
/* lstmcell output c_state */
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
lstmcell_out2 = output_tensor->t;
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation;
curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip;
curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias;
@@ -473,12 +504,12 @@ static vsi_bool op_setup
/* reshape output to 3-dims */
output_tensor = vsi_nn_rnn_reshape_cell_output(self,
lstmcell_out0, (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
lstmcell_reshape_output_tensors_bw[i] = output_tensor->t;
}
if(curr_param->merge_outputs)
{
- vsi_nn_tensor_t** merge_tensors = NULL;
merge_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
CHECK_PTR_FAIL_GOTO( merge_tensors, "Create buffer fail.", final );
memset( merge_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
@@ -489,6 +520,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
tensor = output_tensor->t;
}
@@ -499,8 +531,10 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 2, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 0;
curr->inputs[0] = lstmcell_reshape_output_tensors_fw[i];
curr->inputs[1] = lstmcell_reshape_output_tensors_bw[i];
@@ -512,6 +546,7 @@ static vsi_bool op_setup
/* concat lstmcell output, the lstm's output is 3-dims */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 2;
for( i = 0; i < time_step; i++ )
{
@@ -526,7 +561,6 @@ static vsi_bool op_setup
vsi_nn_rnn_transpose_time_major(self,
tensor, outputs[BI_LSTM_FW_OUTPUT_OUTPUT], use_virtual_tensor);
}
- vsi_nn_safe_free( merge_tensors );
}
else
{
@@ -537,12 +571,14 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
tensor = output_tensor->t;
}
/* concat lstmcell output, the lstm's output is 3-dims */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 2;
for( i = 0; i < time_step; i++ )
{
@@ -565,12 +601,14 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
tensor = output_tensor->t;
}
/* concat lstmcell output, the lstm's output is 3-dims */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 2;
for( i = 0; i < time_step; i++ )
{
@@ -587,7 +625,10 @@ static vsi_bool op_setup
}
}
+ ret = TRUE;
+
final:
+ vsi_nn_safe_free( merge_tensors );
vsi_nn_safe_free( split_output_tensors );
vsi_nn_safe_free( aux_split_output_tensors )
vsi_nn_safe_free( reshape_output_tensors );
@@ -595,7 +636,7 @@ static vsi_bool op_setup
vsi_nn_safe_free( lstmcell_reshape_output_tensors_fw );
vsi_nn_safe_free( lstmcell_reshape_output_tensors_bw );
- return TRUE;
+ return ret;
} /* op_setup() */
static vsi_status op_deinit
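
The bidirectional-LSTM setup above is reworked around a single cleanup exit: every allocation is guarded by CHECK_PTR_FAIL_GOTO, `ret` starts out FALSE and is flipped only after the last step succeeds, and buffers such as merge_tensors are now released at `final:` on both the success and the failure path. A compact standalone sketch of that shape, using simplified stand-ins for the ovxlib macros (their exact bodies are assumed):

    /* Sketch of the single-exit cleanup pattern; macros are stand-ins. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <stdbool.h>

    #define CHECK_PTR_FAIL_GOTO(ptr, msg, label) \
        do { if ((ptr) == NULL) { fprintf(stderr, "%s\n", (msg)); goto label; } } while (0)

    #define safe_free(p) do { free(p); (p) = NULL; } while (0)

    static bool op_setup(size_t time_step)
    {
        bool ret = false;                 /* pessimistic default            */
        int *split_outputs = NULL;
        int *merge_tensors = NULL;        /* freed at final: on every path  */

        split_outputs = malloc(time_step * sizeof(*split_outputs));
        CHECK_PTR_FAIL_GOTO(split_outputs, "Create buffer fail.", final);
        memset(split_outputs, 0, time_step * sizeof(*split_outputs));

        merge_tensors = malloc(time_step * sizeof(*merge_tensors));
        CHECK_PTR_FAIL_GOTO(merge_tensors, "Create buffer fail.", final);

        ret = true;                       /* only reached if nothing failed */
    final:
        safe_free(merge_tensors);
        safe_free(split_outputs);
        return ret;
    }

    int main(void)
    {
        printf("setup ok: %d\n", op_setup(4));
        return 0;
    }
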
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c
index c122de7f5..8b3844de0 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c
@@ -79,6 +79,7 @@ static vsi_bool setup_op_shapes
attr.is_const = TRUE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
inputs[BI_RNN_FW_INPUT_H_STATE] = output_tensor->t;
}
@@ -92,6 +93,7 @@ static vsi_bool setup_op_shapes
attr.is_const = TRUE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
inputs[BI_RNN_BW_INPUT_H_STATE] = output_tensor->t;
}
@@ -103,6 +105,7 @@ static vsi_bool setup_op_shapes
attr.vtl = use_virtual_tensor;
attr.is_const = FALSE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
outputs[BI_RNN_FW_OUTPUT_H_STATE] = output_tensor->t;
}
@@ -114,6 +117,7 @@ static vsi_bool setup_op_shapes
attr.vtl = use_virtual_tensor;
attr.is_const = FALSE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
outputs[BI_RNN_BW_OUTPUT_H_STATE] = output_tensor->t;
}
@@ -162,6 +166,8 @@ static vsi_bool setup_op_shapes
}
}
return TRUE;
+final:
+ return FALSE;
}
static vsi_status op_compute
@@ -171,6 +177,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -181,6 +189,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -193,6 +204,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -225,6 +238,9 @@ static vsi_bool op_setup
vsi_size_t batch_size = 0;
vsi_size_t time_step = 0;
vsi_size_t i = 0;
+ vsi_bool ret = FALSE;
+ vsi_nn_tensor_t** merge_tensors = NULL;
+ vsi_status status = VSI_FAILURE;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_node_wksp( self );
@@ -249,6 +265,7 @@ static vsi_bool op_setup
/* transpose to time_major */
output_tensor = vsi_nn_rnn_transpose_time_major(self,
inputs[BI_RNN_INPUT_INPUT], NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
input_tensor = output_tensor->t;
}
@@ -261,6 +278,7 @@ static vsi_bool op_setup
/* transpose to time_major */
output_tensor = vsi_nn_rnn_transpose_time_major(self,
inputs[BI_RNN_AUX_INPUT], NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
aux_input_tensor = output_tensor->t;
}
}
@@ -273,10 +291,12 @@ static vsi_bool op_setup
CHECK_PTR_FAIL_GOTO( reshape_output_tensors, "Create buffer fail.", final );
memset( reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
- vsi_nn_rnn_split_input_tensor(self, input_tensor,
+ status = vsi_nn_rnn_split_input_tensor(self, input_tensor,
split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
- vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
/* split aux input tensor */
if(has_aux_input)
@@ -288,10 +308,13 @@ static vsi_bool op_setup
CHECK_PTR_FAIL_GOTO( aux_reshape_output_tensors, "Create buffer fail.", final );
memset( aux_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
- vsi_nn_rnn_split_input_tensor(self, aux_input_tensor,
+ status = vsi_nn_rnn_split_input_tensor(self, aux_input_tensor,
aux_split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
- vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ status = vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors,
+ (uint32_t)time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
}
/* prepare output tensor */
@@ -309,6 +332,7 @@ static vsi_bool op_setup
/* reshape for split output */
output_tensor = vsi_nn_rnn_reshape_split_output(self,
split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
reshape_output_tensors[i] = output_tensor->t;
if (has_aux_input)
@@ -316,6 +340,7 @@ static vsi_bool op_setup
/* reshape for aux split output */
output_tensor = vsi_nn_rnn_reshape_split_output(self,
aux_split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
aux_reshape_output_tensors[i] = output_tensor->t;
}
}
@@ -331,12 +356,14 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
rnncell_out0 = output_tensor->t;
/* rnncell output h_state */
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
rnncell_out1 = output_tensor->t;
if (reshape_output_tensors[i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 &&
@@ -366,6 +393,7 @@ static vsi_bool op_setup
}
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation;
memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype,
curr_param->internal_dtype,
@@ -399,6 +427,7 @@ static vsi_bool op_setup
/* reshape output to 3-dims */
output_tensor = vsi_nn_rnn_reshape_cell_output(self,
rnncell_out0, (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
rnncell_reshape_output_tensors_fw[i] = output_tensor->t;
}
@@ -421,12 +450,14 @@ static vsi_bool op_setup
&outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
}
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
rnncell_out0 = output_tensor->t;
/* rnncell output h_state */
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_RNN_BW_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
rnncell_out1 = output_tensor->t;
if (reshape_output_tensors[time_step - 1 - i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 &&
@@ -456,6 +487,7 @@ static vsi_bool op_setup
}
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation;
memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype,
curr_param->internal_dtype,
@@ -489,12 +521,12 @@ static vsi_bool op_setup
/* reshape output to 3-dims */
output_tensor = vsi_nn_rnn_reshape_cell_output(self,
rnncell_out0, (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
rnncell_reshape_output_tensors_bw[time_step - 1 - i] = output_tensor->t;
}
if(curr_param->merge_outputs)
{
- vsi_nn_tensor_t** merge_tensors = NULL;
merge_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
CHECK_PTR_FAIL_GOTO( merge_tensors, "Create buffer fail.", final );
memset( merge_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
@@ -505,6 +537,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
tensor = output_tensor->t;
}
@@ -515,8 +548,10 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 2, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 0;
curr->inputs[0] = rnncell_reshape_output_tensors_fw[i];
curr->inputs[1] = rnncell_reshape_output_tensors_bw[i];
@@ -528,6 +563,7 @@ static vsi_bool op_setup
/* concat rnncell output, the rnn's output is 3-dims */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 2;
for( i = 0; i < time_step; i++ )
{
@@ -542,7 +578,6 @@ static vsi_bool op_setup
vsi_nn_rnn_transpose_time_major(self,
tensor, outputs[BI_RNN_FW_OUTPUT_OUTPUT], use_virtual_tensor);
}
- vsi_nn_safe_free( merge_tensors );
}
else
{
@@ -553,6 +588,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
tensor = output_tensor->t;
}
@@ -561,6 +597,7 @@ static vsi_bool op_setup
if (outputs[BI_RNN_FW_OUTPUT_H_STATE] != NULL)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = last_step_h_state_fw;
curr->outputs[0] = outputs[BI_RNN_FW_OUTPUT_H_STATE];
vsi_nn_internal_setup_node(self, curr);
@@ -568,6 +605,7 @@ static vsi_bool op_setup
/* concat rnncell output, the rnn's output is 3-dims */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 2;
for( i = 0; i < time_step; i++ )
{
@@ -590,6 +628,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
tensor = output_tensor->t;
}
@@ -598,6 +637,7 @@ static vsi_bool op_setup
if (outputs[BI_RNN_BW_OUTPUT_H_STATE] != NULL)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = last_step_h_state_bw;
curr->outputs[0] = outputs[BI_RNN_BW_OUTPUT_H_STATE];
vsi_nn_internal_setup_node(self, curr);
@@ -605,6 +645,7 @@ static vsi_bool op_setup
/* concat rnncell output, the rnn's output is 3-dims */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 2;
for( i = 0; i < time_step; i++ )
{
@@ -621,6 +662,7 @@ static vsi_bool op_setup
}
}
+ ret = TRUE;
final:
vsi_nn_safe_free( split_output_tensors );
vsi_nn_safe_free( aux_split_output_tensors )
@@ -628,8 +670,9 @@ static vsi_bool op_setup
vsi_nn_safe_free( aux_reshape_output_tensors );
vsi_nn_safe_free( rnncell_reshape_output_tensors_fw );
vsi_nn_safe_free( rnncell_reshape_output_tensors_bw );
+ vsi_nn_safe_free( merge_tensors );
- return TRUE;
+ return ret;
} /* op_setup() */
static vsi_status op_deinit
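
The same theme continues in the bidirectional-RNN setup: calls such as vsi_nn_rnn_split_input_tensor() and vsi_nn_rnn_data_check_aligned() used to be fire-and-forget, and their vsi_status results are now captured and checked. A small before/after sketch, with CHECK_STATUS_FAIL_GOTO assumed to compare against VSI_SUCCESS and jump:

    /* Sketch only: the macro body and status values are assumptions. */
    #include <stdio.h>

    typedef int vsi_status;
    #define VSI_SUCCESS 0
    #define VSI_FAILURE (-1)

    #define CHECK_STATUS_FAIL_GOTO(s, label) \
        do { if ((s) != VSI_SUCCESS) goto label; } while (0)

    static vsi_status split_input_tensor(int time_step)
    {
        return time_step > 0 ? VSI_SUCCESS : VSI_FAILURE;
    }

    static int op_setup(int time_step)
    {
        vsi_status status = VSI_FAILURE;
        int ret = 0;

        /* before: split_input_tensor(time_step);      (result dropped) */
        status = split_input_tensor(time_step);
        CHECK_STATUS_FAIL_GOTO(status, final);

        ret = 1;
    final:
        return ret;
    }

    int main(void)
    {
        printf("%d %d\n", op_setup(4), op_setup(0));
        return 0;
    }
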
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c
index 878c60692..9f7e6ace9 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c
@@ -81,6 +81,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -92,6 +95,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = 1;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c
index cac99d089..f53aeb548 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c
@@ -149,6 +149,8 @@ static vsi_bool op_setup
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool ret = TRUE;
+ VSI_UNREFERENCED(self);
+
out_rank = inputs[0]->attr.dim_num;
for (i = 0; i < out_rank; i++)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c
index 1eaa7839a..e3de22fff 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c
@@ -37,6 +37,7 @@
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
/*
Declare number of input and output.
@@ -290,7 +291,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_bool ret = TRUE;
+ vsi_bool ret = FALSE;
if ( NULL == self )
{
@@ -298,7 +299,7 @@ static vsi_bool op_setup
}
ret = vsi_nn_op_common_setup(self, inputs, outputs);
- if ( _is_dataconvert_op(self, inputs, outputs) )
+ if ( _is_dataconvert_op(self, inputs, outputs) )
{
vsi_nn_internal_node_t* curr = NULL;
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
@@ -309,7 +310,7 @@ static vsi_bool op_setup
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret &= vsi_nn_internal_setup_node(self, curr);
}
return ret;
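
In the cast setup above, `ret` now starts at FALSE and the result of building the internal DATACONVERT node is folded in with `&=`, so a failure in either stage propagates to the caller; this relies on vsi_bool holding 0/1 values. A tiny illustration under those assumptions:

    /* Sketch: vsi_bool assumed to be an int holding 0 or 1. */
    #include <stdio.h>

    typedef int vsi_bool;
    enum { FALSE = 0, TRUE = 1 };

    static vsi_bool common_setup(int ok) { return ok ? TRUE : FALSE; }
    static vsi_bool setup_node(int ok)   { return ok ? TRUE : FALSE; }

    static vsi_bool op_setup(int ok1, int ok2)
    {
        vsi_bool ret = FALSE;            /* start pessimistic                  */
        ret = common_setup(ok1);         /* stage 1: shape inference           */
        ret &= setup_node(ok2);          /* stage 2: internal DATACONVERT node */
        return ret;                      /* TRUE only if both stages passed    */
    }

    int main(void)
    {
        printf("%d %d %d\n", op_setup(1, 1), op_setup(1, 0), op_setup(0, 1));
        return 0;
    }
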
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c
index 3e1db0e6d..bade3f959 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c
@@ -39,6 +39,7 @@
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_constraint_check.h"
#include "utils/vsi_nn_dtype_util.h"
+#include "vsi_nn_error.h"
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
@@ -194,7 +195,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_bool ret = TRUE;
+ vsi_bool ret = FALSE;
vsi_nn_internal_node_t* curr = NULL;
float min = self->nn_param.clip.min;
float max = self->nn_param.clip.max;
@@ -224,11 +225,12 @@ static vsi_bool op_setup
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0);
}
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
self->nn_param.clip.local2->is_internal_node = TRUE;
}
@@ -236,6 +238,8 @@ static vsi_bool op_setup
{
ret = vsi_nn_op_common_setup(self, inputs, outputs);
}
+
+final:
return ret;
} /* op_init() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_common.c b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c
index 354b6ce61..f4e70c55f 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_common.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c
@@ -38,6 +38,9 @@ vsi_status vsi_nn_op_common_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
//TODO: assert_always()
return VSI_FAILURE;
} /* op_common_init() */
@@ -64,6 +67,7 @@ vsi_bool vsi_nn_op_common_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(node);
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
@@ -81,5 +85,8 @@ vsi_status vsi_nn_op_common_optimize
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(node);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return VSI_SUCCESS;
} /* op_common_optimize() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c
index bb1be6e1a..47b5889df 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c
@@ -159,6 +159,8 @@ static vsi_status copy_tensor_to_view
vsi_status ret;
vsi_nn_concat_lcl_data * data;
+ VSI_UNREFERENCED(axis);
+
ret = VSI_SUCCESS;
/* Malloc ptr */
data = (vsi_nn_concat_lcl_data *)malloc( sizeof(vsi_nn_concat_lcl_data) );
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c
index f07a690eb..f802f44e9 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c
@@ -32,6 +32,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
typedef struct _conv1d_local_data_t {
vsi_bool use_ext_pad;
@@ -324,12 +325,16 @@ static vsi_bool op_setup
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE);
tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( tensor, "Create tensor fail.", final );
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_PAD, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
front_data = (uint32_t*)\
vsi_nn_internal_new_node_param(curr, sizeof(uint32_t) * inputs[0]->attr.dim_num);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(front_data, curr, "Create internal buffer failed", final);
back_data = (uint32_t*)\
vsi_nn_internal_new_node_param(curr, sizeof(uint32_t) * inputs[0]->attr.dim_num);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(back_data, curr, "Create internal buffer failed", final);
front_data[0] = p->pad[0];
front_data[1] = 0;
@@ -353,6 +358,8 @@ static vsi_bool op_setup
}
return TRUE;
+final:
+ return FALSE;
} /* op_setup() */
static vsi_status op_deinit
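
conv1d's pad setup now uses CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE: if allocating a parameter buffer fails after the internal PAD node was already created, the half-built node is released before bailing out. A standalone sketch of that "release the owner on failure" idea, with a plain struct standing in for the internal node (assumed semantics, not the ovxlib implementation):

    /* Sketch only: fake_node_t and the macro stand in for ovxlib types. */
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { unsigned *front_pad; unsigned *back_pad; } fake_node_t;

    static void release_node(fake_node_t **n)
    {
        if (n && *n) { free((*n)->front_pad); free((*n)->back_pad); free(*n); *n = NULL; }
    }

    #define CHECK_PTR_FAIL_RLS_NODE(ptr, node, msg, label) \
        do { if ((ptr) == NULL) { fprintf(stderr, "%s\n", (msg)); release_node(&(node)); goto label; } } while (0)

    static int setup_pad(unsigned dims)
    {
        fake_node_t *curr = calloc(1, sizeof(*curr));
        if (curr == NULL) return 0;

        curr->front_pad = malloc(dims * sizeof(unsigned));
        CHECK_PTR_FAIL_RLS_NODE(curr->front_pad, curr, "Create internal buffer failed", final);
        curr->back_pad = malloc(dims * sizeof(unsigned));
        CHECK_PTR_FAIL_RLS_NODE(curr->back_pad, curr, "Create internal buffer failed", final);

        /* ... fill pad values; a real node would be handed to the graph here ... */
        release_node(&curr);   /* owned locally in this sketch, so free it */
        return 1;
    final:
        return 0;              /* node already released by the macro */
    }

    int main(void) { printf("%d\n", setup_pad(3)); return 0; }
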
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c
index 03118aaa2..2e1ae75f5 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c
@@ -38,6 +38,7 @@
#include "vsi_nn_error.h"
#include "vsi_nn_internal_node.h"
#include "vsi_nn_rnn_helper.h"
+#include "vsi_nn_error.h"
static vsi_nn_internal_tensor_t * reshape_cell_out
(
@@ -54,11 +55,14 @@ static vsi_nn_internal_tensor_t * reshape_cell_out
vsi_nn_internal_init_tensor_attr(&attr, &cell_out->attr.dtype, TRUE);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
/* reshape cell_out [w,h,c,n] to [w,h,c,1,n] */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
reshape_cell_size = vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( reshape_cell_size, curr, "Create internal buffer fail.", final );
reshape_cell_size[0] = cell_out->attr.size[0];
reshape_cell_size[1] = cell_out->attr.size[1];
reshape_cell_size[2] = cell_out->attr.size[2];
@@ -71,6 +75,8 @@ static vsi_nn_internal_tensor_t * reshape_cell_out
curr->outputs[0] = output_tensor->t;
vsi_nn_internal_setup_node( self, curr );
+
+final:
return output_tensor;
} /* reshape_cell_out() */
@@ -88,11 +94,14 @@ static vsi_nn_internal_tensor_t * reshape_split_out
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_tensor_attr(&attr, &split_out->attr.dtype, TRUE);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
/* reshape [w,h,c,t,n] to [w,h,c,n] */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
reshape_split_size = vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( reshape_split_size, curr, "Create internal buffer fail.", final );
reshape_split_size[0] = split_out->attr.size[0];
reshape_split_size[1] = split_out->attr.size[1];
reshape_split_size[2] = split_out->attr.size[2];
@@ -104,10 +113,11 @@ static vsi_nn_internal_tensor_t * reshape_split_out
curr->outputs[0] = output_tensor->t;
vsi_nn_internal_setup_node( self, curr );
+final:
return output_tensor;
} /* reshape_split_out() */
-static void split_input_tensor
+static vsi_status split_input_tensor
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input,
@@ -115,6 +125,7 @@ static void split_input_tensor
uint32_t time_step
)
{
+ vsi_status status = VSI_FAILURE;
uint32_t i;
vsi_nn_tensor_attr_t attr;
vsi_nn_internal_node_t* curr = NULL;
@@ -124,7 +135,9 @@ static void split_input_tensor
i = 0;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, time_step );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, time_step * sizeof(uint32_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(slices, curr, "Create internal buffer failed", final);
curr->node->nn_param.split.axis = 3; /* input_shape [w,h,c,t,n] */
curr->node->nn_param.split.slices_num = time_step;
curr->inputs[0] = input;
@@ -135,10 +148,15 @@ static void split_input_tensor
slices[i] = 1;
vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, TRUE);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( output_tensor, curr, "Create internal tensor fail.", final );
curr->outputs[i] = output_tensor->t;
output[i] = output_tensor->t;
}
vsi_nn_internal_setup_node( self, curr );
+
+ status = VSI_SUCCESS;
+final:
+ return status;
} /* split_input_tensor() */
static void trans_output_tensor
@@ -182,13 +200,14 @@ static void trans_output_tensor
}
} /* trans_output_tensor() */
-static void trans_input_tensor
+static vsi_status trans_input_tensor
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** trans_inputs
)
{
+ vsi_status status = VSI_FAILURE;
vsi_size_t perm[VSI_NN_MAX_DIM_NUM];
vsi_nn_internal_tensor_t * tmp_tensor = NULL;
vsi_nn_conv2d_lstm_param * p = &self->nn_param.conv2d_lstm;
@@ -203,6 +222,7 @@ static void trans_input_tensor
perm[3] = 3;
perm[4] = 4;
tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_INPUT], NULL, perm, 5, TRUE);
+ CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final );
trans_inputs[CONV2D_LSTM_IN_INPUT] = tmp_tensor->t;
// [c,w,h,n] --> [w,h,c,n]
@@ -211,9 +231,11 @@ static void trans_input_tensor
perm[2] = 0;
perm[3] = 3;
tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_H_STATE], NULL, perm, 4, TRUE);
+ CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final );
trans_inputs[CONV2D_LSTM_IN_H_STATE] = tmp_tensor->t;
tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_C_STATE], NULL, perm, 4, TRUE);
+ CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final );
trans_inputs[CONV2D_LSTM_IN_C_STATE] = tmp_tensor->t;
}
else
@@ -222,9 +244,13 @@ static void trans_input_tensor
trans_inputs[CONV2D_LSTM_IN_H_STATE] = inputs[CONV2D_LSTM_IN_H_STATE];
trans_inputs[CONV2D_LSTM_IN_C_STATE] = inputs[CONV2D_LSTM_IN_C_STATE];
}
+
+ status = VSI_SUCCESS;
+final:
+ return status;
} /* trans_input_tensor() */
-static void create_state_tensor
+static vsi_status create_state_tensor
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
@@ -234,6 +260,7 @@ static void create_state_tensor
vsi_size_t out_channel
)
{
+ vsi_status status = VSI_FAILURE;
vsi_size_t samples, state_shape[4];
vsi_nn_tensor_attr_t attr;
vsi_nn_internal_tensor_t * tensor = NULL;
@@ -267,6 +294,7 @@ static void create_state_tensor
attr.is_const = TRUE;
tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
inputs[CONV2D_LSTM_IN_H_STATE] = tensor->t;
}
@@ -280,6 +308,7 @@ static void create_state_tensor
attr.is_const = TRUE;
tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
inputs[CONV2D_LSTM_IN_C_STATE] = tensor->t;
}
@@ -291,6 +320,7 @@ static void create_state_tensor
attr.vtl = TRUE;
attr.is_const = FALSE;
tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
outputs[CONV2D_LSTM_OUT_H_STATE] = tensor->t;
}
@@ -303,8 +333,12 @@ static void create_state_tensor
attr.vtl = TRUE;
attr.is_const = FALSE;
tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
outputs[CONV2D_LSTM_OUT_C_STATE] = tensor->t;
}
+ status = VSI_SUCCESS;
+final:
+ return status;
} /* create_state_tensor() */
static vsi_bool setup_op_shapes
@@ -314,6 +348,7 @@ static vsi_bool setup_op_shapes
vsi_nn_tensor_t ** outputs
)
{
+ vsi_status status = VSI_FAILURE;
vsi_nn_tensor_attr_t attr;
vsi_size_t w_out, h_out, samples, timestep, out_channel;
vsi_size_t conv_in_shape[4];
@@ -411,7 +446,8 @@ static vsi_bool setup_op_shapes
}
/* create hstate and cstate input/output if app doesn't provide them */
- create_state_tensor(self, inputs, outputs, w_out, h_out, out_channel);
+ status = create_state_tensor(self, inputs, outputs, w_out, h_out, out_channel);
+ CHECK_STATUS_FAIL_GOTO(status, final);
/* hidden state output */
if(VSI_NN_DIM_AUTO == outputs[CONV2D_LSTM_OUT_H_STATE]->attr.dim_num)
@@ -452,6 +488,8 @@ static vsi_bool setup_op_shapes
}
return TRUE;
+final:
+ return FALSE;
} /* setup_op_shapes() */
static vsi_status op_compute
@@ -461,6 +499,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -471,6 +511,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -483,6 +526,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -493,6 +538,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ vsi_status status = VSI_FAILURE;
vsi_size_t i, timestep, perm[VSI_NN_MAX_DIM_NUM];
vsi_nn_tensor_t * trans_inputs[3] = { NULL };
vsi_nn_tensor_t * conv2dlstm_outputs[3] = { NULL };
@@ -503,6 +549,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t * cell_out0 = NULL, * cell_out1 = NULL, * cell_out2 = NULL;
vsi_nn_conv2d_lstm_param * p = &self->nn_param.conv2d_lstm;
vsi_nn_internal_node_t* curr = NULL;
+ vsi_bool ret = FALSE;
memset(&attr, 0, sizeof(attr));
memset(perm, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM);
@@ -512,7 +559,8 @@ static vsi_bool op_setup
setup_op_shapes(self, inputs, outputs);
- trans_input_tensor(self, inputs, trans_inputs);
+ status = trans_input_tensor(self, inputs, trans_inputs);
+ CHECK_STATUS_FAIL_GOTO(status, final);
split_outputs = (vsi_nn_tensor_t **)malloc(sizeof(vsi_nn_tensor_t *) * timestep);
CHECK_PTR_FAIL_GOTO( split_outputs, "Create buffer fail.", final );
@@ -522,7 +570,8 @@ static vsi_bool op_setup
memset(conv2dlstm_step_outputs, 0, sizeof(vsi_nn_tensor_t *) * timestep);
/* split input tensor by time-step */
- split_input_tensor(self, trans_inputs[CONV2D_LSTM_IN_INPUT], split_outputs, (uint32_t)timestep);
+ status = split_input_tensor(self, trans_inputs[CONV2D_LSTM_IN_INPUT], split_outputs, (uint32_t)timestep);
+ CHECK_STATUS_FAIL_GOTO(status, final);
cell_out0 = cell_out1 = cell_out2 = NULL;
step_h_state = trans_inputs[CONV2D_LSTM_IN_H_STATE];
@@ -533,6 +582,7 @@ static vsi_bool op_setup
/* reshape for split output */
tmp_tensor = reshape_split_out(self, split_outputs[i]);
+ CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final );
reshape_output = tmp_tensor->t;
if((i == timestep - 1) && p->return_sequences == FALSE && p->data_format == CONV2D_LSTM_CHANNELS_FIRST)
@@ -543,6 +593,7 @@ static vsi_bool op_setup
{
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dtype, TRUE);
tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final );
cell_out0 = tmp_tensor->t;
}
@@ -556,16 +607,19 @@ static vsi_bool op_setup
/* conv2d_lstm hstate output */
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_H_STATE]->attr.dtype, TRUE);
tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final );
cell_out1 = tmp_tensor->t;
/* conv2d_lstm cstate output */
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_C_STATE]->attr.dtype, TRUE);
tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final );
cell_out2 = tmp_tensor->t;
}
/* create a conv2d_lstm_cell */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONV2D_LSTM_CELL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.conv2d_lstm_cell.filters = p->filters;
curr->node->nn_param.conv2d_lstm_cell.activation = p->activation;
curr->node->nn_param.conv2d_lstm_cell.recurrent_activation = p->recurrent_activation;
@@ -600,6 +654,7 @@ static vsi_bool op_setup
{
/* store step's outputs */
tmp_tensor = reshape_cell_out(self, cell_out0);
+ CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final );
conv2dlstm_step_outputs[i] = tmp_tensor->t;
}
}
@@ -610,6 +665,7 @@ static vsi_bool op_setup
{
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dtype, TRUE);
tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final );
conv2dlstm_outputs[CONV2D_LSTM_OUT_OUTPUT] = tmp_tensor->t;
}
else
@@ -618,6 +674,7 @@ static vsi_bool op_setup
}
/* concat all step's output0 data on dimension t --- cell out0 shape: [w,h,c,t,n] */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)timestep, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 3;
for(i = 0; i < timestep; i++)
{
@@ -638,10 +695,11 @@ static vsi_bool op_setup
trans_output_tensor(self, conv2dlstm_outputs, outputs);
}
+ ret = TRUE;
final:
vsi_nn_safe_free(split_outputs);
vsi_nn_safe_free(conv2dlstm_step_outputs)
- return TRUE;
+ return ret;
} /* op_setup() */
static vsi_status op_deinit
@@ -660,6 +718,7 @@ static vsi_status op_init
)
{
vsi_status status = VSI_SUCCESS;
+ VSI_UNREFERENCED(self);
return status;
} /* op_init() */
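
In conv2d_lstm, the tensor-building helpers now funnel their own failures to a local `final:` and hand NULL (or a vsi_status) back to op_setup instead of dereferencing a failed allocation; the callers then check the result with CHECK_PTR_FAIL_GOTO or CHECK_STATUS_FAIL_GOTO. A simplified sketch of a pointer-returning helper written in that style:

    /* Sketch only: internal_tensor_t stands in for vsi_nn_internal_tensor_t. */
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { int *shape; } internal_tensor_t;

    static internal_tensor_t *reshape_cell_out(int dims)
    {
        internal_tensor_t *out = NULL;
        int *shape = NULL;

        out = calloc(1, sizeof(*out));
        if (out == NULL) goto final;            /* tensor creation failed */

        shape = malloc(dims * sizeof(*shape));
        if (shape == NULL) {                    /* param buffer failed    */
            free(out);
            out = NULL;
            goto final;
        }
        out->shape = shape;
    final:
        return out;                             /* NULL signals failure   */
    }

    int main(void)
    {
        internal_tensor_t *t = reshape_cell_out(5);
        if (t == NULL) {                        /* caller-side check      */
            fprintf(stderr, "Create internal tensor fail.\n");
            return 1;
        }
        printf("tensor created\n");
        free(t->shape);
        free(t);
        return 0;
    }
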
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c
index 388de95c3..3a31d44db 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c
@@ -35,7 +35,7 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
+#include "vsi_nn_error.h"
#include "vsi_nn_internal_node.h"
#include "vsi_nn_rnn_helper.h"
@@ -99,8 +99,10 @@ static vsi_nn_internal_tensor_t * create_input_conv
attr.vtl = TRUE;
attr.is_const = FALSE;
input_conv_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(input_conv_out, "Create internal tensor failed", final);
input_conv = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(input_conv, "Create internal node failed", final);
input_conv->node->nn_param.conv2d.group = 1;
input_conv->node->nn_param.conv2d.ksize[0] = p->conv2d.ksize[0];
input_conv->node->nn_param.conv2d.ksize[1] = p->conv2d.ksize[1];
@@ -129,6 +131,7 @@ static vsi_nn_internal_tensor_t * create_input_conv
// reshape whcn --> xn
reshape_out = reshape_tensor_to_act(self, input_conv_out->t);
+final:
return reshape_out;
} /* create_input_conv() */
@@ -176,8 +179,10 @@ static vsi_nn_internal_tensor_t * create_recurrent_conv
attr.vtl = TRUE;
attr.is_const = FALSE;
recurrent_conv_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(recurrent_conv_out, "Create internal tensor failed", final);
recurrent_conv = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(recurrent_conv, "Create internal node failed", final);
recurrent_conv->node->nn_param.conv2d.pad_type = VSI_NN_PAD_SAME;
recurrent_conv->node->nn_param.conv2d.group = 1;
recurrent_conv->node->nn_param.conv2d.ksize[0] = p->conv2d.ksize[0];
@@ -203,6 +208,8 @@ static vsi_nn_internal_tensor_t * create_recurrent_conv
// reshape whcn --> xn
reshape_out = reshape_tensor_to_act(self, recurrent_conv_out->t);
+
+final:
return reshape_out;
} /* create_recurrent_conv() */
@@ -303,6 +310,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -313,6 +322,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -325,6 +337,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -344,6 +358,7 @@ static vsi_bool op_setup
vsi_nn_internal_tensor_t * reshape_h_out = NULL;
vsi_nn_internal_tensor_t * reshape_c_out = NULL;
vsi_nn_conv2d_lstm_cell_param * p = &self->nn_param.conv2d_lstm_cell;
+ vsi_bool ret = FALSE;
vsi_nn_internal_init_node_wksp( self );
@@ -359,6 +374,7 @@ static vsi_bool op_setup
inputs[CONV2D_LSTM_CELL_IN_KERNEL_I2I + i],
inputs[CONV2D_LSTM_CELL_IN_BIAS_I + i]
);
+ CHECK_PTR_FAIL_GOTO(input_conv_outputs[i], "Create internal tensor failed", final);
}
/* create recurrent convolution */
@@ -369,10 +385,12 @@ static vsi_bool op_setup
inputs[CONV2D_LSTM_CELL_IN_H_STATE],
inputs[CONV2D_LSTM_CELL_IN_KERNEL_R2I + i]
);
+ CHECK_PTR_FAIL_GOTO(recurrent_conv_outputs[i], "Create internal tensor failed", final);
}
/* activations */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_ACTIVATION, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.lstmunit_activation.cell_clip = 0;
curr->node->nn_param.lstmunit_activation.proj_clip = 0;
curr->node->nn_param.lstmunit_activation.forget_bias = 0;
@@ -384,6 +402,7 @@ static vsi_bool op_setup
curr->node->nn_param.lstmunit_activation.recurrent_activation = p->recurrent_activation;
reshape_cell_in = reshape_tensor_to_act(self, inputs[CONV2D_LSTM_CELL_IN_C_STATE]);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_cell_in, curr, "Create internal tensor failed", final);
curr->inputs[LSTMUNIT_ACT_CSTATE_IN] = reshape_cell_in->t;
for(i = 0; i < CONV2D_LSTM_CELL_GATE_NUM; i++)
{
@@ -392,15 +411,20 @@ static vsi_bool op_setup
curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + i] = recurrent_conv_outputs[i]->t;
}
reshape_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_out, curr, "Create internal tensor failed", final);
reshape_h_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_H_STATE]);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_h_out, curr, "Create internal tensor failed", final);
reshape_c_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_C_STATE]);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_c_out, curr, "Create internal tensor failed", final);
curr->outputs[LSTMUNIT_ACT_OUTPUT] = reshape_out->t;
curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = reshape_c_out->t;
curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = reshape_h_out->t;
vsi_nn_internal_setup_node(self, curr);
- return TRUE;
+ ret = TRUE;
+final:
+ return ret;
} /* op_setup() */
static vsi_status op_deinit
@@ -419,7 +443,7 @@ static vsi_status op_init
)
{
vsi_status status = VSI_SUCCESS;
-
+ VSI_UNREFERENCED(self);
return status;
} /* op_init() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c
index 1825e3b98..98217903a 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c
@@ -353,7 +353,7 @@ static vsi_status op_init
//self->nn_param.conv3d.local = \
// (conv3d_local_data_t*)malloc(sizeof(conv3d_local_data_t));
*/
-
+ VSI_UNREFERENCED(self);
return VSI_SUCCESS;
} /* op_init() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c
index 6aaa61d5c..ed26a68f0 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c
@@ -36,6 +36,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
@@ -47,6 +48,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -72,6 +75,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
}
@@ -86,6 +91,7 @@ static vsi_bool op_setup
int32_t i = 0;
uint32_t j = 0;
vsi_nn_internal_node_t* curr = NULL;
+ vsi_bool ret = FALSE;
vsi_nn_internal_init_node_wksp( self );
p = (vsi_nn_crop_param *)&(self->nn_param.crop);
@@ -96,46 +102,43 @@ static vsi_bool op_setup
return FALSE;
}
- if ( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num )
- {
- goto final;
- }
-
- if (p->dims + p->axis == inputs[0]->attr.dim_num)
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
- for (i = 0; i < p->axis; i++)
- {
- outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
- }
- for (i = p->axis; i < (int32_t)inputs[0]->attr.dim_num; i++)
+ if (p->dims + p->axis == inputs[0]->attr.dim_num)
{
- outputs[0]->attr.size[i] = inputs[1]->attr.size[i];
- }
- outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
- }
- else
- {
- if (p->dims == 1)
- {
- for (i = 0; i <= p->axis; i++)
+ for (i = 0; i < p->axis; i++)
{
- outputs[0]->attr.size[i] = inputs[1]->attr.size[i];
- p->offset[i] = p->offset[0];
+ outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
}
- for (i = p->axis + 1; i < (int32_t)inputs[0]->attr.dim_num; i++)
+ for (i = p->axis; i < (int32_t)inputs[0]->attr.dim_num; i++)
{
- outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
+ outputs[0]->attr.size[i] = inputs[1]->attr.size[i];
}
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
}
else
{
- VSILOGE("Invalid parameter: offset dims!\n");
- return FALSE;
+ if (p->dims == 1)
+ {
+ for (i = 0; i <= p->axis; i++)
+ {
+ outputs[0]->attr.size[i] = inputs[1]->attr.size[i];
+ p->offset[i] = p->offset[0];
+ }
+ for (i = p->axis + 1; i < (int32_t)inputs[0]->attr.dim_num; i++)
+ {
+ outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
+ }
+ outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
+ }
+ else
+ {
+ VSILOGE("Invalid parameter: offset dims!\n");
+ return FALSE;
+ }
}
}
-final:
for (j = 0; j < self->nn_param.crop.dims; j++)
{
p->lcl_data->begin_dims[j] = (int32_t)self->nn_param.crop.offset[j];
@@ -151,6 +154,7 @@ static vsi_bool op_setup
}
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims;
curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num;
curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims;
@@ -163,9 +167,10 @@ static vsi_bool op_setup
curr->node->nn_param.strided_slice.new_axis_mask = 0;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
- return TRUE;
+final:
+ return ret;
} /* op_setup() */
static vsi_status op_init
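
The crop rewrite above is mostly a control-flow inversion: the old early `goto final` that skipped shape inference (with `final:` doubling as the resume point before the begin/end-dims loop) becomes a positive `if (VSI_NN_DIM_AUTO == ...)` guard, freeing `final:` to serve purely as the error exit for the new node-creation check. A small stand-in sketch of that restructuring:

    /* Sketch only: names and shapes are simplified stand-ins. */
    #include <stdio.h>

    #define DIM_AUTO 0

    static int op_setup(int out_dim_num, int node_ok)
    {
        int ret = 0;

        if (DIM_AUTO == out_dim_num) {
            /* infer output shape only when the caller did not provide one */
            out_dim_num = 4;
        }

        /* slice parameters, computed on every path as before */
        int begin = 1, end = 3;

        if (!node_ok) goto final;       /* error path: node creation failed */
        ret = 1;
    final:
        printf("dims=%d slice=[%d,%d) ok=%d\n", out_dim_num, begin, end, ret);
        return ret;
    }

    int main(void)
    {
        op_setup(DIM_AUTO, 1);
        op_setup(4, 0);
        return 0;
    }
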
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c
index d976b13b8..43f8a8f43 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c
@@ -136,6 +136,8 @@ static vsi_bool op_setup
/* TODO: Add code to comput outputs' shape. */
uint32_t i = 0;
+ VSI_UNREFERENCED(self);
+
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c
index d1a778528..6d109f00b 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c
@@ -70,6 +70,8 @@ static vsi_bool _is_same_quant
{
vsi_nn_dtype_t *dtype,*_dtype;
+ VSI_UNREFERENCED(self);
+
dtype = &inputs[0]->attr.dtype;
_dtype = &outputs[0]->attr.dtype;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c
index 7048f5173..ba3a3c511 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c
@@ -37,6 +37,7 @@
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
typedef struct _deconv3d_local_data_t {
int32_t placeholder;
@@ -135,7 +136,9 @@ void _rotate_weight_data(
int32_t item_size = vsi_nn_TypeGetBytes(weights->attr.dtype.vx_type);
weight_data = vsi_nn_ConvertTensorToData(graph, weights);
+ CHECK_PTR_FAIL_GOTO( weight_data, "Create weight_data fail.", final );
buffer = (uint8_t*)malloc(item_size * depth_size * weight_ic * weight_oc);
+ CHECK_PTR_FAIL_GOTO( buffer, "Create buffer fail.", final );
memset(buffer, 0x00, item_size * depth_size * weight_ic * weight_oc);
//memcpy(buffer, weight_data, item_size * slice_size * weight_ic * weight_oc);
for(oc = 0; oc < weight_oc; oc++)
@@ -164,6 +167,8 @@ void _rotate_weight_data(
}
vsi_nn_CopyDataToTensor( graph, weights, buffer );
+
+final:
vsi_nn_Free( buffer );
vsi_nn_safe_free( weight_data );
}
@@ -263,7 +268,7 @@ static vsi_status op_init
//self->nn_param.deconv3d.local = \
// (deconv3d_local_data_t*)malloc(sizeof(deconv3d_local_data_t));
*/
-
+ VSI_UNREFERENCED(self);
return VSI_SUCCESS;
} /* op_init() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c
index 09c59d81d..be301ea20 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c
@@ -36,6 +36,183 @@
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_constraint_check.h"
+#define LOCAL() (local)
+
+typedef struct _vsi_nn_grouped_deconv2d_param_local_data {
+ vsi_nn_tensor_t ** input_tensor_group;
+ vsi_nn_tensor_t ** weight_tensor_group;
+ vsi_nn_tensor_t ** bias_tensor_group;
+ vsi_nn_tensor_t ** output_tensor_group;
+} vsi_nn_grouped_deconv2d_param_local_data;
+
+static vsi_status op_grouped_compute
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t * inputs[3],
+ vsi_nn_tensor_t ** outputs,
+ vx_nn_deconvolution_params_ext2_t param
+ )
+{
+ vsi_bool res;
+ uint32_t i;
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_deconv_param *nn_param = &self->nn_param.deconv;
+ uint32_t group = nn_param->group;
+ vsi_nn_grouped_deconv2d_param_local_data *local =
+ (vsi_nn_grouped_deconv2d_param_local_data*)malloc(sizeof(vsi_nn_grouped_deconv2d_param_local_data));
+ if (NULL == local)
+ {
+ VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__);
+ goto final;
+ }
+ memset(local, 0, sizeof(vsi_nn_grouped_deconv2d_param_local_data));
+ /* TODO */
+ LOCAL()->input_tensor_group = (vsi_nn_tensor_t **)malloc(
+ group * sizeof(vsi_nn_tensor_t *));
+ if (NULL == LOCAL()->input_tensor_group)
+ {
+ VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__);
+ goto final;
+ }
+ memset(LOCAL()->input_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *));
+ res = vsi_nn_CreateTensorGroup(self->graph, inputs[0], 2,
+ LOCAL()->input_tensor_group, group);
+ if (res == FALSE)
+ {
+ VSILOGE("CreateTensorGroup fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__);
+ goto final;
+ }
+
+ LOCAL()->weight_tensor_group = (vsi_nn_tensor_t **)malloc(
+ group * sizeof(vsi_nn_tensor_t *));
+ if (NULL == LOCAL()->weight_tensor_group)
+ {
+ VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__);
+ goto final;
+ }
+ memset(LOCAL()->weight_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *));
+ res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 2,
+ LOCAL()->weight_tensor_group, group);
+ if (res == FALSE)
+ {
+ VSILOGE("CreateTensorGroup fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__);
+ goto final;
+ }
+
+ LOCAL()->bias_tensor_group = (vsi_nn_tensor_t **)malloc(
+ group * sizeof(vsi_nn_tensor_t *));
+ if (NULL == LOCAL()->bias_tensor_group)
+ {
+ VSILOGE("Malloc fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__);
+ goto final;
+ }
+ memset(LOCAL()->bias_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *));
+ if (inputs[2] != NULL)
+ {
+ res = vsi_nn_CreateTensorGroup(self->graph, inputs[2], 0,
+ LOCAL()->bias_tensor_group, group);
+ if (res == FALSE)
+ {
+ VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__);
+ goto final;
+ }
+ }
+
+ LOCAL()->output_tensor_group = (vsi_nn_tensor_t **)malloc(
+ group * sizeof(vsi_nn_tensor_t *));
+ if (NULL == LOCAL()->output_tensor_group)
+ {
+ VSILOGE("Malloc fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__);
+ goto final;
+ }
+ memset(LOCAL()->output_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *));
+ res = vsi_nn_CreateTensorGroup(self->graph, outputs[0], 2,
+ LOCAL()->output_tensor_group, group);
+ if (res == FALSE)
+ {
+ VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__);
+ goto final;
+ }
+
+ param.ext.channel_group = 1;
+ for (i = 0; i < group; i++)
+ {
+ vx_tensor bias;
+
+ if ( inputs[2] == NULL )
+ {
+ bias = NULL;
+ }
+ else
+ {
+ bias = LOCAL()->bias_tensor_group[i]->t;
+ }
+
+ self->n = vxDeconvolutionLayer(
+ self->graph->g,
+ LOCAL()->input_tensor_group[i]->t,
+ LOCAL()->weight_tensor_group[i]->t,
+ bias,
+ (vx_nn_deconvolution_params_t *)&param,
+ sizeof( vx_nn_deconvolution_params_ext2_t ),
+ LOCAL()->output_tensor_group[i]->t
+ );
+ if ( NULL == self->n )
+ {
+ VSILOGE("Add vxConvolutionLayer fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__);
+ status = VSI_FAILURE;
+ goto final;
+ }
+ else
+ {
+ // no need to maintain self->n
+ vxReleaseNode( &self->n );
+ status = VSI_SUCCESS;
+ self->n = NULL;
+ }
+ }
+
+final:
+ if (LOCAL())
+ {
+ if (LOCAL()->input_tensor_group)
+ {
+ for (i = 0; i < group; i++)
+ {
+ vsi_safe_release_tensor((LOCAL()->input_tensor_group[i]));
+ }
+ vsi_nn_safe_free(LOCAL()->input_tensor_group);
+ }
+ if (LOCAL()->weight_tensor_group)
+ {
+ for (i = 0; i < group; i++)
+ {
+ vsi_safe_release_tensor((LOCAL()->weight_tensor_group[i]));
+ }
+ vsi_nn_safe_free(LOCAL()->weight_tensor_group);
+ }
+ if (LOCAL()->bias_tensor_group != NULL)
+ {
+ for (i = 0; i < group; i++)
+ {
+ vsi_safe_release_tensor((LOCAL()->bias_tensor_group[i]));
+ }
+ vsi_nn_safe_free(LOCAL()->bias_tensor_group);
+ }
+ if (LOCAL()->output_tensor_group != NULL)
+ {
+ for (i = 0; i < group; i++)
+ {
+ vsi_safe_release_tensor((LOCAL()->output_tensor_group[i]));
+ }
+ vsi_nn_safe_free(LOCAL()->output_tensor_group);
+ }
+
+ vsi_nn_safe_free(LOCAL());
+ }
+ return status;
+} /* op_compute() */
+
#define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\
(( in - 1 ) * stride + ksize - pad_1 - pad_2 + output_padding)
static vsi_status op_compute
@@ -161,18 +338,31 @@ static vsi_status op_compute
//param.border_mode;
//param.border_const;
- self->n = vxDeconvolutionLayer(
- self->graph->g,
- inputs[0]->t,
- weight_tensor->t,
- (NULL == inputs[2]) ? NULL : inputs[2]->t,
- (vx_nn_deconvolution_params_t *)&param,
- sizeof( vx_nn_deconvolution_params_ext2_t ),
- outputs[0]->t
- );
- if( NULL != self->n )
+ if (self->nn_param.deconv.group > 1 &&
+ self->nn_param.deconv.group < inputs[0]->attr.size[2])
{
- status = VSI_SUCCESS;
+ vsi_nn_tensor_t *inputs_tensors[3] = {NULL};
+
+ inputs_tensors[0] = inputs[0];
+ inputs_tensors[1] = weight_tensor;
+ inputs_tensors[2] = inputs[2];
+ status = op_grouped_compute(self, inputs_tensors, outputs, param );
+ }
+ else
+ {
+ self->n = vxDeconvolutionLayer(
+ self->graph->g,
+ inputs[0]->t,
+ weight_tensor->t,
+ (NULL == inputs[2]) ? NULL : inputs[2]->t,
+ (vx_nn_deconvolution_params_t *)&param,
+ sizeof( vx_nn_deconvolution_params_ext2_t ),
+ outputs[0]->t
+ );
+ if ( NULL != self->n )
+ {
+ status = VSI_SUCCESS;
+ }
}
final:
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c
index 4128480bf..1180dbee9 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c
@@ -64,7 +64,9 @@ static vsi_status op_compute
weight_attr.size[2] = weight_attr.size[1];
weight_attr.size[1] = 1;
weight_attr.dim_num = 4;
- if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
+ if (inputs[1]->attr.dtype.qnt_type !=
+ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC &&
+ inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8)
{
weight_tensor = vsi_nn_reshape_tensor( self->graph, inputs[1], weight_attr.size, 4 );
CHECK_PTR_FAIL_GOTO( weight_tensor, "create tensor fail.", final );
@@ -118,6 +120,7 @@ static vsi_status op_compute
attr.size[2] = weight_tensor->attr.size[3];
attr.size[3] = weight_tensor->attr.size[2];
permute_tensor = vsi_nn_CreateTensor(self->graph, &attr);
+ CHECK_PTR_FAIL_GOTO( permute_tensor, "Create tensor fail.", final );
self->n = vxTensorPermuteNode( self->graph->g, weight_tensor->t,
permute_tensor->t, perm_array, 4);
if ( NULL == self->n )
@@ -135,6 +138,7 @@ static vsi_status op_compute
memset(&attr_reverse, 0, sizeof(vsi_nn_tensor_attr_t));
memcpy(&attr_reverse, &tmp_in_tensor->attr, sizeof(vsi_nn_tensor_attr_t) );
reverse_tensor = vsi_nn_CreateTensor(self->graph, &attr_reverse);
+ CHECK_PTR_FAIL_GOTO( reverse_tensor, "Create tensor fail.", final );
para.axis = axis_reverse;
para.numberOfAxis = 2;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c
index 551aa59ea..cee8b8c7c 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c
@@ -36,6 +36,7 @@
#include "utils/vsi_nn_util.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_math.h"
+#include "vsi_nn_error.h"
static vsi_status vsi_nn_depth2space_compute
(
@@ -46,29 +47,38 @@ static vsi_status vsi_nn_depth2space_compute
{
vsi_status status;
vsi_nn_tensor_t *block_size_tensor = NULL;
- vx_nn_reorg_params_t param;
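+ /* With CRD-mode support the extended reorg params (which carry the depth2space mode) are passed; otherwise the base struct is used */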
+#if (VX_DEPTH2SPACE_CRD_MODE_SUPPORT)
+ vx_nn_reorg_params_ext3_t paramExt;
+ vx_nn_reorg_params_t *param = (vx_nn_reorg_params_t *)&paramExt.base.base;
+ size_t size = sizeof(vx_nn_reorg_params_ext3_t);
+ paramExt.mode = self->nn_param.depth2space.mode;
+#else
+ vx_nn_reorg_params_t base;
+ vx_nn_reorg_params_t *param = &base;
+ size_t size = sizeof(vx_nn_reorg_params_t);
+ memset(param, 0, sizeof(vx_nn_reorg_params_t));
+#endif
status = VSI_FAILURE;
- memset(&param, 0, sizeof(vx_nn_reorg_params_t));
block_size_tensor = vsi_nn_VariableToTensor(self,
(uint8_t *)&self->nn_param.depth2space.block_size,
VSI_NN_TYPE_INT32);
- if( NULL == block_size_tensor )
+ if ( NULL == block_size_tensor )
{
VSILOGE("Create block_size_tensor fail.(depth2space)");
return VSI_FAILURE;
}
self->nn_param.depth2space.local.block_size_tensor = block_size_tensor;
- param.block_size = REQUIRED_IO(block_size_tensor);
- param.type = VX_REORG_DEPTH_TO_SPACE;
+ param->block_size = REQUIRED_IO(block_size_tensor);
+ param->type = VX_REORG_DEPTH_TO_SPACE;
self->n = vxReorgLayer2( self->graph->g,
inputs[0]->t,
- &param,
- sizeof(vx_nn_reorg_params_t),
+ param,
+ size,
outputs[0]->t);
- if( NULL != self->n )
+ if ( NULL != self->n )
{
status = VSI_SUCCESS;
}
@@ -84,6 +94,13 @@ static vsi_status op_compute
{
vsi_status status = VSI_FAILURE;
+#if (VX_DEPTH2SPACE_CRD_MODE_SUPPORT)
+ if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_DCR ||
+ self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD)
+ {
+ status = vsi_nn_depth2space_compute(self, inputs, outputs);
+ }
+#else
if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_DCR)
{
status = vsi_nn_depth2space_compute(self, inputs, outputs);
@@ -92,6 +109,7 @@ static vsi_status op_compute
{
status = vsi_nn_internal_compute_node( self );
}
+#endif
else
{
VSILOGE("Unknown depth2space mode.(depth2space)");
@@ -101,24 +119,6 @@ static vsi_status op_compute
return status;
} /* op_compute() */
-static vsi_status op_optimize
- (
- vsi_nn_node_t * self,
- vsi_nn_tensor_t ** inputs,
- vsi_nn_tensor_t ** outputs,
- vsi_nn_opt_direction_e direction
- )
-{
- if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD)
- {
- return vsi_nn_internal_optimize_node(self, direction );
- }
- else
- {
- return VSI_SUCCESS;
- }
-} /* op_optimize() */
-
static vsi_bool op_check
(
vsi_nn_node_t * self,
@@ -139,6 +139,7 @@ static vsi_bool op_check
return ret;
} /* op_check() */
+#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT)
static void op_set_depth2space_param_value(vsi_nn_nn_param_t *nn_param,
vsi_nn_op_t type_name,
vsi_nn_depth2space_mode_e mode,
@@ -160,20 +161,23 @@ static vsi_bool op_set_depth2space_internal
vsi_nn_op_t type_name
)
{
- vsi_bool retn = TRUE;
+ vsi_bool retn = FALSE;
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_internal_init_node_wksp( self );
curr = vsi_nn_internal_new_node( self, type_name, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
op_set_depth2space_param_value(&(curr->node->nn_param), type_name,
self->nn_param.depth2space.mode, self->nn_param.depth2space.block_size);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
retn = vsi_nn_internal_setup_node(self, curr);
+final:
return retn;
}
+#endif
static vsi_status op_init
(
@@ -199,7 +203,7 @@ static vsi_bool op_setup
{
vsi_bool ret = TRUE;
uint32_t size = node->nn_param.depth2space.block_size;
- if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
outputs[0]->attr.size[0] = inputs[0]->attr.size[0] * size;
@@ -208,10 +212,12 @@ static vsi_bool op_setup
outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
}
+#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT)
if (node->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD)
{
ret = op_set_depth2space_internal(node, inputs, outputs, VSI_NN_OP_DEPTH2SPACE_INTERNAL);
}
+#endif
return ret;
} /* op_setup() */
@@ -225,11 +231,13 @@ static vsi_status op_deinit
vsi_nn_ReleaseTensor(&(self->nn_param.depth2space.local.block_size_tensor));
}
+#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT)
if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD)
{
vsi_nn_internal_deinit_node_wksp(self);
}
else
+#endif
{
vsi_nn_op_common_deinit(self);
}
@@ -249,7 +257,7 @@ DEF_OP_REG
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
- /* optimize */ op_optimize,
+ /* optimize */ NULL,
/* input_num */ 1,
/* output_num */ 1
);
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c
index fa5336755..1b417b168 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c
@@ -34,7 +34,7 @@
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
-#include "libnnext/vsi_nn_vxkernel.h"
+#include "vsi_nn_error.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_constraint_check.h"
@@ -48,6 +48,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -82,19 +84,21 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_bool ret = TRUE;
+ vsi_bool ret = FALSE;
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_internal_init_node_wksp(self);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LINEAR, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.linear.a = self->nn_param.dropout.ratio;
curr->node->nn_param.linear.b = 0;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
+final:
return ret;
} /* op_setup() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c
index 68c6993a0..280e5eee2 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c
@@ -120,6 +120,8 @@ static vsi_bool op_setup
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool ret = TRUE;
+ VSI_UNREFERENCED(self);
+
out_rank = inputs[0]->attr.dim_num;
for (i = 0; i < out_rank; i++)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c
index bcdf270f5..c1f2fc56e 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c
@@ -122,6 +122,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(node);
+
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[1]->attr.dim_num;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c
index 68c9fc257..d586d3141 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c
@@ -35,6 +35,7 @@
#include "utils/vsi_nn_dtype_util_prv.h"
#include "utils/vsi_nn_math.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
static vsi_status op_compute
(
@@ -43,6 +44,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
}
@@ -85,6 +88,7 @@ static vsi_bool op_check
IO_TYPE(D_BF16, D_F32)
IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP)
IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM)
+ IO_TYPE(D_BOOL8, D_BOOL8)
END_IO_TYPE_DECL(EXPAND_BROADCAST)
if (!VALIDATE_OP_IO_TYPES(EXPAND_BROADCAST, self, inputs, self->input.num, outputs, self->output.num))
{
@@ -109,9 +113,11 @@ static vsi_bool op_setup
vsi_nn_tensor_attr_t attr;
vsi_nn_internal_tensor_t* input_0 = NULL;
vsi_nn_internal_tensor_t *input_1 = NULL;
+ vsi_nn_internal_tensor_t* input_2 = NULL;
vsi_nn_internal_node_t* mul_node = NULL;
vsi_nn_tensor_t* mul_input = NULL;
int32_t use_virtual_tensor = 1;
+ vsi_bool is_same_shape = TRUE;
vsi_nn_expand_broadcast_param *p = &self->nn_param.expand_broadcast;
vsi_nn_internal_init_node_wksp(self);
@@ -120,33 +126,55 @@ static vsi_bool op_setup
attr.dim_num = p->dim_num;
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE &&
(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
- inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16)) {
+ inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16))
+ {
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
}
- else {
+ else if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BOOL8)
+ {
+ attr.dtype.vx_type = VSI_NN_TYPE_BOOL8;
+ }
+ else
+ {
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;
}
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
attr.is_const = TRUE;
- for(i = 0; i < p->dim_num; i++)
+ for (i = 0; i < p->dim_num; i++)
{
+ vsi_size_t sz = i < inputs[0]->attr.dim_num ?
+ inputs[0]->attr.size[i] : 1;
+
attr.size[i] = p->shape[i];
+ if (( p->shape[i] != sz && p->shape[i] != 1)
+ && is_same_shape)
+ {
+ is_same_shape = FALSE;
+ }
}
input_1 = vsi_nn_internal_new_tensor( self, &attr, 1.0f );
+ CHECK_PTR_FAIL_GOTO(input_1, "Create tensor failed", final);
- if (p->dimensions_num > 0) {
+ if (p->dimensions_num > 0)
+ {
vsi_nn_internal_node_t* reshape_node = NULL;
vsi_size_t* reshape_input_size = NULL;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor);
input_0 = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(input_0, "Create internal tensor failed", final);
reshape_node = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(reshape_node, "Create internal node failed", final);
reshape_input_size = (vsi_size_t*)vsi_nn_internal_new_node_param(reshape_node,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
- for(i = 0; i < p->dim_num; i++) {
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_input_size, reshape_node,
+ "Create internal buffer failed", final);
+ for (i = 0; i < p->dim_num; i++)
+ {
reshape_input_size[i] = 1;
}
- for (i = 0; i < p->dimensions_num; i++) {
+ for (i = 0; i < p->dimensions_num; i++)
+ {
reshape_input_size[p->dimensions[i]] = p->shape[p->dimensions[i]];
}
@@ -156,20 +184,74 @@ static vsi_bool op_setup
reshape_node->outputs[0] = input_0->t;
vsi_nn_internal_setup_node( self, reshape_node );
mul_input = input_0->t;
- } else {
+ }
+ else
+ {
mul_input = inputs[0];
}
- mul_node = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0 );
- mul_node->inputs[0] = mul_input;
- mul_node->inputs[1] = input_1->t;
- mul_node->node->nn_param.multiply.scale = 1.0f;
- mul_node->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
- mul_node->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN;
- mul_node->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, mul_node);
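+ /* When the target shape adds no real broadcast, skip the multiply-by-ones: reshape to the target rank if needed and emit a data convert to the output */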
+ if (is_same_shape)
+ {
+ vsi_nn_internal_node_t* curr = NULL;
+ vsi_nn_tensor_t* temp_tensor = NULL;
+
+ if (input_1->t->attr.dim_num != mul_input->attr.dim_num)
+ {
+ vsi_size_t* shape_sizes = NULL;
+ uint32_t rank0 = input_1->t->attr.dim_num;
+ uint32_t rank1 = mul_input->attr.dim_num;
+ uint32_t rank = vsi_nn_max( rank0, rank1 );
+
+ memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
+ vsi_nn_internal_init_tensor_attr(&attr, &mul_input->attr.dtype, use_virtual_tensor);
+ input_2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(input_2, "Create internal tensor failed", final);
+
+ curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+ shape_sizes = (vsi_size_t*)vsi_nn_internal_new_node_param(curr,
+ VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(shape_sizes, curr,
+ "Create internal buffer failed", final);
+ for (i = 0; i < rank; i++)
+ {
+ shape_sizes[i] = i < rank1 ? mul_input->attr.size[i] : 1;
+ }
+ curr->node->nn_param.reshape2.size = shape_sizes;
+ curr->node->nn_param.reshape2.dim_num = rank;
+ curr->inputs[0] = mul_input;
+ curr->outputs[0] = input_2->t;
+ vsi_nn_internal_setup_node( self, curr );
+
+ temp_tensor = input_2->t;
+ }
+ else
+ {
+ temp_tensor = mul_input;
+ }
+
+ curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+ curr->inputs[0] = temp_tensor;
+ curr->outputs[0] = outputs[0];
+ vsi_nn_internal_setup_node(self, curr);
+ }
+ else
+ {
+ mul_node = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(mul_node, "Create internal node failed", final);
+ mul_node->inputs[0] = mul_input;
+ mul_node->inputs[1] = input_1->t;
+ mul_node->node->nn_param.multiply.scale = 1.0f;
+ mul_node->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
+ mul_node->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN;
+ mul_node->outputs[0] = outputs[0];
+ vsi_nn_internal_setup_node(self, mul_node);
+ }
return TRUE;
+final:
+ return FALSE;
}
static vsi_status op_deinit
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c
index 23be09a06..958b06b10 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c
@@ -123,6 +123,9 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/* TODO: Add code to compute outputs' shape. */
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c
index 92b13378c..4a803ad6e 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c
@@ -171,6 +171,8 @@ static vsi_bool op_setup
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool ret = TRUE;
+ VSI_UNREFERENCED(self);
+
in1_rank = inputs[0]->attr.dim_num;
in2_rank = inputs[1]->attr.dim_num;
out_rank = vsi_nn_max( in1_rank, in2_rank );
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c
index 1f3f281c2..489d3cb96 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c
@@ -34,6 +34,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_tensor_util_prv.h"
#define _ARG_NUM (1)
#define _INPUT_NUM (2)
@@ -80,7 +81,31 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_int32( param, "axis", (int32_t)axis );
vsi_nn_kernel_param_add_int32( param, "indices_num", (int32_t)indices_num );
vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims );
- n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param );
+
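+ /* gather does not rescale values: when input and output share a data type but differ in quantization, gather into a temp tensor with the input's dtype and copy it to the output */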
+ if (vsi_nn_is_same_data_type(inputs[0], outputs[0]) == FALSE ||
+ vsi_nn_is_same_quant_type(inputs[0], outputs[0]))
+ {
+ n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param );
+ }
+ else
+ {
+ vsi_nn_tensor_attr_t attr;
+ vsi_nn_tensor_t* temp_tensors = NULL;
+
+ VSILOGW("gather is no_range_change operation! \
+ Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!");
+
+ memcpy( &attr, &outputs[0]->attr, sizeof(attr));
+ memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype));
+ attr.is_const = FALSE;
+ attr.vtl = TRUE;
+ temp_tensors = vsi_nn_CreateTensor( self->graph, &attr );
+
+ vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, &temp_tensors, 1, param );
+ n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t);
+
+ vsi_safe_release_tensor(temp_tensors);
+ }
if ( n != NULL )
{
self->n = (vx_node)n;
@@ -187,7 +212,7 @@ static vsi_bool op_setup
outputs[0]->attr.size[j] = inputs[0]->attr.size[i];
j++;
}
- for (i = 0; i < inputs[1]->attr.dim_num; i++)
+ for (i = 0; i < q_rank; i++)
{
outputs[0]->attr.size[j] = inputs[1]->attr.size[i];
j++;
@@ -198,8 +223,8 @@ static vsi_bool op_setup
j++;
}
}
-
}
+
return TRUE;
} /* op_setup() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c
index baf55b1dc..b77a39db3 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c
@@ -58,6 +58,7 @@ static vsi_status op_compute
{
vsi_status status = VSI_FAILURE;
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
+ vsi_nn_tensor_t* temp_tensors = NULL;
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
uint32_t rank_in = 0;
int32_t axis = 0;
@@ -66,6 +67,8 @@ static vsi_status op_compute
vsi_bool ret = FALSE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_gather_elements_param * p = NULL;
+ vsi_size_t depth0 = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
+ vsi_size_t depth1 = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1;
if ( NULL == self )
{
@@ -86,7 +89,31 @@ static vsi_status op_compute
// Add params
param = vsi_nn_kernel_param_create();
- if ( ret && new_axis0 == new_axis1 )
+ if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE)
+ {
+ vsi_nn_tensor_attr_t attr;
+
+ VSILOGW("gather_element is no_range_change operation! \
+ Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!");
+
+ memcpy( &attr, &outputs[0]->attr, sizeof(attr));
+ memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype));
+ attr.is_const = FALSE;
+ attr.vtl = TRUE;
+ temp_tensors = vsi_nn_CreateTensor( self->graph, &attr );
+ }
+ else
+ {
+ temp_tensors = outputs[0];
+ }
+
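+ /* The reshaped fast path is only taken when every dimension fits within GPU_TENSOR_MAX_WIDTH */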
+ if ( ret && new_axis0 == new_axis1 &&
+ inputs[0]->attr.size[0] < GPU_TENSOR_MAX_WIDTH &&
+ inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH &&
+ inputs[1]->attr.size[0] < GPU_TENSOR_MAX_WIDTH &&
+ inputs[1]->attr.size[1] < GPU_TENSOR_MAX_WIDTH &&
+ depth0 < GPU_TENSOR_MAX_WIDTH &&
+ depth1 < GPU_TENSOR_MAX_WIDTH)
{
vsi_nn_kernel_param_add_int32( param, "axis", new_axis0 );
@@ -95,7 +122,7 @@ static vsi_status op_compute
reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
inputs[1], shapes[1], rank_in );
reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph,
- outputs[0], shapes[1], rank_in );
+ temp_tensors, shapes[1], rank_in );
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"gather_elements",
@@ -112,7 +139,13 @@ static vsi_status op_compute
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"gather_elements",
inputs, 2,
- outputs, 1, param );
+ &temp_tensors, 1, param );
+ }
+
+ if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE)
+ {
+ self->n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t);
+ vsi_safe_release_tensor(temp_tensors);
}
vsi_nn_kernel_param_release( &param );
@@ -164,6 +197,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
uint32_t i = 0;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c
index 4246ee6aa..26d47dd7e 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c
@@ -30,10 +30,11 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
-#include "vsi_nn_error.h"
+#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_tensor_util_prv.h"
#define _ARG_NUM (2)
#define _INPUT_NUM (2)
@@ -50,19 +51,20 @@ static vsi_status op_compute
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_kernel_node_t n = NULL;
vsi_size_t i = 0;
- int32_t batch_dims = self->nn_param.gather_nd.batch_dims == 0 ? 0 : 1;
+ int32_t batch_dims = self->nn_param.gather_nd.batch_dims;
vsi_size_t block_size = 1, coord_dim = 1;
vsi_size_t *input_size = inputs[0]->attr.size;
vsi_size_t dims_num = inputs[0]->attr.dim_num;
+ batch_dims = batch_dims < 0 ? 0 : batch_dims;
+
if (inputs[1]->attr.dim_num > 1)
{
coord_dim = inputs[1]->attr.size[0];
}
if (coord_dim > 4 || (coord_dim > 3 && input_size[dims_num - 1] != 1)
- || (batch_dims && coord_dim >= 3))
+ || (batch_dims && coord_dim >= 3) || (batch_dims >= (int32_t)vsi_nn_min(dims_num, inputs[1]->attr.dim_num)))
{
- CHECK_STATUS(status);
return status;
}
@@ -76,7 +78,32 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_int32( param, "block_size", (int32_t)block_size );
vsi_nn_kernel_param_add_int32( param, "coord_dim", (int32_t)coord_dim );
vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims );
- n = vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, outputs, 1, param );
+
+ if (vsi_nn_is_same_data_type(inputs[0], outputs[0]) == FALSE ||
+ vsi_nn_is_same_quant_type(inputs[0], outputs[0]))
+ {
+ n = vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, outputs, 1, param );
+ }
+ else
+ {
+ vsi_nn_tensor_attr_t attr;
+ vsi_nn_tensor_t* temp_tensors = NULL;
+
+ VSILOGW("gather_nd is no_range_change operation! \
+ Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!");
+
+ memcpy( &attr, &outputs[0]->attr, sizeof(attr));
+ memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype));
+ attr.is_const = FALSE;
+ attr.vtl = TRUE;
+ temp_tensors = vsi_nn_CreateTensor( self->graph, &attr );
+
+ vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, &temp_tensors, 1, param );
+ n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t);
+
+ vsi_safe_release_tensor(temp_tensors);
+ }
+
if ( n != NULL )
{
self->n = (vx_node)n;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c
index 77feaafe3..09e96a1f0 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c
@@ -78,6 +78,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c
index de9059ecf..cc6463f63 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c
@@ -155,6 +155,8 @@ static vsi_bool op_setup
{
vsi_size_t i = 0;
+ VSI_UNREFERENCED(self);
+
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c
index d8c99aa89..86f15f81d 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c
@@ -51,7 +51,38 @@ static vsi_status op_compute
{
vsi_status status = VSI_FAILURE;
- status = vsi_nn_internal_compute_node(self);
+ vsi_nn_kernel_param_t* param = NULL;
+ int32_t align_corners = self->nn_param.gridsample.align_corners;
+ vsi_nn_kernel_node_t n;
+ char kernel_name[128];
+
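+ /* Dispatch straight to the bilinear/nearest grid-sample kernels instead of building an internal node in op_setup */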
+ param = vsi_nn_kernel_param_create();
+ vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners);
+
+ switch (self->nn_param.gridsample.mode) {
+ case VSI_NN_INTERPOLATION_BILINEAR:
+ snprintf(kernel_name, sizeof(kernel_name), "bilinear_grid_sample");
+ break;
+ case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR:
+ snprintf(kernel_name, sizeof(kernel_name), "nearest_grid_sample");
+ break;
+ default:
+ break;
+ }
+
+ n = (vx_node)vsi_nn_kernel_selector(
+ self->graph, kernel_name, inputs, 2, outputs, 1, param);
+
+ if (n == NULL) {
+ vsi_nn_kernel_param_release(&param);
+ status = VSI_FAILURE;
+ return status;
+ }
+ self->n = (vx_node)n;
+ vsi_nn_kernel_param_release(&param);
+ if (self->n) {
+ status = VSI_SUCCESS;
+ }
return status;
} /* op_compute() */
@@ -63,8 +94,12 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
- if (VSI_NN_INTERPOLATION_BILINEAR != self->nn_param.gridsample.mode) {
- VSILOGE("Only support bilinear_grid_sample now!");
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+ if ((VSI_NN_INTERPOLATION_BILINEAR != self->nn_param.gridsample.mode) &&
+ (VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR !=
+ self->nn_param.gridsample.mode)) {
+ VSILOGE("Only support bilinear or nearest grid sample mode now!");
return FALSE;
}
@@ -85,8 +120,6 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_nn_internal_node_t* curr = NULL;
-
if (NULL == self) {
return FALSE;
}
@@ -101,22 +134,6 @@ static vsi_bool op_setup
}
}
- if (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.gridsample.mode) {
- vsi_nn_internal_init_node_wksp(self);
- curr = vsi_nn_internal_new_node(
- self, VSI_NN_OP_BILINEAR_GRID_SAMPLE, 2, 1);
- curr->node->nn_param.bilinear_grid_sample.align_corners =
- self->nn_param.gridsample.align_corners;
- curr->node->nn_param.bilinear_grid_sample.padding_mode =
- self->nn_param.gridsample.padding_mode;
- curr->node->nn_param.bilinear_grid_sample.const_val =
- self->nn_param.gridsample.const_val;
- curr->inputs[0] = inputs[0];
- curr->inputs[1] = inputs[1];
- curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
- }
-
return TRUE;
} /* op_setup() */
@@ -129,7 +146,7 @@ static vsi_status op_init
//self->nn_param.grid_sample.local = \
// (grid_sample_local_data_t*)malloc(sizeof(grid_sample_local_data_t));
*/
-
+ VSI_UNREFERENCED(self);
return VSI_SUCCESS;
} /* op_init() */
@@ -140,7 +157,7 @@ static vsi_status op_deinit
{
vsi_status status = VSI_SUCCESS;
- status = vsi_nn_internal_deinit_node_wksp(self);
+ status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c
index 5cfeddf58..a40497949 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c
@@ -77,6 +77,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -103,6 +105,7 @@ static vsi_bool op_setup
{
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_grouped_conv1d_param* p = &self->nn_param.grouped_conv1d;
+ vsi_bool ret = FALSE;
vsi_nn_internal_init_node_wksp(self);
@@ -125,7 +128,9 @@ static vsi_bool op_setup
p->local->input = _expand_tensor_dim( self->graph, inputs[0],
inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 );
- if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
+ if (inputs[1]->attr.dtype.qnt_type !=
+ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC &&
+ inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8)
{
p->local->weight = _expand_tensor_dim( self->graph, inputs[1],
inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 );
@@ -159,6 +164,7 @@ static vsi_bool op_setup
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_GROUPED_CONV2D, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = p->local->input;
curr->inputs[1] = p->local->weight;
curr->inputs[2] = inputs[2];
@@ -179,10 +185,10 @@ static vsi_bool op_setup
curr->node->nn_param.grouped_conv2d.pad_type = p->pad_type;
curr->node->nn_param.grouped_conv2d.pad_mode = p->pad_mode;
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
final:
- return TRUE;
+ return ret;
} /* op_setup() */
static vsi_status op_init
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c
index 00545d3c9..629486c69 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c
@@ -77,6 +77,7 @@ static vsi_bool _is_3d_group_norm
vsi_nn_tensor_t ** inputs
)
{
+ VSI_UNREFERENCED(self);
if ( 3 == inputs[0]->attr.dim_num )
{
return TRUE;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c
index ad4c2a741..24acf6f94 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c
@@ -39,13 +39,14 @@
#include "utils/vsi_nn_tensor_op.h"
#include "utils/vsi_nn_util.h"
#include "ops/vsi_nn_op_gru.h"
+#include "vsi_nn_error.h"
typedef struct _vsi_nn_gru_local
{
void * placeholder;
} vsi_nn_gru_local;
-static void create_state_tensor
+static vsi_status create_state_tensor
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
@@ -54,6 +55,7 @@ static void create_state_tensor
vsi_size_t hidden_size
)
{
+ vsi_status status = VSI_FAILURE;
vsi_nn_tensor_attr_t attr;
vsi_nn_internal_tensor_t * tensor = NULL;
@@ -67,6 +69,7 @@ static void create_state_tensor
attr.vtl = TRUE;
attr.is_const = FALSE;
tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tensor, "Create internal tensor failed", final);
outputs[GRU_OUT_H_STATE] = tensor->t;
}
@@ -80,9 +83,13 @@ static void create_state_tensor
attr.is_const = TRUE;
tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tensor, "Create internal tensor failed", final);
inputs[GRU_IN_H_STATE] = tensor->t;
}
+ status = VSI_SUCCESS;
+final:
+ return status;
} /* create_state_tensor() */
static vsi_bool setup_op_shapes
@@ -92,8 +99,10 @@ static vsi_bool setup_op_shapes
vsi_nn_tensor_t ** outputs
)
{
+ vsi_status status = VSI_FAILURE;
vsi_nn_gru_param * p = &self->nn_param.gru;
vsi_size_t batch_size = 0, hidden_size = 0, timesetp = 0;
+ vsi_bool ret = FALSE;
hidden_size = p->num_units;
if(p->time_major)
@@ -137,7 +146,8 @@ static vsi_bool setup_op_shapes
}
/* create hstate input/output if app doesn't provide them */
- create_state_tensor(self, inputs, outputs, batch_size, hidden_size);
+ status = create_state_tensor(self, inputs, outputs, batch_size, hidden_size);
+ CHECK_STATUS_FAIL_GOTO(status, final);
/* hstate output */
if(VSI_NN_DIM_AUTO == outputs[GRU_OUT_H_STATE]->attr.dim_num)
@@ -147,7 +157,9 @@ static vsi_bool setup_op_shapes
outputs[GRU_OUT_H_STATE]->attr.size[1] = batch_size;
}
- return TRUE;
+ ret = TRUE;
+final:
+ return ret;
} /* setup_op_shapes() */
static vsi_status op_compute
@@ -157,6 +169,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
}
@@ -167,6 +181,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
}
@@ -187,6 +204,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** gru_step_outputs = NULL;
vsi_nn_internal_tensor_t * tmp_tensor = NULL;
vsi_nn_tensor_attr_t attr;
+ vsi_bool ret = FALSE;
+ vsi_status status = VSI_FAILURE;
memset(&attr, 0, sizeof(attr));
vsi_nn_internal_init_node_wksp( self );
@@ -211,15 +230,19 @@ static vsi_bool op_setup
/* transpose to time_major */
tmp_tensor = vsi_nn_rnn_transpose_time_major(self,
inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
input_tensor = tmp_tensor->t;
}
split_outputs = (vsi_nn_tensor_t **)malloc(timestep * sizeof(vsi_nn_tensor_t *));
+ CHECK_PTR_FAIL_GOTO( split_outputs, "Create buffer fail.", final );
memset(split_outputs, 0, timestep * sizeof(vsi_nn_tensor_t *));
gru_step_outputs = (vsi_nn_tensor_t **)malloc(timestep * sizeof(vsi_nn_tensor_t *));
+ CHECK_PTR_FAIL_GOTO( gru_step_outputs, "Create buffer fail.", final );
memset(gru_step_outputs, 0, timestep * sizeof(vsi_nn_tensor_t *));
- vsi_nn_rnn_split_input_tensor(self, input_tensor, split_outputs, (uint32_t)timestep, use_virtual_tensor);
+ status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_outputs, (uint32_t)timestep, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
//vsi_nn_rnn_data_check_aligned(self, split_outputs, timestep, use_virtual_tensor); ??
@@ -233,6 +256,7 @@ static vsi_bool op_setup
/* reshape split_outputs to cell_input */
tmp_tensor = vsi_nn_rnn_reshape_split_output(
self, split_outputs[i], (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
reshape_output = tmp_tensor->t;
/* grucell output */
@@ -245,6 +269,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[GRU_OUT_OUTPUT]->attr.dtype, use_virtual_tensor);
tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
cell_out0 = tmp_tensor->t;
}
@@ -254,6 +279,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor);
tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
cell_out1 = tmp_tensor->t;
}
else
@@ -263,6 +289,7 @@ static vsi_bool op_setup
/* create a grucell */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.grucell.num_units = p->num_units;
curr->node->nn_param.grucell.activation = p->activation;
curr->node->nn_param.grucell.recurrent_activation = p->recurrent_activation;
@@ -292,6 +319,7 @@ static vsi_bool op_setup
/* reshape every step output to 3-dims for GRU_OUTPUT */
tmp_tensor = vsi_nn_rnn_reshape_cell_output(self,
cell_out0, (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
gru_step_outputs[i] = tmp_tensor->t;
}
} /* for(i = 0; i < timestep; i++) end */
@@ -305,11 +333,13 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
output_tensor = tmp_tensor->t;
}
/* concat all grucell output0, the reshaped grucell output shape: [hidden_size, batch, 1] */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, timestep, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 2; /* concat the cell_outs in timestep */
for( i = 0; i < timestep; i++ )
{
@@ -326,10 +356,12 @@ static vsi_bool op_setup
}
}
+ ret = TRUE;
+final:
vsi_nn_safe_free( split_outputs );
vsi_nn_safe_free( gru_step_outputs );
- return TRUE;
+ return ret;
}
static vsi_status op_deinit
@@ -350,6 +382,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
}
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c
index 5ac947b9f..9d7e34897 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c
@@ -93,6 +93,7 @@ static vsi_bool setup_op_shapes
attr.is_const = TRUE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
inputs[GRU_INPUT_H_STATE] = output_tensor->t;
}
@@ -103,6 +104,7 @@ static vsi_bool setup_op_shapes
memcpy( &attr.dtype, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) );
attr.vtl = TRUE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
outputs[GRU_OUTPUT_H_STATE] = output_tensor->t;
}
@@ -132,6 +134,8 @@ static vsi_bool setup_op_shapes
}
return TRUE;
+final:
+ return FALSE;
}
static vsi_status op_compute
@@ -141,6 +145,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -151,6 +157,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -163,6 +172,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -187,6 +198,7 @@ static vsi_bool op_setup_default
vsi_size_t time_step = 0;
vsi_size_t i = 0;
vsi_bool ret = FALSE;
+ vsi_status status = VSI_FAILURE;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_node_wksp( self );
@@ -211,6 +223,7 @@ static vsi_bool op_setup_default
/* transpose to time_major */
output_tensor = vsi_nn_rnn_transpose_time_major(self,
inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
input_tensor = output_tensor->t;
}
@@ -222,9 +235,12 @@ static vsi_bool op_setup_default
CHECK_PTR_FAIL_GOTO( grucell_reshape_output_tensors, "Create buffer fail.", final );
memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
- vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors,
+ (uint32_t)time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
- vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
last_step_h_state = inputs[GRU_INPUT_H_STATE];
for( i = 0; i < time_step; i++ )
@@ -236,6 +252,7 @@ static vsi_bool op_setup_default
/* reshape for split output */
output_tensor = vsi_nn_rnn_reshape_split_output(self,
split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
reshape_output = output_tensor->t;
/* grucell output */
@@ -248,6 +265,7 @@ static vsi_bool op_setup_default
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
grucell_out0 = output_tensor->t;
}
@@ -257,6 +275,7 @@ static vsi_bool op_setup_default
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
grucell_out1 = output_tensor->t;
}
else
@@ -265,13 +284,14 @@ static vsi_bool op_setup_default
}
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_OVXLIB, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.grucell_ovxlib.num_units = curr_param->num_units;
curr->node->nn_param.grucell_ovxlib.activation = curr_param->activation;
curr->node->nn_param.grucell_ovxlib.recurrent_activation = curr_param->recurrent_activation;
curr->node->nn_param.grucell_ovxlib.linear_before_reset = curr_param->linear_before_reset;
if ( reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 )
{
- int32_t k = 0;
+ size_t k = 0;
for (k = 0; k < _cnt_of_array( curr_param->internal_dtype ); k++)
{
if (curr_param->internal_dtype[k].vx_type == VSI_NN_TYPE_NONE)
@@ -316,6 +336,7 @@ static vsi_bool op_setup_default
/* reshape output to 3-dims */
output_tensor = vsi_nn_rnn_reshape_cell_output(self,
grucell_out0, (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
grucell_reshape_output_tensors[i] = output_tensor->t;
}
}
@@ -328,12 +349,14 @@ static vsi_bool op_setup_default
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
tensor = output_tensor->t;
}
/* concat grucell output, the gru's output is 3-dims */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 2;
for( i = 0; i < time_step; i++ )
{
@@ -383,6 +406,8 @@ static vsi_bool op_setup_optimized
vsi_nn_internal_tensor_t* input_weight_for_nn = NULL;
vsi_size_t permute_in_perm[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_size_t reshape_size[VSI_NN_MAX_DIM_NUM] = { 0 };
+ vsi_bool ret = FALSE;
+ vsi_status status = VSI_FAILURE;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_node_wksp( self );
@@ -409,57 +434,69 @@ static vsi_bool op_setup_optimized
/* transpose to time_major */
output_tensor = vsi_nn_rnn_transpose_time_major(self,
inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
input_tensor = output_tensor->t;
}
/* input FC */
p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRU_INPUT_WEIGHT_I2R],
inputs[GRU_INPUT_WEIGHT_I2Z], inputs[GRU_INPUT_WEIGHT_I2C]);
+ CHECK_PTR_FAIL_GOTO(p->local->weights_input, "Create tensor failed", final);
p->local->weights_input->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST);
p->local->weights_recurrent = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRU_INPUT_WEIGHT_H2R],
inputs[GRU_INPUT_WEIGHT_H2Z], inputs[GRU_INPUT_WEIGHT_H2C]);
+ CHECK_PTR_FAIL_GOTO(p->local->weights_recurrent, "Create tensor failed", final);
p->local->weights_recurrent->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->weights_recurrent, VSI_NN_TENSOR_ATTR_CONST);
p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr,
inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final);
p->local->bias_r->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->bias_r, VSI_NN_TENSOR_ATTR_CONST);
p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr,
inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final);
p->local->bias_z->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->bias_z, VSI_NN_TENSOR_ATTR_CONST);
p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr,
inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_c, "Create tensor failed", final);
p->local->bias_c->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->bias_c, VSI_NN_TENSOR_ATTR_CONST);
/* prepare weight and bias for recurrent fc */
recurrent_weight_for_nn = vsi_nn_rnn_prepare_weight_for_nn_fc(self, p->local->weights_recurrent, 1, 1);
+ CHECK_PTR_FAIL_GOTO(recurrent_weight_for_nn, "Create internal tensor failed", final);
/* transpose input from [T,B,D] to [D,T,B] */
permute_in_perm[0] = 1;
permute_in_perm[1] = 2;
permute_in_perm[2] = 0;
tmp_tensor = vsi_nn_rnn_create_permute(self, input_tensor, NULL, permute_in_perm, 3, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final );
reshape_size[0] = tmp_tensor->t->attr.size[0];
reshape_size[1] = tmp_tensor->t->attr.size[1];
reshape_size[2] = tmp_tensor->t->attr.size[2];
reshape_size[3] = 1; /* new batch dim */
tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 4, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
input_weight_for_nn = vsi_nn_rnn_prepare_weight_for_nn_fc(self, p->local->weights_input, 1, 1);
+ CHECK_PTR_FAIL_GOTO(input_weight_for_nn, "Create internal tensor failed", final);
vsi_nn_internal_init_tensor_attr(&attr, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT],
use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.conv2d.ksize[0] = 1;
curr->node->nn_param.conv2d.ksize[1] = 1;
curr->node->nn_param.conv2d.stride[0] = 1;
@@ -483,11 +520,13 @@ static vsi_bool op_setup_optimized
reshape_size[1] = output_tensor->t->attr.size[1];
reshape_size[2] = output_tensor->t->attr.size[2];
output_tensor = vsi_nn_rnn_create_reshape(self, output_tensor->t, NULL, reshape_size, 3, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
permute_in_perm[0] = 0;
permute_in_perm[1] = 2;
permute_in_perm[2] = 1;
tmp_tensor = vsi_nn_rnn_create_permute(self, output_tensor->t, NULL, permute_in_perm, 3, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
/* split input tensor */
split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
@@ -497,19 +536,24 @@ static vsi_bool op_setup_optimized
CHECK_PTR_FAIL_GOTO( grucell_reshape_output_tensors, "Create buffer fail.", final );
memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
- vsi_nn_rnn_split_input_tensor(self, tmp_tensor->t, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ status = vsi_nn_rnn_split_input_tensor(self, tmp_tensor->t, split_output_tensors,
+ (uint32_t)time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
- vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
memcpy(&attr, &p->local->bias_r->attr, sizeof(vsi_nn_tensor_attr_t));
attr.size[1] = 1;
attr.dim_num = 2;
p->local->cond_zeros = vsi_nn_CreateTensorWithDefault(self->graph, &attr, 0.0);
+ CHECK_PTR_FAIL_GOTO(p->local->cond_zeros, "Create tensor failed", final);
last_step_h_state = inputs[GRU_INPUT_H_STATE];
permute_in_perm[0] = 1;
permute_in_perm[1] = 0;
tmp_tensor = vsi_nn_rnn_create_permute(self, last_step_h_state, NULL, permute_in_perm, 2, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
last_step_h_state = tmp_tensor->t;
for( i = 0; i < time_step; i++ )
@@ -525,6 +569,7 @@ static vsi_bool op_setup_optimized
/* reshape for split output */
output_tensor = vsi_nn_rnn_reshape_split_output(self,
split_output_tensors[i], (uint32_t)(unit_nums * 3), use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
input_fc_output = output_tensor->t;
/* last_step_h_state is not batch first, no need to permute */
@@ -533,13 +578,16 @@ static vsi_bool op_setup_optimized
reshape_size[1] = 1/*kernel_h*/;
reshape_size[0] = last_step_h_state->attr.size[0];
tmp = vsi_nn_rnn_create_reshape(self, last_step_h_state, NULL, reshape_size, 4, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final);
vsi_nn_internal_init_tensor_attr(&attr,
&p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN],
use_virtual_tensor);
tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.conv2d.ksize[0] = 1;
curr->node->nn_param.conv2d.ksize[1] = 1;
curr->node->nn_param.conv2d.stride[0] = 1;
@@ -562,37 +610,35 @@ static vsi_bool op_setup_optimized
reshape_size[1] = recurrent_weight_for_nn->t->attr.size[3];
reshape_size[0] = batch_size;
tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 2, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
recurrent_fc_output = tmp_tensor->t;
/* grucell output */
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
grucell_out0 = output_tensor->t;
/* grucell output h_state */
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
grucell_out1 = output_tensor->t;
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[GRUCELL_ACTIVATION_INPUT_H_STATE] = last_step_h_state;
- if(0)
- {
- curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = input_fc_output;
- curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = NULL;
- curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = NULL;
- curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R] = recurrent_fc_output;
- curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_Z] = NULL;
- curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_C] = NULL;
- }
- else
{
splited_input_fc_output_tensors = vsi_nn_create_split(self,
input_fc_output, 1, 3, NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_input_fc_output_tensors, curr,
+ "Create internal tensor failed", final);
splited_recurrent_fc_output_tensors = vsi_nn_create_split(self,
recurrent_fc_output, 1, 3, NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_recurrent_fc_output_tensors, curr,
+ "Create internal tensor failed", final);
curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t;
curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t;
curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t;
@@ -623,8 +669,10 @@ static vsi_bool op_setup_optimized
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 1;
for( i = 0; i < time_step; i++ )
{
@@ -634,9 +682,10 @@ static vsi_bool op_setup_optimized
vsi_nn_internal_setup_node(self, curr);
reshape_size[0] = batch_size;
- reshape_size[1] = -1;
+ reshape_size[1] = (vsi_size_t)-1;
reshape_size[2] = time_step;
tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 3, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
if(p->time_major)
{
@@ -657,11 +706,12 @@ static vsi_bool op_setup_optimized
vsi_nn_rnn_create_permute(self, last_step_h_state, outputs[GRU_OUTPUT_H_STATE],
permute_in_perm, 2, use_virtual_tensor);
+ ret = TRUE;
final:
vsi_nn_safe_free( split_output_tensors );
vsi_nn_safe_free( grucell_reshape_output_tensors );
- return TRUE;
+ return ret;
} /* op_setup_optimized() */
static vsi_bool op_setup
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c
index 18ae5545a..2fc49d033 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c
@@ -39,6 +39,7 @@
#include "utils/vsi_nn_tensor_op.h"
#include "utils/vsi_nn_util.h"
#include "ops/vsi_nn_op_grucell.h"
+#include "vsi_nn_error.h"
typedef struct _vsi_nn_grucell_local
{
@@ -64,6 +65,7 @@ static vsi_nn_internal_tensor_t * _create_fc
{
/* create zero bias for NN/TP */
tmp_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE);
+ CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create tensor fail.", final );
bias_tensor = tmp_tensor->t;
}
else
@@ -85,8 +87,10 @@ static vsi_nn_internal_tensor_t * _create_fc
attr.vtl = TRUE;
attr.is_const = FALSE;
fc_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(fc_out, "Create internal tensor failed", final);
fc_node = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(fc_node, "Create internal node failed", final);
fc_node->node->nn_param.fcl.axis = 0;
fc_node->node->nn_param.fcl.weights = (uint32_t)weight->attr.size[1];
fc_node->inputs[0] = input;
@@ -95,6 +99,7 @@ static vsi_nn_internal_tensor_t * _create_fc
fc_node->outputs[0] = fc_out->t;
vsi_nn_internal_setup_node(self, fc_node);
+final:
return fc_out;
} /* () */
@@ -136,6 +141,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -146,6 +153,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
}
@@ -167,6 +177,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
}
@@ -200,6 +212,7 @@ static vsi_bool op_setup_default
inputs[GRUCELL_IN_KERNEL_I2Z + i],
inputs[GRUCELL_IN_BIAS_I2Z + i]
);
+ CHECK_PTR_FAIL_GOTO(input_fc_outputs[i], "Create internal tensor failed", final);
}
/* create hstate fc */
@@ -211,6 +224,7 @@ static vsi_bool op_setup_default
inputs[GRUCELL_IN_KERNEL_R2Z + i],
inputs[GRUCELL_IN_BIAS_R2Z + i]
);
+ CHECK_PTR_FAIL_GOTO(hstate_fc_outputs[i], "Create internal tensor failed", final);
}
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
@@ -228,8 +242,10 @@ static vsi_bool op_setup_default
attr.vtl = TRUE;
attr.is_const = FALSE;
h_times_r = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(h_times_r, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_H_TIMES_ACTIVATION_R, 3, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.grucell_h_times_activation_r.recurrent_activation = p->recurrent_activation;
curr->inputs[0] = inputs[GRUCELL_IN_H_STATE];
curr->inputs[1] = input_fc_outputs[GRUCELL_GATES_R]->t;
@@ -243,8 +259,10 @@ static vsi_bool op_setup_default
inputs[GRUCELL_IN_KERNEL_R2H],
inputs[GRUCELL_IN_BIAS_R2H]
);
+ CHECK_PTR_FAIL_GOTO(hstate_fc_outputs[GRUCELL_GATES_H], "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_Z_H, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.grucell_activation_z_h.activation = p->activation;
curr->node->nn_param.grucell_activation_z_h.recurrent_activation = p->recurrent_activation;
curr->inputs[GRUCELL_ACT_Z_H_HSTATE] = inputs[GRUCELL_IN_H_STATE];
@@ -257,6 +275,8 @@ static vsi_bool op_setup_default
vsi_nn_internal_setup_node(self, curr);
return TRUE;
+final:
+ return FALSE;
}
#endif
@@ -287,6 +307,7 @@ static vsi_bool op_setup_reset_after
inputs[GRUCELL_IN_KERNEL_I2Z + i],
inputs[GRUCELL_IN_BIAS_I2Z + i]
);
+ CHECK_PTR_FAIL_GOTO(input_fc_outputs[i], "Create internal tensor failed", final);
}
/* create hstate fc */
@@ -298,9 +319,11 @@ static vsi_bool op_setup_reset_after
inputs[GRUCELL_IN_KERNEL_R2Z + i],
inputs[GRUCELL_IN_BIAS_R2Z + i]
);
+ CHECK_PTR_FAIL_GOTO(hstate_fc_outputs[i], "Create internal tensor failed", final);
}
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.grucell_activation.activation = p->activation;
curr->node->nn_param.grucell_activation.recurrent_activation = p->recurrent_activation;
curr->inputs[GRUCELL_ACT_H_STATE] = inputs[GRUCELL_IN_H_STATE];
@@ -315,6 +338,8 @@ static vsi_bool op_setup_reset_after
vsi_nn_internal_setup_node(self, curr);
return TRUE;
+final:
+ return FALSE;
}
static vsi_bool op_setup
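
Editorial note on the pattern above: throughout vsi_nn_op_grucell.c, every internal tensor or node allocation now funnels through CHECK_PTR_FAIL_GOTO (declared in the newly included vsi_nn_error.h) into a shared final: label, so a failed allocation returns FALSE instead of being dereferenced. The sketch below shows the shape of that pattern; the macro body is an approximation rather than the exact ovxlib definition, and op_setup_sketch is a made-up example, not code from this patch.

    /* Approximate shape of the guard macro -- the real one lives in vsi_nn_error.h. */
    #define CHECK_PTR_FAIL_GOTO( ptr, msg, label ) \
        do { \
            if ( NULL == (ptr) ) { \
                VSILOGE( "%s", (msg) );   /* log, then bail to the cleanup label */ \
                goto label; \
            } \
        } while (0)

    static vsi_bool op_setup_sketch( vsi_nn_node_t * self )
    {
        vsi_nn_internal_node_t * curr = NULL;

        curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 );
        CHECK_PTR_FAIL_GOTO( curr, "Create internal node failed", final );
        /* ... wire up curr->inputs / curr->outputs ... */
        vsi_nn_internal_setup_node( self, curr );
        return TRUE;
    final:
        return FALSE;   /* some allocation above failed */
    }
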
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c
index 4fcd61200..1478eac41 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c
@@ -75,6 +75,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -86,6 +89,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+
if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num)
{
outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num = \
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c
index 42fc9fbc3..a77d05dd6 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c
@@ -73,6 +73,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c
index ba9b540cf..cf35692d0 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c
@@ -70,6 +70,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -81,6 +84,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+
if(VSI_NN_DIM_AUTO == outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num)
{
outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num = \
@@ -108,6 +113,8 @@ static vsi_status op_init
{
vsi_status status = VSI_SUCCESS;
+ VSI_UNREFERENCED(self);
+
return status;
} /* op_init() */
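
The VSI_UNREFERENCED additions in the op_check/op_compute/op_init stubs exist only to silence unused-parameter warnings while keeping the common operation callback signature. A minimal sketch of what such a macro typically expands to (an assumption; the real definition sits in the ovxlib headers):

    /* Assumed definition -- the canonical "touch the parameter" no-op. */
    #define VSI_UNREFERENCED( param )   ( (void)(param) )

    static vsi_bool op_check_sketch
        (
        vsi_nn_node_t   * self,
        vsi_nn_tensor_t ** inputs,
        vsi_nn_tensor_t ** outputs
        )
    {
        VSI_UNREFERENCED( self );
        VSI_UNREFERENCED( inputs );
        VSI_UNREFERENCED( outputs );
        return TRUE;   /* nothing to validate for this stub */
    }
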
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c
index 46eff0d9d..7980d4281 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c
@@ -76,6 +76,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -87,6 +90,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+
if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dim_num)
{
outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dim_num = \
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c
index e1e448077..58dc548e6 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c
@@ -81,6 +81,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -92,6 +95,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num)
{
outputs[0]->attr.dim_num = \
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c
index 020ab32e6..432ce2032 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c
@@ -35,12 +35,12 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
#include "ops/vsi_nn_op_grucell_ovxlib.h"
#include "vsi_nn_internal_node.h"
#include "vsi_nn_rnn_helper.h"
#include "utils/vsi_nn_tensor_op.h"
#include "utils/vsi_nn_util.h"
+#include "vsi_nn_error.h"
#define USE_GRUCELL_ACTIVATION
@@ -78,8 +78,10 @@ static vsi_nn_internal_tensor_t* create_multiply
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor);
tensor1 = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final);
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final);
tmp_inode->inputs[0] = input1;
tmp_inode->inputs[1] = input2;
@@ -89,6 +91,7 @@ static vsi_nn_internal_tensor_t* create_multiply
tmp_inode->outputs[0] = tensor1->t;
vsi_nn_internal_setup_node(self, tmp_inode);
+final:
return tensor1;
}
@@ -125,6 +128,7 @@ static vsi_bool setup_op_shapes
attr.is_const = FALSE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
inputs[GRUCELL_INPUT_H_STATE] = output_tensor->t;
}
@@ -133,6 +137,7 @@ static vsi_bool setup_op_shapes
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[GRUCELL_OUTPUT_OUTPUT]->attr.dtype, TRUE);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
outputs[GRUCELL_OUTPUT_H_STATE] = output_tensor->t;
}
@@ -156,6 +161,8 @@ static vsi_bool setup_op_shapes
}
return TRUE;
+final:
+ return FALSE;
}
static vsi_status op_compute
@@ -165,6 +172,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -175,6 +184,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -187,6 +199,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -205,24 +219,31 @@ static vsi_bool op_setup_float
vsi_nn_internal_tensor_t* tensor_rt = NULL;
vsi_nn_internal_tensor_t* input_hstate = NULL;
vsi_nn_internal_tensor_t** splited_tensors = NULL;
+ vsi_bool ret = FALSE;
p->local->weights_update = vsi_nn_ConcatTensor(self->graph, 0,
inputs[GRUCELL_INPUT_WEIGHT_I2Z], inputs[GRUCELL_INPUT_WEIGHT_H2Z]);
+ CHECK_PTR_FAIL_GOTO(p->local->weights_update, "Create tensor failed", final);
p->local->weights_reset = vsi_nn_ConcatTensor(self->graph, 0,
inputs[GRUCELL_INPUT_WEIGHT_I2R], inputs[GRUCELL_INPUT_WEIGHT_H2R]);
+ CHECK_PTR_FAIL_GOTO(p->local->weights_reset, "Create tensor failed", final);
p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr,
inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final);
p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr,
inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final);
p->local->bias_z_r = vsi_nn_ConcatTensor(self->graph, 0, p->local->bias_z, p->local->bias_r);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_z_r, "Create tensor failed", final);
p->local->weights_z_r = vsi_nn_ConcatTensor(self->graph, 1, p->local->weights_update, p->local->weights_reset);
+ CHECK_PTR_FAIL_GOTO(p->local->weights_z_r, "Create tensor failed", final);
p->local->weights_c = vsi_nn_ConcatTensor(self->graph, 0,
inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_WEIGHT_H2C]);
+ CHECK_PTR_FAIL_GOTO(p->local->weights_c, "Create tensor failed", final);
p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr,
inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_c, "Create tensor failed", final);
- vsi_safe_release_tensor(p->local->bias_z);
- vsi_safe_release_tensor(p->local->bias_r);
p->local->bias_z_r->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->bias_z_r, VSI_NN_TENSOR_ATTR_CONST);
p->local->weights_z_r->attr.is_const = TRUE;
@@ -234,6 +255,7 @@ static vsi_bool op_setup_float
input_hstate = vsi_nn_rnn_create_concat(self, 0,
use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]);
+ CHECK_PTR_FAIL_GOTO(input_hstate, "Create internal tensor failed", final);
dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
if ( input_hstate->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ||
@@ -247,8 +269,10 @@ static vsi_bool op_setup_float
}
tmp_tensor = vsi_nn_rnn_create_tp_fc(self, input_hstate->t,
p->local->weights_z_r, p->local->bias_z_r, &dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
splited_tensors = vsi_nn_create_split(self, tmp_tensor->t, 0, 2, NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(splited_tensors, "Create internal tensor failed", final);
/* reset Gate activations */
tensor_rt = vsi_nn_rnn_create_activation(self,
@@ -256,6 +280,7 @@ static vsi_bool op_setup_float
p->local->gate_activation,
&splited_tensors[1]->t->attr.dtype,
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tensor_rt, "Create internal tensor failed", final);
/* if linear_before_reset=0: ht=g(input*w_ic + (r.hstate)*w_hc + b_ic + b_hc)*/
if ( p->linear_before_reset == 0 )
@@ -263,10 +288,12 @@ static vsi_bool op_setup_float
/* r{t} * h{t-1}*/
tensor_rt = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY,
tensor_rt->t, inputs[GRUCELL_INPUT_H_STATE], &tensor_rt->t->attr.dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tensor_rt, "Create internal tensor failed", final);
/* [x{t}, r{t}] */
tmp_tensor = vsi_nn_rnn_create_concat(self, 0, use_virtual_tensor,
inputs[GRUCELL_INPUT_INPUT], tensor_rt->t);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
if ( tmp_tensor->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ||
@@ -281,6 +308,7 @@ static vsi_bool op_setup_float
/* W{c} x [x{t}, r{t}] */
tmp_tensor = vsi_nn_rnn_create_tp_fc(self, tmp_tensor->t, p->local->weights_c, p->local->bias_c,
&dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
}
/* if linear_before_reset!=0: ht=g(input*w_ic + (r.(hstate*w_hc + b_hc)) + b_ic)*/
else
@@ -298,19 +326,24 @@ static vsi_bool op_setup_float
/* r.(hstate*w_hc + b_hc) */
tmp_tensor = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], inputs[GRUCELL_INPUT_WEIGHT_H2C],
inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
tensor_rt = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY,
tensor_rt->t, tmp_tensor->t, &tensor_rt->t->attr.dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tensor_rt, "Create internal tensor failed", final);
/* input*w_ic + b_ic */
tmp_tensor = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_WEIGHT_I2C],
inputs[GRUCELL_INPUT_BIAS_I2C], &dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_ADD,
tensor_rt->t, tmp_tensor->t, &tensor_rt->t->attr.dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
}
#define USE_GRUCELL_ACTIVATION
#ifdef USE_GRUCELL_ACTIVATION
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = splited_tensors[0]->t;
curr->inputs[1] = tmp_tensor->t;
curr->inputs[2] = inputs[GRUCELL_INPUT_H_STATE];
@@ -342,6 +375,7 @@ static vsi_bool op_setup_float
tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY,
tensor_zt->t, tmp_tensor->t, &tensor_ht_->t->attr.dtype, use_virtual_tensor);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = tmp_tensor->t;
curr->inputs[1] = tensor_ht_->t;
curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT];
@@ -349,12 +383,18 @@ static vsi_bool op_setup_float
}
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT];
curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE];
vsi_nn_internal_setup_node(self, curr);
#endif
- return TRUE;
+
+ ret = TRUE;
+final:
+ vsi_safe_release_tensor(p->local->bias_z);
+ vsi_safe_release_tensor(p->local->bias_r);
+ return ret;
}
static vsi_bool op_setup_float_cudnn
@@ -379,24 +419,29 @@ static vsi_bool op_setup_float_cudnn
p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRUCELL_INPUT_WEIGHT_I2R],
inputs[GRUCELL_INPUT_WEIGHT_I2Z], inputs[GRUCELL_INPUT_WEIGHT_I2C]);
+ CHECK_PTR_FAIL_GOTO(p->local->weights_input, "Create tensor failed", final);
p->local->weights_input->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST);
p->local->weights_recurrent = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRUCELL_INPUT_WEIGHT_H2R],
inputs[GRUCELL_INPUT_WEIGHT_H2Z], inputs[GRUCELL_INPUT_WEIGHT_H2C]);
+ CHECK_PTR_FAIL_GOTO(p->local->weights_recurrent, "Create tensor failed", final);
p->local->weights_recurrent->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->weights_recurrent, VSI_NN_TENSOR_ATTR_CONST);
p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr,
inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final);
p->local->bias_r->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->bias_r, VSI_NN_TENSOR_ATTR_CONST);
p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr,
inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final);
p->local->bias_z->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->bias_z, VSI_NN_TENSOR_ATTR_CONST);
p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr,
inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_c, "Create tensor failed", final);
p->local->bias_c->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->bias_c, VSI_NN_TENSOR_ATTR_CONST);
@@ -412,16 +457,19 @@ static vsi_bool op_setup_float_cudnn
/* reshape and transpose input */
input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT],
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
-
+ CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final);
tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, p->local->weights_input,
NULL, kernel_h, kernel_w,
&p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final);
+
/* transpose and reshape output */
reshaped_size[0] = inputs[GRUCELL_INPUT_INPUT]->attr.size[1];
reshaped_size[1] = p->local->weights_input->attr.size[1];
input_fc_output = vsi_nn_rnn_create_reshape(self, tmp->t, NULL,
reshaped_size, 2, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_fc_output, "Create internal tensor failed", final);
grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_INPUT_NC_FC_CN;
}
@@ -430,6 +478,7 @@ static vsi_bool op_setup_float_cudnn
input_fc_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_INPUT],
p->local->weights_input, NULL,
&p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT], use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_fc_output, "Create internal tensor failed", final);
grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC;
}
@@ -444,25 +493,31 @@ static vsi_bool op_setup_float_cudnn
/* reshape and transpose input */
input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_H_STATE],
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final);
tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, p->local->weights_recurrent,
NULL, kernel_h, kernel_w,
&p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN], use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final);
+
/* transpose and reshape output */
reshaped_size[0] = inputs[GRUCELL_INPUT_H_STATE]->attr.size[1];
reshaped_size[1] = p->local->weights_recurrent->attr.size[1];
recurrent_fc_output = vsi_nn_rnn_create_reshape(self, tmp->t, NULL,
reshaped_size, 2, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(recurrent_fc_output, "Create internal tensor failed", final);
}
else
{
recurrent_fc_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE],
p->local->weights_recurrent, NULL,
&p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN], use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(recurrent_fc_output, "Create internal tensor failed", final);
}
#ifdef USE_GRUCELL_ACTIVATION
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[GRUCELL_ACTIVATION_INPUT_H_STATE] = inputs[GRUCELL_INPUT_H_STATE];
if(p->local->multi_batch)
@@ -480,8 +535,12 @@ static vsi_bool op_setup_float_cudnn
{
splited_input_fc_output_tensors = vsi_nn_create_split(self,
input_fc_output->t, 1, 3, NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_input_fc_output_tensors, curr,
+ "Create internal tensor failed", final);
splited_recurrent_fc_output_tensors = vsi_nn_create_split(self,
recurrent_fc_output->t, 1, 3, NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_recurrent_fc_output_tensors, curr,
+ "Create internal tensor failed", final);
curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t;
curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t;
curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t;
@@ -494,8 +553,12 @@ static vsi_bool op_setup_float_cudnn
{
splited_input_fc_output_tensors = vsi_nn_create_split(self,
input_fc_output->t, 0, 3, NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_input_fc_output_tensors, curr,
+ "Create internal tensor failed", final);
splited_recurrent_fc_output_tensors = vsi_nn_create_split(self,
recurrent_fc_output->t, 0, 3, NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_recurrent_fc_output_tensors, curr,
+ "Create internal tensor failed", final);
curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t;
curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t;
curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t;
@@ -593,12 +656,14 @@ static vsi_bool op_setup_float_cudnn
tensor_u->t, tmp_tensor->t, &tmp_tensor->t->attr.dtype, use_virtual_tensor);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = tmp_tensor->t;
curr->inputs[1] = tensor_c->t;
curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT];
vsi_nn_internal_setup_node(self, curr);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT];
curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE];
vsi_nn_internal_setup_node(self, curr);
@@ -606,6 +671,8 @@ static vsi_bool op_setup_float_cudnn
#endif
return TRUE;
+final:
+ return FALSE;
}
/*
@@ -629,34 +696,38 @@ static vsi_bool op_setup_float_cudnn_v2
vsi_nn_internal_tensor_t* tensor_r = NULL;
vsi_nn_internal_tensor_t* concated_input = NULL;
vsi_nn_tensor_attr_t attr;
+ vsi_bool ret = FALSE;
/* input to r,z */
p->local->weights_update = vsi_nn_ConcatTensor(self->graph, 1/* axis */,
inputs[GRUCELL_INPUT_WEIGHT_I2R], inputs[GRUCELL_INPUT_WEIGHT_I2Z]);
+ CHECK_PTR_FAIL_GOTO(p->local->weights_update, "Create tensor failed", final);
/* recurrent to r,z */
p->local->weights_reset = vsi_nn_ConcatTensor(self->graph, 1/* axis */,
inputs[GRUCELL_INPUT_WEIGHT_H2R], inputs[GRUCELL_INPUT_WEIGHT_H2Z]);
+ CHECK_PTR_FAIL_GOTO(p->local->weights_reset, "Create tensor failed", final);
/* [input, recurrent] to r,z */
p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 0/* axis */,
p->local->weights_update, p->local->weights_reset);
+ CHECK_PTR_FAIL_GOTO(p->local->weights_input, "Create tensor failed", final);
p->local->weights_input->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST);
- vsi_safe_release_tensor(p->local->weights_update);
- vsi_safe_release_tensor(p->local->weights_reset);
p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr,
inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final);
p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr,
inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final);
p->local->bias_z_r = vsi_nn_ConcatTensor(self->graph, 0/* axis */,
p->local->bias_r, p->local->bias_z);
+ CHECK_PTR_FAIL_GOTO(p->local->bias_z_r, "Create tensor failed", final);
p->local->bias_z_r->attr.is_const = TRUE;
vsi_nn_SetTensorAttr(p->local->bias_z_r, VSI_NN_TENSOR_ATTR_CONST);
- vsi_safe_release_tensor(p->local->bias_z);
- vsi_safe_release_tensor(p->local->bias_r);
concated_input = vsi_nn_rnn_create_concat(self, 0/* axis */,
use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]);
+ CHECK_PTR_FAIL_GOTO(concated_input, "Create internal tensor failed", final);
dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
if ( concated_input->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ||
@@ -670,6 +741,16 @@ static vsi_bool op_setup_float_cudnn_v2
}
tmp_tensor = vsi_nn_rnn_create_tp_fc(self, concated_input->t, p->local->weights_input,
p->local->bias_z_r, &dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
+
+ {
+ uint32_t _slices[] = { 0, 0 };
+ _slices[0] = (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0];
+ _slices[1] = (uint32_t)inputs[GRUCELL_INPUT_H_STATE]->attr.size[0];
+ splited_input_fc_output_tensors = vsi_nn_create_split(self, concated_input->t,
+ 0, 2, _slices, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( splited_input_fc_output_tensors, "Create internal tensor fail.", final );
+ }
dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
if ( splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ||
@@ -681,14 +762,10 @@ static vsi_bool op_setup_float_cudnn_v2
{
dtype.vx_type = VSI_NN_TYPE_FLOAT16;
}
- {
- uint32_t _slices[] = { (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0],
- (uint32_t)inputs[GRUCELL_INPUT_H_STATE]->attr.size[0] };
- splited_input_fc_output_tensors = vsi_nn_create_split(self, concated_input->t,
- 0, 2, _slices, use_virtual_tensor);
- }
+
input2cand_output = vsi_nn_rnn_create_tp_fc(self, splited_input_fc_output_tensors[0]->t,
inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_BIAS_I2C], &dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input2cand_output, "Create internal tensor failed", final);
dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
if ( inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ||
@@ -702,14 +779,17 @@ static vsi_bool op_setup_float_cudnn_v2
}
recurrent2cand_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE],
inputs[GRUCELL_INPUT_WEIGHT_H2C], inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(recurrent2cand_output, "Create internal tensor failed", final);
tmp_tensor = vsi_nn_rnn_create_activation(self, tmp_tensor->t, p->local->gate_activation,
&tmp_tensor->t->attr.dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
/* split for combined FC outputs, r_t, z_t */
splited_input_fc_output_tensors = vsi_nn_create_split(self, tmp_tensor->t,
0/* axis */,
2/* dim num */, NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(splited_input_fc_output_tensors, "Create internal tensor failed", final);
memset( &attr, 0x00, sizeof(attr) );
attr.dim_num = VSI_NN_DIM_AUTO;
@@ -726,8 +806,10 @@ static vsi_bool op_setup_float_cudnn_v2
dtype.vx_type = VSI_NN_TYPE_FLOAT16;
}
tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_A_TIMES_B_PLUS_C, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = splited_input_fc_output_tensors[0]->t;
curr->inputs[1] = recurrent2cand_output->t;
curr->inputs[2] = input2cand_output->t;
@@ -736,10 +818,12 @@ static vsi_bool op_setup_float_cudnn_v2
tensor_r = vsi_nn_rnn_create_activation(self, tmp_tensor->t,
p->local->candidate_activation, &tmp_tensor->t->attr.dtype, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(tensor_r, curr, "Create internal tensor failed", final);
#define USE_GRUCELL_ACTIVATION_SMA
#ifdef USE_GRUCELL_ACTIVATION_SMA
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL_SMA, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_STATE] = inputs[GRUCELL_INPUT_H_STATE];
curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_T_] = tensor_r->t;
curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_Z_T] = splited_input_fc_output_tensors[1]->t;
@@ -758,18 +842,25 @@ static vsi_bool op_setup_float_cudnn_v2
tmp_tensor->t, &tmp_tensor->t->attr.dtype, use_virtual_tensor);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = tmp_tensor->t;
curr->inputs[1] = tensor_r->t;
curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT];
vsi_nn_internal_setup_node(self, curr);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT];
curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE];
vsi_nn_internal_setup_node(self, curr);
#endif
-
- return TRUE;
+ ret = TRUE;
+final:
+ vsi_safe_release_tensor(p->local->bias_z);
+ vsi_safe_release_tensor(p->local->bias_r);
+ vsi_safe_release_tensor(p->local->weights_update);
+ vsi_safe_release_tensor(p->local->weights_reset);
+ return ret;
}
static vsi_bool op_setup_default
@@ -804,6 +895,8 @@ static vsi_bool op_setup_default
uint32_t kernel_h = 1;
uint32_t kernel_w = 1;
int32_t i = 0;
+ vsi_nn_tensor_t* wei_r2c_tensor = NULL;
+ vsi_nn_tensor_t* bias_r2c_tensor = NULL;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
memset( &attr, 0x00, sizeof( attr ) );
@@ -853,6 +946,7 @@ static vsi_bool op_setup_default
inputs[GRUCELL_INPUT_BIAS_I2R + i],
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs[i], "Create internal tensor failed", final);
}
}
else
@@ -862,6 +956,7 @@ static vsi_bool op_setup_default
(uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w);
input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT],
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final);
for( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++)
{
@@ -872,9 +967,11 @@ static vsi_bool op_setup_default
kernel_h, kernel_w,
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final);
/* transpose and reshape output */
input_gate_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self,
tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs[i], "Create internal tensor failed", final);
}
}
@@ -889,10 +986,7 @@ static vsi_bool op_setup_default
inputs[GRUCELL_INPUT_BIAS_H2R + i],
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R + i],
use_virtual_tensor);
- if (hstate_gate_fc_outputs[i] == NULL)
- {
- goto error;
- }
+ CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs[i], "Create internal tensor failed", final);
}
}
else
@@ -902,6 +996,7 @@ static vsi_bool op_setup_default
(uint32_t)inputs[GRUCELL_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w);
hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self,
inputs[GRUCELL_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(hstate_input_tensor, "Create internal tensor failed", final);
for( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++)
{
@@ -912,9 +1007,11 @@ static vsi_bool op_setup_default
kernel_h, kernel_w,
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R + i],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final);
/* transpose and reshape output */
hstate_gate_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self,
tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs[i], "Create internal tensor failed", final);
}
}
@@ -926,6 +1023,7 @@ static vsi_bool op_setup_default
hstate_gate_fc_outputs[i]->t,
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(gate_fc_outputs[i], "Create internal tensor failed", final);
}
/* Gate activations */
@@ -936,6 +1034,7 @@ static vsi_bool op_setup_default
p->local->gate_activation,
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(gate_act_outputs[i], "Create internal tensor failed", final);
}
/* Candidate FC */
@@ -948,6 +1047,7 @@ static vsi_bool op_setup_default
inputs[GRUCELL_INPUT_H_STATE],
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(rh_mul_outputs, "Create internal tensor failed", final);
}
else
{
@@ -957,6 +1057,7 @@ static vsi_bool op_setup_default
inputs[GRUCELL_INPUT_H_STATE]->attr.size,
inputs[GRUCELL_INPUT_H_STATE]->attr.dim_num,
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(rh_mul_outputs, "Create internal tensor failed", final);
}
if( inputs[GRUCELL_INPUT_INPUT]->attr.dtype.qnt_type
@@ -999,6 +1100,7 @@ static vsi_bool op_setup_default
inputs[GRUCELL_INPUT_BIAS_I2C],
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_cand_fc_output, "Create internal tensor failed", final);
}
else
{
@@ -1008,6 +1110,8 @@ static vsi_bool op_setup_default
(uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w);
input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT],
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final);
+
tmp = vsi_nn_rnn_create_nn_fc(self,
input_tensor->t,
inputs[GRUCELL_INPUT_WEIGHT_I2C],
@@ -1015,9 +1119,11 @@ static vsi_bool op_setup_default
kernel_h, kernel_w,
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final);
/* transpose and reshape output */
input_cand_fc_output = vsi_nn_rnn_process_output_for_nn_fc(self,
tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_cand_fc_output, "Create internal tensor failed", final);
}
if ( is_hstate_cand_fc_op_tp )
{
@@ -1025,9 +1131,6 @@ static vsi_bool op_setup_default
if ((rh_mul_outputs->t->attr.dtype.vx_type) != (inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr.dtype.vx_type)
&& (p->local->multi_batch))
{
- vsi_nn_tensor_t* wei_r2c_tensor = NULL;
- vsi_nn_tensor_t* bias_r2c_tensor = NULL;
-
memcpy(&attr, &(inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr), sizeof(attr));
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
if ( rh_mul_outputs->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ||
@@ -1041,14 +1144,18 @@ static vsi_bool op_setup_default
}
wei_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_WEIGHT_H2C], &(attr.dtype));
+ CHECK_PTR_FAIL_GOTO(wei_r2c_tensor, "Create tensor failed", final);
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
bias_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_BIAS_H2C], &(attr.dtype));
+ CHECK_PTR_FAIL_GOTO(bias_r2c_tensor, "Create tensor failed", final);
+
rh_cand_fc_output = vsi_nn_rnn_create_tp_fc(self,
rh_mul_outputs->t,
wei_r2c_tensor,
bias_r2c_tensor,
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(rh_cand_fc_output, "Create internal tensor failed", final);
}
else
{
@@ -1058,6 +1165,7 @@ static vsi_bool op_setup_default
inputs[GRUCELL_INPUT_BIAS_H2C],
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(rh_cand_fc_output, "Create internal tensor failed", final);
}
}
else
@@ -1068,6 +1176,8 @@ static vsi_bool op_setup_default
(uint32_t)rh_mul_outputs->t->attr.size[0], &kernel_h, &kernel_w);
hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, rh_mul_outputs->t,
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(hstate_input_tensor, "Create internal tensor failed", final);
+
tmp = vsi_nn_rnn_create_nn_fc(self,
hstate_input_tensor->t,
inputs[GRUCELL_INPUT_WEIGHT_H2C],
@@ -1075,9 +1185,11 @@ static vsi_bool op_setup_default
kernel_h, kernel_w,
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final);
/* transpose and reshape output */
rh_cand_fc_output = vsi_nn_rnn_process_output_for_nn_fc(self,
tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(rh_cand_fc_output, "Create internal tensor failed", final);
}
if ( p->linear_before_reset == 0 )
@@ -1091,6 +1203,7 @@ static vsi_bool op_setup_default
rh_cand_fc_output->t,
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(r_mul_hcand_fc_output, "Create internal tensor failed", final);
}
/* Candidate input FC add r*h FC */
cand_fc_output = vsi_nn_rnn_create_tensor_add(self,
@@ -1098,6 +1211,7 @@ static vsi_bool op_setup_default
r_mul_hcand_fc_output->t,
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(cand_fc_output, "Create internal tensor failed", final);
/* Candidate activation */
cand_act_output = vsi_nn_rnn_create_activation(self,
@@ -1105,6 +1219,7 @@ static vsi_bool op_setup_default
p->local->candidate_activation,
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(cand_act_output, "Create internal tensor failed", final);
/* GRU cell output */
memcpy( &attr.dtype, &gate_act_outputs[GRUCELL_GATE_Z]->t->attr.dtype, sizeof( attr.dtype ) );
@@ -1113,6 +1228,7 @@ static vsi_bool op_setup_default
attr.vtl = use_virtual_tensor;
attr.is_const = TRUE;
input_tensor = vsi_nn_internal_new_tensor(self, &attr, 1.0f);
+ CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final);
memset( &attr, 0x00, sizeof(attr) );
//memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
@@ -1131,9 +1247,11 @@ static vsi_bool op_setup_default
}
tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
/* create internal tensor sub node (1-zt)*c */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SUBTRACT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = input_tensor->t;
curr->inputs[1] = gate_act_outputs[GRUCELL_GATE_Z]->t;
curr->outputs[0] = tmp_tensor->t;
@@ -1146,6 +1264,7 @@ static vsi_bool op_setup_default
cand_act_output->t,
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
/* create internal multiply node zt*hstate */
tmp_tensor = create_multiply(self,
@@ -1153,9 +1272,11 @@ static vsi_bool op_setup_default
inputs[GRUCELL_INPUT_H_STATE],
&p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2Z],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final);
/* create internal tensor add node (1-zt)*c + zt*hstate */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = output_tensor->t;
curr->inputs[1] = tmp_tensor->t;
curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT];
@@ -1164,13 +1285,16 @@ static vsi_bool op_setup_default
/* copy output to h_state */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT];
curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE];
vsi_nn_internal_setup_node(self, curr);
return TRUE;
-error:
+final:
+ vsi_safe_release_tensor(wei_r2c_tensor);
+ vsi_safe_release_tensor(bias_r2c_tensor);
return FALSE;
} /* op_setup() */
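
In vsi_nn_op_grucell_ovxlib.c the interesting change is not only the NULL checks but where the temporaries are released: op_setup_float and op_setup_float_cudnn_v2 now defer vsi_safe_release_tensor for bias_z/bias_r (and weights_update/weights_reset) to the final: label, and op_setup_default hoists wei_r2c_tensor/bias_r2c_tensor to function scope so the error path can free them too. A structural sketch, with make_tensor_a/make_tensor_b as hypothetical stand-ins for the vsi_nn_ConcatTensor / vsi_nn_ConstTensorAdd calls:

    static vsi_bool setup_with_deferred_release( void )
    {
        vsi_bool ret = FALSE;                       /* pessimistic default */
        vsi_nn_tensor_t * tmp_a = NULL;
        vsi_nn_tensor_t * tmp_b = NULL;

        tmp_a = make_tensor_a();                    /* hypothetical allocator, may return NULL */
        CHECK_PTR_FAIL_GOTO( tmp_a, "Create tensor failed", final );

        tmp_b = make_tensor_b();
        CHECK_PTR_FAIL_GOTO( tmp_b, "Create tensor failed", final );

        /* ... consume tmp_a / tmp_b while building the internal graph ... */

        ret = TRUE;
    final:
        vsi_safe_release_tensor( tmp_a );           /* runs on success and on every early exit */
        vsi_safe_release_tensor( tmp_b );
        return ret;
    }
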
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c
index dbec83887..4a07faab6 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c
@@ -94,6 +94,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(node);
+
if( outputs[0]->attr.dim_num == VSI_NN_DIM_AUTO )
{
outputs[0]->attr.dim_num = inputs[2]->attr.dim_num;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c
index 46ee1d284..cc4b44362 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c
@@ -68,6 +68,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -79,6 +82,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = 2;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c
index d9b3b320f..5386af725 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c
@@ -60,6 +60,10 @@ static vsi_status op_compute
{
vsi_status status = VSI_SUCCESS;
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
return status;
} /* op_compute() */
@@ -124,6 +128,12 @@ vsi_status vsi_nn_op_imageprocess_single_node
vsi_nn_tensor_t *tensor_out
)
{
+ VSI_UNREFERENCED(graph);
+ VSI_UNREFERENCED(attr);
+ VSI_UNREFERENCED(p);
+ VSI_UNREFERENCED(data);
+ VSI_UNREFERENCED(tensor_out);
+
return VSI_SUCCESS;
}
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c
index 9a2043e9e..2066865a5 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c
@@ -34,6 +34,7 @@
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
+#include "vsi_nn_error.h"
/*
Declare number of input and output.
@@ -50,6 +51,9 @@ static vsi_status op_compute
{
vsi_status status = VSI_SUCCESS;
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
status = vsi_nn_internal_compute_node( self );
return status;
@@ -64,6 +68,9 @@ static vsi_bool op_check
{
vsi_nn_interp_param *p = NULL;
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
p = &self->nn_param.interp;
if ((p->pad_beg > 0) || (p->pad_end > 0))
@@ -166,8 +173,10 @@ static vsi_bool op_setup
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor);
crop_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(crop_tensor, "Create internal tensor failed", final);
crop_in_tensor = crop_tensor->t;
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 1, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num;
curr->node->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num;
curr->node->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num;
@@ -177,10 +186,13 @@ static vsi_bool op_setup
curr->node->nn_param.strided_slice.new_axis_mask = 0;
begin_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(begin_dims, curr, "Create internal buffer failed", final);
end_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(end_dims, curr, "Create internal buffer failed", final);
stride_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(stride_dims, curr, "Create internal buffer failed", final);
for (i = 0; i < inputs[0]->attr.dim_num; i++)
{
stride_dims[i] = 1;
@@ -215,6 +227,7 @@ static vsi_bool op_setup
&& (height_in_eff_ == (vsi_ssize_t)outputs[0]->attr.size[1]))
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 1, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = crop_in_tensor;
curr->outputs[0] = outputs[0];
vsi_nn_internal_setup_node(self, curr);
@@ -222,6 +235,7 @@ static vsi_bool op_setup
else
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_INTERNAL, 1, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.resize_internal.align_corners = vx_true_e;
curr->node->nn_param.resize_internal.factor = factor;
curr->node->nn_param.resize_internal.half_pixel_centers = vx_false_e;
@@ -231,6 +245,8 @@ static vsi_bool op_setup
}
return TRUE;
+final:
+ return FALSE;
} /* op_setup() */
static vsi_status op_optimize
@@ -243,6 +259,9 @@ static vsi_status op_optimize
{
vsi_status status;
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
status = VSI_SUCCESS;
vsi_nn_internal_optimize_node( self, direction );
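
vsi_nn_op_interp.c introduces the _RLS_INTERNAL_NODE variant of the guard: when a node parameter buffer fails to allocate after the internal node was already created, the half-built node is released before jumping to final:, so it is not left dangling in the node workspace. A sketch of the intent only; the exact macro body is in vsi_nn_error.h and the release call shown is an assumption:

    /* Intent sketch, not the real macro body. */
    #define CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( ptr, inode, msg, label ) \
        do { \
            if ( NULL == (ptr) ) { \
                VSILOGE( "%s", (msg) ); \
                vsi_nn_internal_release_node( &(inode) );  /* assumed helper: drop the half-built node */ \
                goto label; \
            } \
        } while (0)

Usage then mirrors the interp.c hunk: create the STRIDED_SLICE node, guard it with the plain macro, and guard each vsi_nn_internal_new_node_param allocation with the _RLS_ variant so the node itself is torn down if any of its parameter buffers cannot be allocated.
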
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c
index cff15071e..242099b11 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c
@@ -42,7 +42,7 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
- vsi_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
int32_t axis = self->nn_param.l2_normalize.axis;
vsi_nn_kernel_param_t * param = NULL;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c
index e872a3dc5..d52eb7d19 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c
@@ -54,7 +54,7 @@ static vsi_nn_tensor_t* _expand_scale_tensor
vsi_size_t scale_size_out
)
{
- vsi_status status = VX_SUCCESS;
+ vsi_status status = VSI_FAILURE;
float* f32_in_buffer = NULL;
float* f32_out_buffer = NULL;
vsi_size_t i = 0;
@@ -144,13 +144,7 @@ static vsi_bool _check_value_is_equal_to_one
}
}
- if ( !tensor->attr.is_created_from_handle )
- {
- if ( tensor_data )
- {
- free(tensor_data);
- }
- }
+ vsi_nn_safe_free(tensor_data);
return ret;
}
@@ -324,7 +318,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_bool ret = TRUE;
+ vsi_bool ret = FALSE;
vsi_nn_internal_node_t* curr = NULL;
if( NULL == self )
@@ -349,10 +343,11 @@ static vsi_bool op_setup
{
self->nn_param.l2normalizescale.local.use_internal_node = TRUE;
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
}
else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 &&
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) ||
@@ -370,8 +365,10 @@ static vsi_bool op_setup
attr.vtl = TRUE;
attr.is_const = FALSE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis;
curr->inputs[0] = inputs[0];
curr->outputs[0] = output_tensor->t;
@@ -389,22 +386,26 @@ static vsi_bool op_setup
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
}
reshape_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create internal tensor failed", final);
+
vsi_nn_ConvertTensor(self->graph, inputs[1], reshape_tensor->t);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = output_tensor->t;
curr->inputs[1] = reshape_tensor->t;
curr->node->nn_param.multiply.scale = 1.0f;
curr->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
curr->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN;
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
}
else
{
ret = vsi_nn_op_common_setup(self, inputs, outputs);
}
+final:
return ret;
}
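
Related to the NULL guards, vsi_nn_op_l2normalizescale.c also stops discarding the result of vsi_nn_internal_setup_node: ret starts as FALSE and is assigned from the setup call, so a setup failure (or an allocation failure reaching final:) is reported to the caller instead of being masked by an unconditional TRUE. The shape of the change, condensed:

    static vsi_bool op_setup_sketch( vsi_nn_node_t * self,
                                     vsi_nn_tensor_t ** inputs,
                                     vsi_nn_tensor_t ** outputs )
    {
        vsi_bool ret = FALSE;                        /* was TRUE before the patch */
        vsi_nn_internal_node_t * curr = NULL;

        curr = vsi_nn_internal_new_node( self, VSI_NN_OP_L2_NORMALIZE, 0, 0 );
        CHECK_PTR_FAIL_GOTO( curr, "Create internal node failed", final );
        curr->inputs[0]  = inputs[0];
        curr->outputs[0] = outputs[0];
        ret = vsi_nn_internal_setup_node( self, curr );  /* propagate instead of discarding */
    final:
        return ret;
    }
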
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c
index 75354a7c5..a90ae594b 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c
@@ -37,6 +37,7 @@
#include "vsi_nn_tensor_util_prv.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
@@ -116,11 +117,15 @@ static vsi_bool op_setup
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
mean_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(mean_tensor, "Create internal tensor failed", final);
vari_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(vari_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MOMENTS, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
axis_array = (int32_t*)\
vsi_nn_internal_new_node_param(curr, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(axis_array, curr, "Create internal buffer failed", final);
axis_array[0] = axis;
curr->node->nn_param.moments.axis = axis_array;
@@ -131,6 +136,7 @@ static vsi_bool op_setup
vsi_nn_internal_setup_node( self, curr );
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_BATCHNORM_SINGLE, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->inputs[1] = mean_tensor->t;
curr->inputs[2] = vari_tensor->t;
@@ -138,13 +144,14 @@ static vsi_bool op_setup
curr->inputs[4] = inputs[1];
curr->node->nn_param.batchnorm_single.eps = self->nn_param.layernorm.eps;
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
}
else
{
ret = vsi_nn_op_common_setup(self, inputs, outputs);
}
+final:
return ret;
}
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c
index fd12173cf..34c329c4c 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c
@@ -112,6 +112,8 @@ static vsi_bool _log_softmax_op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(kernel_name);
+
/* TODO: Add code to comput outputs' shape. */
if( NULL == self )
{
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c
index e44440ead..6bddcff6e 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c
@@ -100,6 +100,8 @@ static vsi_bool op_setup
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool ret = TRUE;
+ VSI_UNREFERENCED(self);
+
out_rank = inputs[0]->attr.dim_num;
for(i = 0; i < out_rank; i++)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c
index 01695c42b..7cb068ed0 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c
@@ -106,6 +106,8 @@ static vsi_bool op_setup
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool ret = TRUE;
+ VSI_UNREFERENCED(self);
+
in1_rank = inputs[0]->attr.dim_num;
in2_rank = inputs[1]->attr.dim_num;
out_rank = vsi_nn_max( in1_rank, in2_rank );
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c
index 7a3eb91c0..9547d8be8 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c
@@ -164,6 +164,8 @@ static vsi_bool op_setup
{
vsi_size_t i = 0;
+ VSI_UNREFERENCED(self);
+
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c
index 63a85f7ab..8d55f065d 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c
@@ -34,6 +34,7 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
+#include "vsi_nn_error.h"
static vsi_status op_compute
(
@@ -42,15 +43,17 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
- vsi_status status = VSI_SUCCESS;
+ vsi_status status = VSI_FAILURE;
vsi_nn_tensor_t * type_tensor = NULL;
vx_nn_lshproj_params_t p;
vx_bool valued = TRUE;
vsi_nn_tensor_t * weight_tensor = NULL;
+ float* const_data = NULL;
type_tensor = vsi_nn_VariableToTensor(self,
(uint8_t *)&self->nn_param.lsh_projection.type,
VSI_NN_TYPE_INT32);
+ CHECK_PTR_FAIL_GOTO( type_tensor, "Create tensor fail.", final );
memset(&p, 0, sizeof(p));
p.hash_func = REQUIRED_IO(inputs[0]);
@@ -65,7 +68,9 @@ static vsi_status op_compute
float const_one = 1.0;
vsi_size_t i;
vsi_size_t count = inputs[1]->attr.size[1];
- float* const_data = (float*)malloc(count * sizeof(float));
+
+ const_data = (float*)malloc(count * sizeof(float));
+ CHECK_PTR_FAIL_GOTO( const_data, "Create buffer fail.", final );
for (i = 0; i < count; i++)
{
@@ -78,9 +83,8 @@ static vsi_status op_compute
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
weight_tensor = vsi_nn_CreateTensorFromData(self->graph,
(uint8_t *)const_data, &attr);
+ CHECK_PTR_FAIL_GOTO( weight_tensor, "Create tensor fail.", final );
p.weights = weight_tensor->t;
- free(const_data);
- //valued = FALSE;
}
vxSetTensorAttribute(p.weights, VX_TENSOR_VALUE, &valued, sizeof(vx_bool));
@@ -90,8 +94,12 @@ static vsi_status op_compute
{
status = VSI_FAILURE;
}
- vsi_nn_ReleaseTensor( &type_tensor );
- if (weight_tensor != NULL) vsi_nn_ReleaseTensor(&weight_tensor);
+
+final:
+ vsi_nn_safe_free(const_data);
+ vsi_safe_release_tensor( type_tensor );
+ vsi_safe_release_tensor( weight_tensor );
+
return status;
} /* op_compute() */
@@ -102,6 +110,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
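
The lsh_projection op_compute rework is the same consolidation applied to heap buffers: const_data, type_tensor and weight_tensor are all declared up front and released once at final: with the NULL-safe helpers, which also fixes the leak of const_data when tensor creation failed part-way through. Condensed sketch, with create_weight_tensor as a hypothetical stand-in for the vsi_nn_CreateTensorFromData call:

    static vsi_status compute_sketch( vsi_size_t count )
    {
        vsi_status        status = VSI_FAILURE;
        float           * buf    = NULL;
        vsi_nn_tensor_t * weight = NULL;

        buf = (float *)malloc( count * sizeof(float) );
        CHECK_PTR_FAIL_GOTO( buf, "Create buffer fail.", final );

        weight = create_weight_tensor( buf, count );   /* hypothetical helper */
        CHECK_PTR_FAIL_GOTO( weight, "Create tensor fail.", final );

        status = VSI_SUCCESS;
    final:
        vsi_nn_safe_free( buf );                 /* NULL-safe, so one exit path covers */
        vsi_safe_release_tensor( weight );       /* success and every failure alike */
        return status;
    }
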
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c
index 900e50b7d..d3cc0c824 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c
@@ -202,6 +202,8 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(outputs);
//TODO: Check tensor shapes.
if( inputs[0]->attr.dim_num < 3)
{
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c
index 283f930b5..ebd17a3f2 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c
@@ -35,9 +35,9 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
#include "vsi_nn_internal_node.h"
#include "vsi_nn_rnn_helper.h"
+#include "vsi_nn_error.h"
static vsi_bool setup_op_shapes
(
@@ -82,6 +82,7 @@ static vsi_bool setup_op_shapes
attr.is_const = TRUE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
inputs[LSTM_INPUT_H_STATE] = output_tensor->t;
}
@@ -96,6 +97,7 @@ static vsi_bool setup_op_shapes
attr.is_const = TRUE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
inputs[LSTM_INPUT_C_STATE] = output_tensor->t;
}
@@ -107,6 +109,7 @@ static vsi_bool setup_op_shapes
attr.vtl = use_virtual_tensor;
attr.is_const = FALSE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
outputs[LSTM_OUTPUT_H_STATE] = output_tensor->t;
}
@@ -119,6 +122,7 @@ static vsi_bool setup_op_shapes
attr.vtl = use_virtual_tensor;
attr.is_const = FALSE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
outputs[LSTM_OUTPUT_C_STATE] = output_tensor->t;
}
@@ -156,6 +160,8 @@ static vsi_bool setup_op_shapes
}
return TRUE;
+final:
+ return FALSE;
}
static vsi_status op_compute
@@ -165,6 +171,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -175,6 +183,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -187,6 +198,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -211,6 +224,8 @@ static vsi_bool op_setup
uint32_t batch_size = 0;
uint32_t time_step = 0;
uint32_t i = 0;
+ vsi_bool ret = FALSE;
+ vsi_status status = VSI_FAILURE;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_node_wksp( self );
@@ -235,21 +250,26 @@ static vsi_bool op_setup
/* transpose to time_major */
output_tensor = vsi_nn_rnn_transpose_time_major(self,
inputs[LSTM_INPUT_INPUT], NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
input_tensor = output_tensor->t;
}
/* split input tensor */
split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * \
sizeof(vsi_nn_tensor_t *));
+ CHECK_PTR_FAIL_GOTO( split_output_tensors, "Create buffer fail.", final );
memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t *));
lstmunit_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * \
sizeof(vsi_nn_tensor_t *));
+ CHECK_PTR_FAIL_GOTO( lstmunit_reshape_output_tensors, "Create buffer fail.", final );
memset( lstmunit_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t *));
- vsi_nn_rnn_split_input_tensor(self, input_tensor,
+ status = vsi_nn_rnn_split_input_tensor(self, input_tensor,
split_output_tensors, time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
- vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor);
+ status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
last_step_h_state = inputs[LSTM_INPUT_H_STATE];
last_step_c_state = inputs[LSTM_INPUT_C_STATE];
@@ -263,6 +283,7 @@ static vsi_bool op_setup
/* reshape for split output */
output_tensor = vsi_nn_rnn_reshape_split_output(self,
split_output_tensors[i], batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
reshape_output = output_tensor->t;
/* lstmunit output */
@@ -275,6 +296,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[LSTM_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
lstmunit_out0 = output_tensor->t;
}
@@ -284,12 +306,14 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[LSTM_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
lstmunit_out1 = output_tensor->t;
/* lstmunit output c_state */
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[LSTM_OUTPUT_C_STATE]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
lstmunit_out2 = output_tensor->t;
}
else
@@ -299,6 +323,7 @@ static vsi_bool op_setup
}
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation;
curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip;
curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias;
@@ -350,6 +375,7 @@ static vsi_bool op_setup
/* reshape output to 3-dims */
output_tensor = vsi_nn_rnn_reshape_cell_output(self,
lstmunit_out0, batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
lstmunit_reshape_output_tensors[i] = output_tensor->t;
}
}
@@ -362,19 +388,21 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[LSTM_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
tensor = output_tensor->t;
}
/* concat */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 2;
for( i = 0; i < time_step; i++ )
{
curr->inputs[i] = lstmunit_reshape_output_tensors[i];
}
curr->outputs[0] = tensor;
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
if( !curr_param->time_major )
{
@@ -383,11 +411,17 @@ static vsi_bool op_setup
tensor, outputs[LSTM_OUTPUT_OUTPUT], use_virtual_tensor);
}
}
+ else
+ {
+ /* return_sequences = False, return true to setup lstm node. */
+ ret = TRUE;
+ }
+final:
vsi_nn_safe_free( split_output_tensors );
vsi_nn_safe_free( lstmunit_reshape_output_tensors );
- return TRUE;
+ return ret;
} /* op_setup() */
static vsi_status op_deinit
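Note: in the lstm_ovxlib op_setup above, helpers such as vsi_nn_rnn_split_input_tensor() now have their return status captured instead of being ignored. A hedged sketch of that pattern, with CHECK_STATUS_FAIL_GOTO approximated by a local macro (the real one may also log):

    /* Hypothetical local approximation of CHECK_STATUS_FAIL_GOTO. */
    #define CHECK_STATUS_FAIL_GOTO(stat, lbl) \
        do { if ((stat) != 0) goto lbl; } while (0)

    static int split_and_check_sketch(int (*split_fn)(void), int (*check_fn)(void))
    {
        int ret = 0;                            /* FALSE until all steps succeed */
        int status;

        status = split_fn();                    /* e.g. split input per time step */
        CHECK_STATUS_FAIL_GOTO(status, final);

        status = check_fn();                    /* e.g. verify split alignment */
        CHECK_STATUS_FAIL_GOTO(status, final);

        ret = 1;                                /* TRUE */
    final:
        return ret;
    }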
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c
index 7730fee89..13fe0fed8 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c
@@ -222,6 +222,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
//TODO: Check tensor shapes.
return TRUE;
} /* op_check() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c
index 27b545719..22dfd664d 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c
@@ -49,7 +49,7 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
- vsi_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
int32_t _is_ln= 0;
int32_t _is_cifg= 0;
@@ -107,6 +107,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c
index d792d34b2..f715c99ad 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c
@@ -35,7 +35,7 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
+#include "vsi_nn_error.h"
#include "ops/vsi_nn_op_lstmunit_ovxlib.h"
#include "vsi_nn_internal_node.h"
#include "vsi_nn_rnn_helper.h"
@@ -64,8 +64,10 @@ static vsi_nn_internal_tensor_t* create_tp_fc
vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor);
tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final);
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final);
tmp_inode->node->nn_param.fcl.axis = 0;
tmp_inode->node->nn_param.fcl.weights = (uint32_t)weight->attr.size[1];
@@ -75,6 +77,7 @@ static vsi_nn_internal_tensor_t* create_tp_fc
tmp_inode->outputs[0] = tensor2->t;
vsi_nn_internal_setup_node(self, tmp_inode);
+final:
return tensor2;
}
@@ -105,6 +108,7 @@ static vsi_nn_internal_tensor_t* create_nn_fc
vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor);
tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final);
reshaped_weight_shape[3] = weight->attr.size[1];
reshaped_weight_shape[2] = weight->attr.size[0] / ( kernel_h * kernel_w );
@@ -118,10 +122,12 @@ static vsi_nn_internal_tensor_t* create_nn_fc
memcpy( &attr.dtype, &weight->attr.dtype, sizeof(attr.dtype) );
memcpy( &attr.size, &reshaped_weight_shape, sizeof(attr.size));
reshaped_weight_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(reshaped_weight_tensor, "Create internal tensor failed", final);
vsi_nn_ReshapeTensor( self->graph, weight, reshaped_weight_tensor->t, reshaped_weight_shape, 4 );
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final);
tmp_inode->node->nn_param.conv2d.ksize[0] = kernel_w;
tmp_inode->node->nn_param.conv2d.ksize[1] = kernel_h;
tmp_inode->node->nn_param.conv2d.stride[0] = 1;
@@ -141,10 +147,11 @@ static vsi_nn_internal_tensor_t* create_nn_fc
tmp_inode->outputs[0] = tensor2->t;
vsi_nn_internal_setup_node(self, tmp_inode);
+final:
return tensor2;
}
-static void create_peephole
+static vsi_status create_peephole
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input,
@@ -153,6 +160,7 @@ static void create_peephole
vsi_bool use_virtual_tensor
)
{
+ vsi_status status = VSI_FAILURE;
vsi_nn_tensor_attr_t attr;
vsi_nn_internal_tensor_t* input_tensor0 = NULL;
vsi_nn_internal_tensor_t* input_tensor1 = NULL;
@@ -164,8 +172,10 @@ static void create_peephole
attr.is_const = FALSE;
memcpy(&(attr.dtype), &((*input_fc)->t->attr.dtype), sizeof(vsi_nn_dtype_t));
input_tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(input_tensor0, "Create internal tensor failed", final);
/* create internal nodes */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_MULTIPLY, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.multiply.scale = 1.0f;
curr->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE;
curr->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN;
@@ -174,13 +184,19 @@ static void create_peephole
curr->outputs[0] = input_tensor0->t;
vsi_nn_internal_setup_node(self, curr);
input_tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO( input_tensor1, "Create internal tensor fail.", final );
/* create internal nodes */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = (*input_fc)->t;
curr->inputs[1] = input_tensor0->t;
curr->outputs[0] = input_tensor1->t;
vsi_nn_internal_setup_node(self, curr);
*input_fc = input_tensor1;
+
+ status = VSI_SUCCESS;
+final:
+ return status;
}
static vsi_bool setup_op_shapes
@@ -236,6 +252,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -246,6 +264,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -258,6 +279,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -272,7 +295,6 @@ static vsi_bool op_setup
vsi_nn_tensor_attr_t attr;
vsi_bool is_input_fc_on_tp = FALSE;
vsi_bool is_recurrent_fc_on_tp = FALSE;
- vsi_nn_internal_tensor_t* add_tensor = NULL;
vsi_nn_internal_tensor_t* input_tensor = NULL;
vsi_nn_internal_tensor_t* output_tensor = NULL;
vsi_nn_internal_tensor_t* recurrent_input_tensor = NULL;
@@ -364,6 +386,7 @@ static vsi_bool op_setup
bias_tensors[i],
&p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( input_fc_outputs[i], "Create tensor fail.", final );
}
if (inputs[LSTMUNIT_INPUT_AUX_INPUT] != NULL)
{
@@ -375,6 +398,7 @@ static vsi_bool op_setup
NULL,
&p->internal_dtype_aux[LSTMUNIT_QUANTIZE_PARAM_AUX_I2I + i],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( aux_input_fc_outputs[i], "Create tensor fail.", final );
}
}
}
@@ -385,6 +409,7 @@ static vsi_bool op_setup
(uint32_t)inputs[LSTMUNIT_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w);
input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_INPUT],
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final );
for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++)
{
@@ -395,9 +420,11 @@ static vsi_bool op_setup
kernel_h, kernel_w,
&p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( tmp, "Create tensor fail.", final );
/* transpose and reshape output */
input_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self,
tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( input_fc_outputs[i], "Create tensor fail.", final );
}
if (inputs[LSTMUNIT_INPUT_AUX_INPUT] != NULL)
{
@@ -406,6 +433,7 @@ static vsi_bool op_setup
(uint32_t)inputs[LSTMUNIT_INPUT_AUX_INPUT]->attr.size[0], &kernel_h, &kernel_w);
input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_AUX_INPUT],
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final );
for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++)
{
@@ -416,9 +444,11 @@ static vsi_bool op_setup
kernel_h, kernel_w,
&p->internal_dtype_aux[LSTMUNIT_QUANTIZE_PARAM_AUX_I2I + i],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( tmp, "Create tensor fail.", final );
/* transpose and reshape output */
aux_input_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self,
tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( aux_input_fc_outputs[i], "Create tensor fail.", final );
}
}
}
@@ -432,6 +462,7 @@ static vsi_bool op_setup
aux_input_fc_outputs[i]->t,
&p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( input_add_aux_input_fc_outputs[i], "Create tensor fail.", final );
input_fc_outputs[i] = input_add_aux_input_fc_outputs[i];
}
}
@@ -447,6 +478,7 @@ static vsi_bool op_setup
NULL,
&p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( recurrent_fc_outputs[i], "Create tensor fail.", final );
}
}
else
@@ -456,6 +488,7 @@ static vsi_bool op_setup
(uint32_t)inputs[LSTMUNIT_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w);
recurrent_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self,
inputs[LSTMUNIT_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( recurrent_input_tensor, "Create tensor fail.", final );
for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++)
{
@@ -466,31 +499,37 @@ static vsi_bool op_setup
kernel_h, kernel_w,
&p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( tmp, "Create tensor fail.", final );
/* transpose and reshape output */
recurrent_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self,
tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( recurrent_fc_outputs[i], "Create tensor fail.", final );
}
}
if (p->local->use_peephole)
{
+ vsi_status status = VSI_FAILURE;
/* update input gate */
if (!p->local->use_cifg)
{
- create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE],
+ status = create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE],
inputs[LSTMUNIT_INPUT_WEIGHT_C2I], &(input_fc_outputs[0]),
use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO( status, final );
}
/* update forget gate */
- create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE],
+ status = create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE],
inputs[LSTMUNIT_INPUT_WEIGHT_C2F], &(input_fc_outputs[1]),
use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO( status, final );
/* update output gate */
- create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE],
+ status = create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE],
inputs[LSTMUNIT_INPUT_WEIGHT_C2O], &(input_fc_outputs[3]),
use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO( status, final );
}
/* layernorm */
@@ -498,59 +537,31 @@ static vsi_bool op_setup
{
for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ )
{
- if (self->graph->ctx->config.support_stream_processor)
- {
- memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
- attr.dim_num = VSI_NN_DIM_AUTO;
- attr.vtl = use_virtual_tensor;
- attr.is_const = FALSE;
- attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
- attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;
- add_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
- /* create internal nodes */
- curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 );
- curr->inputs[0] = input_fc_outputs[i]->t;
- curr->inputs[1] = recurrent_fc_outputs[i]->t;
- curr->outputs[0] = add_tensor->t;
- vsi_nn_internal_setup_node(self, curr);
-
- /* create internal nodes */
- input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
- curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LAYER_NORM, 0, 0 );
- curr->node->nn_param.layernorm.eps = (float)1e-8;
- curr->inputs[0] = add_tensor->t;
- curr->inputs[1] = inputs[LSTMUNIT_INPUT_BIAS_I + i];
- curr->inputs[2] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i];
- curr->outputs[0] = input_tensor->t;
- vsi_nn_internal_setup_node(self, curr);
-
- layernorm_outputs[i] = input_tensor;
- }
- else
- {
- memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
- attr.dim_num = VSI_NN_DIM_AUTO;
- attr.vtl = use_virtual_tensor;
- attr.is_const = FALSE;
- attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
- attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;
- input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
-
- /* create internal nodes */
- curr = vsi_nn_internal_new_node( self, VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM, 0, 0 );
- curr->node->nn_param.tensor_add_mean_stddev_norm.eps = (float)1e-8;
- curr->inputs[0] = input_fc_outputs[i]->t;
- curr->inputs[1] = recurrent_fc_outputs[i]->t;
- curr->outputs[0] = input_tensor->t;
- vsi_nn_internal_setup_node(self, curr);
-
- layernorm_outputs[i] = input_tensor;
- }
+ memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ attr.dim_num = VSI_NN_DIM_AUTO;
+ attr.vtl = use_virtual_tensor;
+ attr.is_const = FALSE;
+ attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
+ attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;
+ input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final );
+
+ /* create internal nodes */
+ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+ curr->node->nn_param.tensor_add_mean_stddev_norm.eps = (float)1e-8;
+ curr->inputs[0] = input_fc_outputs[i]->t;
+ curr->inputs[1] = recurrent_fc_outputs[i]->t;
+ curr->outputs[0] = input_tensor->t;
+ vsi_nn_internal_setup_node(self, curr);
+
+ layernorm_outputs[i] = input_tensor;
}
}
/* activations */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_ACTIVATION, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.lstmunit_activation.cell_clip = p->cell_clip;
curr->node->nn_param.lstmunit_activation.proj_clip = p->proj_clip;
curr->node->nn_param.lstmunit_activation.forget_bias = p->forget_bias;
@@ -562,10 +573,9 @@ static vsi_bool op_setup
curr->node->nn_param.lstmunit_activation.recurrent_activation = p->recurrent_activation;
curr->inputs[LSTMUNIT_ACT_CSTATE_IN] = inputs[LSTMUNIT_INPUT_C_STATE];
- for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ )
+ for ( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ )
{
- if( (p->local->use_layer_norm && !self->graph->ctx->config.support_stream_processor) ||
- p->local->use_hybrid )
+ if( p->local->use_layer_norm || p->local->use_hybrid )
{
curr->inputs[LSTMUNIT_ACT_DATA_BI + i] = inputs[LSTMUNIT_INPUT_BIAS_I + i];
}
@@ -573,14 +583,7 @@ static vsi_bool op_setup
if( p->local->use_layer_norm )
{
/* Pass layernorm weights to VSI_NN_OP_LSTMUNIT_ACTIVATION */
- if (self->graph->ctx->config.support_stream_processor)
- {
- curr->inputs[LSTMUNIT_ACT_LN_WI + i] = NULL;
- }
- else
- {
- curr->inputs[LSTMUNIT_ACT_LN_WI + i] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i];
- }
+ curr->inputs[LSTMUNIT_ACT_LN_WI + i] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i];
curr->inputs[LSTMUNIT_ACT_INPUT_FC_I + i] = layernorm_outputs[i]->t;
curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + i] = NULL;
}
@@ -616,6 +619,7 @@ static vsi_bool op_setup
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;
}
output_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( output_tensor, curr, "Create tensor fail.", final );
curr->outputs[LSTMUNIT_ACT_OUTPUT] = output_tensor->t;
curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = outputs[LSTMUNIT_OUTPUT_C_STATE];
@@ -637,11 +641,14 @@ static vsi_bool op_setup
use_virtual_tensor = inputs[LSTMUNIT_INPUT_BIAS_PROJ]->attr.vtl;
input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr,
&inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, VSI_NN_OP_FCL, FALSE);
+ CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final );
+
zero_bias_tensor = input_tensor->t;
if (use_virtual_tensor)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[LSTMUNIT_INPUT_BIAS_PROJ];
curr->outputs[0] = zero_bias_tensor;
@@ -656,6 +663,8 @@ static vsi_bool op_setup
{
input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr,
&inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, VSI_NN_OP_FCL, FALSE);
+ CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final );
+
zero_bias_tensor = input_tensor->t;
}
else
@@ -664,6 +673,7 @@ static vsi_bool op_setup
}
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_FCL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.fcl.axis = 0;
curr->node->nn_param.fcl.weights = (uint32_t)inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr.size[1];
@@ -678,12 +688,15 @@ static vsi_bool op_setup
/* copy h_state to output */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE];
curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_OUTPUT];
vsi_nn_internal_setup_node(self, curr);
}
return TRUE;
+final:
+ return FALSE;
} /* op_setup() */
static vsi_status op_deinit
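Note: create_peephole() above changes from returning void to returning vsi_status, so the caller can abort setup when an internal tensor or node cannot be created. A minimal sketch of that conversion (names and types are placeholders, not the library API):

    /* Placeholder helper: reports failure instead of silently continuing. */
    static int create_peephole_sketch(void* cstate, void* weight, void** input_fc)
    {
        int status = -1;                       /* VSI_FAILURE */

        if (cstate == NULL || weight == NULL || input_fc == NULL)
            goto final;                        /* mirrors CHECK_PTR_FAIL_GOTO */

        /* ... create MULTIPLY and ADD internal nodes, update *input_fc ... */
        status = 0;                            /* VSI_SUCCESS */
    final:
        return status;
    }

    /* Caller side: one failed gate aborts the whole op_setup, e.g.        */
    /*   status = create_peephole_sketch(c_state, c2f_weight, &fc_out[1]); */
    /*   CHECK_STATUS_FAIL_GOTO(status, final);                            */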
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c
index 846339029..f4005a841 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c
@@ -35,6 +35,8 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
+
#define _ARG_NUM (7)
#define _INPUT_NUM (2)
@@ -49,22 +51,24 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
- vsi_status status = VSI_FAILURE;
- vsi_nn_kernel_param_t * param = NULL;
- vsi_nn_kernel_node_t n = NULL;
- vsi_nn_tensor_t * tmp_inputs[2] = {NULL};
- vsi_nn_tensor_t * tmp_outputs[1] = {NULL};
- vsi_nn_tensor_t * rs_input = NULL;
- vsi_nn_tensor_t * rs_output = NULL;
- vsi_size_t shape_in[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
- vsi_size_t shape_out[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
- uint32_t i = 0;
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_kernel_param_t *param = NULL;
+ vsi_nn_kernel_node_t n = NULL;
+ vsi_nn_tensor_t * tmp_inputs[2] = {NULL};
+ vsi_nn_tensor_t * tmp_outputs[1] = {NULL};
+ uint32_t new_rank[3] = {0};
+ vsi_bool ret = FALSE;
+ vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
int32_t transposeA = self->nn_param.matrixmul.transpose[0];
int32_t transposeB = self->nn_param.matrixmul.transpose[1];
int32_t adjointA = self->nn_param.matrixmul.adjoint[0];
int32_t adjointB = self->nn_param.matrixmul.adjoint[1];
+ uint32_t cross_flg = 0;
+ uint32_t size_axis_inner_outer[3] = {0};
+ uint32_t stride_axis_inner_outer[9] = {0};
+
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "transposeA", transposeA );
@@ -72,46 +76,35 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_int32( param, "adjointA", adjointA );
vsi_nn_kernel_param_add_int32( param, "adjointB", adjointB );
- if (inputs[0]->attr.dim_num == 1 && inputs[1]->attr.dim_num > 1)
- {
- shape_in[0] = inputs[0]->attr.size[0];
- shape_in[1] = 1;
- shape_out[0] = outputs[0]->attr.size[0];
- shape_out[1] = 1;
- for(i = 2; i <= outputs[0]->attr.dim_num; i++)
- {
- shape_out[i] = outputs[0]->attr.size[i - 1];
- }
- rs_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape_in, 2);
- rs_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape_out, outputs[0]->attr.dim_num + 1);
- tmp_inputs[0] = rs_input;
- tmp_inputs[1] = inputs[1];
- tmp_outputs[0] = rs_output;
- }
- else if (inputs[1]->attr.dim_num == 1 && inputs[0]->attr.dim_num > 1)
- {
- shape_in[0] = 1;
- shape_in[1] = inputs[1]->attr.size[0];
- shape_out[0] = 1;
- for(i = 1; i <= outputs[0]->attr.dim_num; i++)
- {
- shape_out[i] = outputs[0]->attr.size[i - 1];
- }
- rs_input = vsi_nn_reshape_tensor(self->graph, inputs[1], shape_in, 2);
- rs_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape_out, outputs[0]->attr.dim_num + 1);
+ ret = vsi_nn_kernel_optimize_matrixmul_broadcast_shape(
+ inputs[0]->attr.size,
+ inputs[1]->attr.size,
+ outputs[0]->attr.size,
+ inputs[0]->attr.dim_num,
+ inputs[1]->attr.dim_num,
+ outputs[0]->attr.dim_num,
+ shapes[0], shapes[1], shapes[2], new_rank,
+ &cross_flg, size_axis_inner_outer, stride_axis_inner_outer);
+
+ if (ret)
+ {
+ vsi_nn_kernel_param_add_int32( param, "cross_flg", cross_flg );
+ vsi_nn_kernel_param_add_buffer( param, "size_axis_inner_outer", size_axis_inner_outer, 3);
+ vsi_nn_kernel_param_add_buffer( param, "stride_axis_inner_outer", stride_axis_inner_outer, 9);
- tmp_inputs[0] = inputs[0];
- tmp_inputs[1] = rs_input;
- tmp_outputs[0] = rs_output;
+ tmp_inputs[0] = vsi_nn_reshape_tensor(self->graph, inputs[0], shapes[0], new_rank[0]);
+ tmp_inputs[1] = vsi_nn_reshape_tensor(self->graph, inputs[1], shapes[1], new_rank[1]);
+ tmp_outputs[0] = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes[2], new_rank[2]);
}
else
{
- tmp_inputs[0] = inputs[0];
- tmp_inputs[1] = inputs[1];
- tmp_outputs[0] = outputs[0];
+        VSILOGE("illegal input shapes");
+ status = VSI_FAILURE;
+ goto final;
}
+
n = vsi_nn_kernel_selector( self->graph, "matrixmul", tmp_inputs, 2, tmp_outputs, 1, param );
if ( n != NULL )
{
@@ -119,19 +112,15 @@ static vsi_status op_compute
status = VSI_SUCCESS;
}
+final:
if (param != NULL)
{
        vsi_nn_kernel_param_release( &param );
}
- if (rs_input != NULL)
- {
- vsi_nn_ReleaseTensor( &rs_input );
- }
- if (rs_output != NULL)
- {
- vsi_nn_ReleaseTensor( &rs_output );
- }
+ vsi_safe_release_tensor( tmp_inputs[0] );
+ vsi_safe_release_tensor( tmp_inputs[1] );
+ vsi_safe_release_tensor( tmp_outputs[0] );
return status;
} /* op_compute() */
@@ -282,32 +271,17 @@ static vsi_bool op_setup
outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1];
}
}
- else if (inputs[0]->attr.dim_num > inputs[1]->attr.dim_num)
- {
- for (i = 2; i < inputs[0]->attr.dim_num; i++)
- {
- outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
- }
- }
- else if (inputs[1]->attr.dim_num > inputs[0]->attr.dim_num)
- {
- for (i = 2; i < inputs[1]->attr.dim_num; i++)
- {
- outputs[0]->attr.size[i] = inputs[1]->attr.size[i];
- }
- }
- else if (inputs[0]->attr.size[2] >= inputs[1]->attr.size[2])
- {
- for (i = 2; i < inputs[0]->attr.dim_num; i++)
- {
- outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
- }
- }
else
{
- for (i = 2; i < inputs[1]->attr.dim_num; i++)
+ uint32_t rank0 = inputs[0]->attr.dim_num;
+ uint32_t rank1 = inputs[1]->attr.dim_num;
+ for (i = 2; i < outputs[0]->attr.dim_num; i++)
{
- outputs[0]->attr.size[i] = inputs[1]->attr.size[i];
+ vsi_size_t sz0 = i < rank0 ? inputs[0]->attr.size[i] : 1;
+ vsi_size_t sz1 = i < rank1 ? inputs[1]->attr.size[i] : 1;
+ vsi_size_t sz2 = vsi_nn_max(sz0, sz1);
+
+ outputs[0]->attr.size[i] = sz2;
}
}
}
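Note: the rewritten matrixmul op_setup branch above computes each batch dimension of the output as the larger of the two input sizes, treating a missing dimension as 1. A small sketch of that rule (assuming the shapes are broadcast-compatible; vsi_size_t is approximated by size_t):

    #include <stddef.h>

    static void broadcast_batch_dims_sketch(const size_t* a, size_t rank_a,
                                            const size_t* b, size_t rank_b,
                                            size_t* out, size_t out_rank)
    {
        size_t i;
        for (i = 2; i < out_rank; i++)          /* dims 0/1 are the matrix dims */
        {
            size_t sz0 = (i < rank_a) ? a[i] : 1;
            size_t sz1 = (i < rank_b) ? b[i] : 1;
            out[i] = (sz0 > sz1) ? sz0 : sz1;   /* e.g. {4,1} and {1,3} -> {4,3} */
        }
    }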
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c
index 57f8cad39..a94df5511 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c
@@ -36,6 +36,7 @@
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_link_list.h"
#include "vsi_nn_internal_node.h"
+#include "vsi_nn_error.h"
typedef struct _max_pool3d_local_data_t {
int32_t placeholder;
@@ -54,6 +55,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -79,6 +82,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
}
@@ -89,7 +94,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_bool ret = TRUE;
+ vsi_bool ret = FALSE;
vsi_nn_max_pool3d_param *p = &(self->nn_param.max_pool3d);
vsi_size_t ksize[_cnt_of_array(p->ksize)] = {0}, i = 0;
vsi_size_t pad[_cnt_of_array(p->pad)] = {0};
@@ -173,10 +178,14 @@ static vsi_bool op_setup
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE);
input_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final);
pool2d_0_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(pool2d_0_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
reshape_input_size = vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_input_size, curr, "Create internal buffer failed", final);
reshape_input_size[0] = inputs[0]->attr.size[0];
reshape_input_size[1] = inputs[0]->attr.size[1];
reshape_input_size[2] = 1;
@@ -189,9 +198,10 @@ static vsi_bool op_setup
curr->node->nn_param.reshape2.dim_num = 4;
curr->inputs[0] = inputs[0];
curr->outputs[0] = input_tensor->t;
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_POOL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.pool.ksize[0] = p->ksize[0];
curr->node->nn_param.pool.ksize[1] = p->ksize[1];
curr->node->nn_param.pool.stride[0] = p->stride[0];
@@ -205,28 +215,33 @@ static vsi_bool op_setup
curr->node->nn_param.pool.pad_type = p->pad_type;
curr->inputs[0] = input_tensor->t;
curr->outputs[0] = pool2d_0_tensor->t;
- vsi_nn_internal_setup_node( self, curr );
+ ret &= vsi_nn_internal_setup_node( self, curr );
if (p->ksize[2] == 1 && p->stride[2] == 1 && p->pad[4] == 0 && p->pad[5] == 0)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.reshape2.size = outputs[0]->attr.size;
curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num;
curr->inputs[0] = pool2d_0_tensor->t;
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node( self, curr );
+ ret &= vsi_nn_internal_setup_node( self, curr );
}
else
{
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE);
reshape_0_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(reshape_0_tensor, "Create internal tensor failed", final);
pool2d_1_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(pool2d_1_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
reshape_pool_size = vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
- reshape_pool_size[0] = -1;
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_pool_size, curr, "Create internal buffer failed", final);
+ reshape_pool_size[0] = (vsi_size_t)-1;
reshape_pool_size[1] = inputs[0]->attr.size[2];
reshape_pool_size[2] = 1;
for (i = 3; i < inputs[0]->attr.dim_num; i++)
@@ -238,9 +253,10 @@ static vsi_bool op_setup
curr->node->nn_param.reshape2.dim_num = 4;
curr->inputs[0] = pool2d_0_tensor->t;
curr->outputs[0] = reshape_0_tensor->t;
- vsi_nn_internal_setup_node( self, curr );
+ ret &= vsi_nn_internal_setup_node( self, curr );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_POOL, 1, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.pool.ksize[0] = 1;
curr->node->nn_param.pool.ksize[1] = p->ksize[2];
curr->node->nn_param.pool.stride[0] = 1;
@@ -254,16 +270,18 @@ static vsi_bool op_setup
curr->node->nn_param.pool.pad_type = p->pad_type;
curr->inputs[0] = reshape_0_tensor->t;
curr->outputs[0] = pool2d_1_tensor->t;
- vsi_nn_internal_setup_node( self, curr );
+ ret &= vsi_nn_internal_setup_node( self, curr );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.reshape2.size = outputs[0]->attr.size;
curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num;
curr->inputs[0] = pool2d_1_tensor->t;
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node( self, curr );
+ ret &= vsi_nn_internal_setup_node( self, curr );
}
+final:
return ret;
} /* op_setup() */
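Note: max_pool3d's op_setup above now starts with ret = FALSE and folds every vsi_nn_internal_setup_node() result into it, so a single failed internal node makes the whole setup fail. A tiny sketch, assuming vsi_bool holds 0 or 1 so bitwise &= acts as logical AND:

    typedef int vsi_bool_sketch;                /* stand-in for vsi_bool */

    static vsi_bool_sketch setup_chain_sketch(vsi_bool_sketch first,
                                              vsi_bool_sketch second,
                                              vsi_bool_sketch third)
    {
        vsi_bool_sketch ret = 0;                /* FALSE until proven otherwise */
        ret  = first;                           /* reshape input */
        ret &= second;                          /* pool2d */
        ret &= third;                           /* reshape output */
        return ret;                             /* TRUE only if every step set up */
    }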
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c
index 9df9c1b27..2deed48b7 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c
@@ -56,20 +56,29 @@ static vsi_status op_compute
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
uint32_t new_rank = 0;
vsi_nn_kernel_param_t * param = NULL;
- int32_t ksize_x = (int32_t)self->nn_param.pool.ksize[0];
- int32_t ksize_y = (int32_t)self->nn_param.pool.ksize[1];
- int32_t stride_x = (int32_t)self->nn_param.pool.stride[0];
- int32_t stride_y = (int32_t)self->nn_param.pool.stride[1];
- int32_t pad_left = (int32_t)self->nn_param.pool.pad[0];
- int32_t pad_right = (int32_t)self->nn_param.pool.pad[1];
- int32_t pad_top = (int32_t)self->nn_param.pool.pad[2];
- int32_t pad_bottom = (int32_t)self->nn_param.pool.pad[3];
+ int32_t ksize_x = 0;
+ int32_t ksize_y = 0;
+ int32_t stride_x = 0;
+ int32_t stride_y = 0;
+ int32_t pad_left = 0;
+ int32_t pad_right = 0;
+ int32_t pad_top = 0;
+ int32_t pad_bottom = 0;
if ( NULL == self )
{
return VSI_FAILURE;
}
+ ksize_x = (int32_t)self->nn_param.pool.ksize[0];
+ ksize_y = (int32_t)self->nn_param.pool.ksize[1];
+ stride_x = (int32_t)self->nn_param.pool.stride[0];
+ stride_y = (int32_t)self->nn_param.pool.stride[1];
+ pad_left = (int32_t)self->nn_param.pool.pad[0];
+ pad_right = (int32_t)self->nn_param.pool.pad[1];
+ pad_top = (int32_t)self->nn_param.pool.pad[2];
+ pad_bottom = (int32_t)self->nn_param.pool.pad[3];
+
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_optimize_nchw2xhw_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num,
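Note: in maxpoolwithargmax (and in mod below) the initializers that dereference self are moved after the NULL check, so the guard is no longer dead code. A minimal sketch with placeholder field names:

    struct pool_node_sketch { int ksize_x; int ksize_y; };

    static int read_params_sketch(const struct pool_node_sketch* self)
    {
        int ksize_x = 0;
        int ksize_y = 0;

        if (self == NULL)
            return -1;                  /* VSI_FAILURE */

        ksize_x = self->ksize_x;        /* safe: only reached when self != NULL */
        ksize_y = self->ksize_y;
        return ksize_x * ksize_y;
    }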
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c b/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c
index 29310ad96..7be779db1 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c
@@ -59,13 +59,15 @@ static vsi_status op_compute
vsi_size_t new_rank = 0;
vsi_bool ret;
vsi_nn_kernel_param_t * param = NULL;
- int32_t isfmod = (int32_t)self->nn_param.mod.fmod;
+ int32_t isfmod = 0;
if (NULL == self)
{
return VSI_FAILURE;
}
+ isfmod = (int32_t)self->nn_param.mod.fmod;
+
param = vsi_nn_kernel_param_create();
ret = vsi_nn_kernel_optimize_eltwise_shape(
@@ -183,6 +185,8 @@ static vsi_bool op_setup
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool ret = TRUE;
+ VSI_UNREFERENCED(self);
+
in1_rank = inputs[0]->attr.dim_num;
in2_rank = inputs[1]->attr.dim_num;
out_rank = vsi_nn_max( in1_rank, in2_rank );
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c
index 8276c0f7c..39dda244d 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c
@@ -71,13 +71,14 @@ static void _set_io_index
vxSetParameterByIndex(self->n, idx++, (vx_reference)inputs[i]->t);
scalar_index = idx;
param = vxGetParameterByIndex(self->n, scalar_index);
- vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum));
- if (param != NULL)
+
+ if (param)
{
+ vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum));
             vxReleaseParameter(&param);
param = NULL;
-
}
+
if (type != VX_TYPE_SCALAR)
{
continue;
@@ -92,17 +93,18 @@ static void _set_io_index
vx_reference ref = 0;
vsi_status status;
param = vxGetParameterByIndex(self->n, j);
- vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference));
- status = vxQueryScalar((vx_scalar)ref, VX_SCALAR_TYPE, &data_type, sizeof(vx_enum));
- if (status == VX_ERROR_INVALID_REFERENCE)
- {
- vx_scalar scalar = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, 0);
- ref = (vx_reference)scalar;
- vxSetParameterByIndex(self->n, idx++, ref);
- vxReleaseReference(&ref);
- }
- if (param != NULL)
+
+ if (param)
{
+ vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference));
+ status = vxQueryScalar((vx_scalar)ref, VX_SCALAR_TYPE, &data_type, sizeof(vx_enum));
+ if (status == VX_ERROR_INVALID_REFERENCE)
+ {
+ vx_scalar scalar = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, 0);
+ ref = (vx_reference)scalar;
+ vxSetParameterByIndex(self->n, idx++, ref);
+ vxReleaseReference(&ref);
+ }
             vxReleaseParameter(&param);
param = NULL;
}
@@ -165,6 +167,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
@@ -178,6 +183,9 @@ static vsi_bool op_setup
/*
* Network Binary Graph node do not need to calculate output shape
*/
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_setup() */
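Note: _set_io_index above now queries a parameter only when vxGetParameterByIndex() returned a valid handle, and releases it inside the same guard. A short sketch against the public OpenVX API (error handling of the query itself is omitted):

    #include <VX/vx.h>

    static vx_enum query_param_type_sketch(vx_node node, vx_uint32 index)
    {
        vx_enum type = VX_TYPE_INVALID;
        vx_parameter param = vxGetParameterByIndex(node, index);

        if (param)
        {
            vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum));
            vxReleaseParameter(&param);     /* release only a valid handle */
            param = NULL;
        }
        return type;
    }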
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c
index 3c8a57d0a..acd1c9eae 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c
@@ -85,6 +85,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -96,6 +99,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = 1;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c
index 71a5e0786..766392ac4 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c
@@ -43,6 +43,9 @@ static vsi_status op_compute
)
{
int i;
+
+ VSI_UNREFERENCED(self);
+
for( i = 0; i < 10; i ++ )
{
if( NULL == outputs[i] )
@@ -65,6 +68,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c
index 2c7dba946..111fc3d3c 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c
@@ -156,6 +156,7 @@ static vsi_status op_compute
attr.is_const = FALSE;
convert_tensor = vsi_nn_CreateTensor(self->graph, &attr);
+ CHECK_PTR_FAIL_GOTO( convert_tensor, "Create tensor fail.", final );
self->n = vxTensorCopyNode(
self->graph->g,
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c
index 399d0c6be..146ee332f 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c
@@ -51,6 +51,8 @@ static vsi_bool _is_same_memory_shape
uint32_t dim_num0 = inputs[0]->attr.dim_num;
uint32_t dim_num1 = self->nn_param.permute.dim_num;
+ VSI_UNREFERENCED(outputs);
+
if (dim_num0 != dim_num1)
return FALSE;
@@ -102,6 +104,8 @@ static vsi_bool _is_same_quant
{
vsi_nn_dtype_t *dtype,*_dtype;
+ VSI_UNREFERENCED(self);
+
dtype = &inputs[0]->attr.dtype;
_dtype = &outputs[0]->attr.dtype;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c
index cfdf7c2f1..24b0d6260 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c
@@ -34,7 +34,6 @@
#include "utils/vsi_nn_util.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
-#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "utils/vsi_nn_constraint_check.h"
@@ -136,21 +135,28 @@ static vsi_status op_compute
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
uint32_t new_rank = 0;
- vsi_bool ret;
+ vsi_bool ret = FALSE;
vsi_nn_kernel_param_t * param = NULL;
- int32_t ksize_x = (int32_t)self->nn_param.pool.ksize[0];
- int32_t ksize_y = (int32_t)self->nn_param.pool.ksize[1];
- int32_t stride_x = (int32_t)self->nn_param.pool.stride[0];
- int32_t stride_y = (int32_t)self->nn_param.pool.stride[1];
- int32_t pad_x = (int32_t)self->nn_param.pool.pad[0];
- int32_t pad_y = (int32_t)self->nn_param.pool.pad[2];
+ int32_t ksize_x = 0;
+ int32_t ksize_y = 0;
+ int32_t stride_x = 0;
+ int32_t stride_y = 0;
+ int32_t pad_x = 0;
+ int32_t pad_y = 0;
- if( NULL == self )
+ if ( NULL == self )
{
return VSI_FAILURE;
}
- param =vsi_nn_kernel_param_create();
+ ksize_x = (int32_t)self->nn_param.pool.ksize[0];
+ ksize_y = (int32_t)self->nn_param.pool.ksize[1];
+ stride_x = (int32_t)self->nn_param.pool.stride[0];
+ stride_y = (int32_t)self->nn_param.pool.stride[1];
+ pad_x = (int32_t)self->nn_param.pool.pad[0];
+ pad_y = (int32_t)self->nn_param.pool.pad[2];
+
+ param = vsi_nn_kernel_param_create();
ret = vsi_nn_poolwithargmax_optimize_shape(self,
(vsi_ssize_t*)inputs[0]->attr.size, (vsi_ssize_t*)outputs[0]->attr.size,
@@ -164,7 +170,7 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_int32( param, "pad_x", pad_x );
vsi_nn_kernel_param_add_int32( param, "pad_y", pad_y );
- if( ret )
+ if ( ret )
{
reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph,
inputs[0], shapes[0], new_rank );
@@ -180,7 +186,7 @@ static vsi_status op_compute
vsi_nn_ReleaseTensor( &reshape_tensors[2] );
}
- if( self->n )
+ if ( self->n )
{
status = VSI_SUCCESS;
}
@@ -270,10 +276,12 @@ static vsi_bool op_setup
self->nn_param.pool.pad[i] = (uint32_t)pad[i];
}
- if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, inputs, outputs );
-
+ }
+ if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num )
+ {
outputs[1]->attr.dim_num = outputs[0]->attr.dim_num;
memcpy( outputs[1]->attr.size, outputs[0]->attr.size,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) );
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c
index 18942faf4..9b060f141 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c
@@ -34,7 +34,7 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
+#include "vsi_nn_error.h"
#include "utils/vsi_nn_dtype_util.h"
#include "vsi_nn_internal_node.h"
@@ -48,6 +48,8 @@ static vsi_bool _is_same_type
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+
if(vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
return FALSE;
@@ -63,6 +65,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -73,6 +77,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -85,6 +92,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -95,7 +104,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_bool ret;
+ vsi_bool ret = FALSE;
uint32_t i;
uint32_t axis;
vsi_nn_tensor_attr_t attr;
@@ -112,7 +121,6 @@ static vsi_bool op_setup
return FALSE;
}
- ret = TRUE;
/* output */
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
@@ -152,6 +160,7 @@ static vsi_bool op_setup
self->nn_param.post_process.local.enable_perm == FALSE)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.reshape2.size = outputs[0]->attr.size;
curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num;
curr->inputs[0] = inputs[POST_PROCESS_INPUT];
@@ -163,6 +172,7 @@ static vsi_bool op_setup
self->nn_param.post_process.local.enable_perm == FALSE)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[POST_PROCESS_INPUT];
curr->outputs[0] = outputs[POST_PROCESS_OUTPUT];
@@ -172,6 +182,7 @@ static vsi_bool op_setup
self->nn_param.post_process.local.enable_perm == TRUE)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.permute.perm = self->nn_param.post_process.perm;
curr->node->nn_param.permute.dim_num = self->nn_param.post_process.dim_num;
curr->inputs[0] = inputs[POST_PROCESS_INPUT];
@@ -187,8 +198,10 @@ static vsi_bool op_setup
attr.vtl = use_virtual_tensor;
attr.is_const = FALSE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.permute.perm = self->nn_param.post_process.perm;
curr->node->nn_param.permute.dim_num = self->nn_param.post_process.dim_num;
curr->inputs[0] = inputs[POST_PROCESS_INPUT];
@@ -197,12 +210,15 @@ static vsi_bool op_setup
vsi_nn_internal_setup_node( self, curr );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = output_tensor->t;
curr->outputs[0] = outputs[POST_PROCESS_OUTPUT];
vsi_nn_internal_setup_node(self, curr);
}
+ ret = TRUE;
+final:
return ret;
} /* op_setup() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c
index aa5b46c1b..f977e32d0 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c
@@ -36,6 +36,7 @@
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_util.h"
+#include "vsi_nn_error.h"
static vsi_status op_compute
(
@@ -44,8 +45,27 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
- vsi_status status = vsi_nn_internal_compute_node( self );
- self->n = vsi_nn_internal_get_node_by_uid(self, 1)->node->n;
+ vsi_status status = VSI_SUCCESS;
+    vsi_nn_internal_node_t* internal_node = NULL;
+
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
+ status = vsi_nn_internal_compute_node( self );
+ CHECK_STATUS_FAIL_GOTO(status, final );
+
+    internal_node = vsi_nn_internal_get_node_by_uid(self, 1);
+
+    if (internal_node)
+    {
+        self->n = internal_node->node->n;
+ }
+ else
+ {
+ status = VSI_FAILURE;
+ }
+
+final:
return status;
} /* op_compute() */
@@ -56,6 +76,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -68,6 +91,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -81,9 +106,10 @@ static vsi_bool op_setup
/* TODO: Add code to comput outputs' shape. */
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_pre_process_param * p = NULL;
- vsi_bool ret = TRUE;
+ vsi_bool ret = FALSE;
vsi_nn_internal_tensor_t* preprocess_tensor = NULL;
vsi_nn_preprocess_dest_layout_e layout = VSI_NN_DEST_LAYOUT_NCHW;
+ vsi_bool enable_rgb88_planar_nhwc = FALSE;
p = (vsi_nn_pre_process_param *)&(self->nn_param.pre_process);
@@ -122,11 +148,18 @@ static vsi_bool op_setup
if (i != self->nn_param.pre_process_rgb.dim_num)
{
layout = VSI_NN_DEST_LAYOUT_NHWC;
+
+ if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ||
+ p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP)
+ {
+ enable_rgb88_planar_nhwc = self->graph->ctx->options.enable_rgb88_planar_nhwc;
+ }
}
- if (layout == VSI_NN_DEST_LAYOUT_NHWC)
+ if (layout == VSI_NN_DEST_LAYOUT_NHWC && !enable_rgb88_planar_nhwc)
{
memcpy( &attr, &outputs[PRE_PROCESS_OUTPUT]->attr, sizeof( attr ) );
+
attr.size[0] = p->output_attr.size[1];
attr.size[1] = p->output_attr.size[2];
attr.size[2] = p->output_attr.size[0];
@@ -136,7 +169,8 @@ static vsi_bool op_setup
attr.vtl = use_virtual_tensor;
attr.is_const = FALSE;
- preprocess_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ preprocess_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(preprocess_tensor, "Create internal tensor failed", final);
}
}
@@ -145,6 +179,7 @@ static vsi_bool op_setup
case VSI_NN_SOURCE_FORMAT_TENSOR:
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_TENSOR, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.pre_process_tensor.perm = p->perm;
curr->node->nn_param.pre_process_tensor.dim_num = p->dim_num;
@@ -152,12 +187,13 @@ static vsi_bool op_setup
curr->inputs[0] = inputs[PRE_PROCESS_INPUT0];
curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
break;
case VSI_NN_SOURCE_FORMAT_IMAGE_GRAY:
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.pre_process_gray.mean = p->norm.mean[0];
curr->node->nn_param.pre_process_gray.scale = p->norm.scale;
@@ -178,27 +214,33 @@ static vsi_bool op_setup
curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT];
}
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
break;
case VSI_NN_SOURCE_FORMAT_IMAGE_RGB:
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
if (p->reverse_channel)
{
curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[2];
curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[0];
+ curr->node->nn_param.pre_process_rgb.r_scale = p->norm2.scale[2];
+ curr->node->nn_param.pre_process_rgb.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_rgb.b_scale = p->norm2.scale[0];
}
else
{
curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[0];
curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[2];
+ curr->node->nn_param.pre_process_rgb.r_scale = p->norm2.scale[0];
+ curr->node->nn_param.pre_process_rgb.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_rgb.b_scale = p->norm2.scale[2];
}
- curr->node->nn_param.pre_process_rgb.rgb_scale = p->norm.scale;
curr->node->nn_param.pre_process_rgb.reverse_channel = p->reverse_channel;
curr->node->nn_param.pre_process_rgb.rect.left = p->rect.left;
curr->node->nn_param.pre_process_rgb.rect.top = p->rect.top;
@@ -219,27 +261,51 @@ static vsi_bool op_setup
curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT];
}
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
break;
case VSI_NN_SOURCE_FORMAT_IMAGE_YUV420:
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV420, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
if (p->reverse_channel)
{
curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[2];
curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[0];
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_yuv420.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv420.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv420.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_yuv420.r_scale = p->norm2.scale[2];
+ curr->node->nn_param.pre_process_yuv420.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_yuv420.b_scale = p->norm2.scale[0];
+ }
}
else
{
curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[0];
curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[2];
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_yuv420.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv420.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv420.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_yuv420.r_scale = p->norm2.scale[0];
+ curr->node->nn_param.pre_process_yuv420.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_yuv420.b_scale = p->norm2.scale[2];
+ }
}
- curr->node->nn_param.pre_process_yuv420.rgb_scale = p->norm.scale;
curr->node->nn_param.pre_process_yuv420.reverse_channel = p->reverse_channel;
curr->node->nn_param.pre_process_yuv420.rect.left = p->rect.left;
curr->node->nn_param.pre_process_yuv420.rect.top = p->rect.top;
@@ -262,27 +328,51 @@ static vsi_bool op_setup
curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT];
}
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
break;
case VSI_NN_SOURCE_FORMAT_IMAGE_BGRA:
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_BGRA, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
if (p->reverse_channel)
{
curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[2];
curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[0];
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_bgra.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_bgra.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_bgra.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_bgra.r_scale = p->norm2.scale[2];
+ curr->node->nn_param.pre_process_bgra.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_bgra.b_scale = p->norm2.scale[0];
+ }
}
else
{
curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[0];
curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[2];
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_bgra.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_bgra.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_bgra.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_bgra.r_scale = p->norm2.scale[0];
+ curr->node->nn_param.pre_process_bgra.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_bgra.b_scale = p->norm2.scale[2];
+ }
}
- curr->node->nn_param.pre_process_bgra.rgb_scale = p->norm.scale;
curr->node->nn_param.pre_process_bgra.reverse_channel = p->reverse_channel;
curr->node->nn_param.pre_process_bgra.rect.left = p->rect.left;
curr->node->nn_param.pre_process_bgra.rect.top = p->rect.top;
@@ -303,59 +393,30 @@ static vsi_bool op_setup
curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT];
}
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
break;
case VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR:
case VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP:
{
- uint32_t i = 0;
- uint32_t axis = 2;
vsi_bool is_input_sep = p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ? FALSE : TRUE;
- vsi_nn_internal_tensor_t * output_tensor_group[3] = {NULL};
- vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL };
- vsi_nn_tensor_attr_t attr;
float mean[3] = {0};
- vsi_size_t size_32bit[VSI_NN_MAX_DIM_NUM] = {0};
-
- memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
- memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t));
- for(i = 0; i < p->output_attr.dim_num; i++)
- {
- attr.size[i] = -1 == p->output_attr.size[i] ? -1 : (vsi_size_t)p->output_attr.size[i];
- }
- attr.size[axis] = 1;
- attr.vtl = TRUE;
- attr.is_const = FALSE;
- output_tensor_group[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
- output_tensor_group[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
- output_tensor_group[2] = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
- for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
- {
- size_32bit[i] = attr.size[i];
- }
if (p->reverse_channel)
{
- int32_t order[3] = {2, 1, 0};
-
mean[0] = p->norm.mean[2];
mean[1] = p->norm.mean[1];
mean[2] = p->norm.mean[0];
-
- vsi_nn_reorder_tensor( (vsi_nn_tensor_t **)output_tensor_group, order,
- 3, (vsi_nn_tensor_t **)tmp_outputs );
}
else
{
mean[0] = p->norm.mean[0];
mean[1] = p->norm.mean[1];
mean[2] = p->norm.mean[2];
-
- memmove( tmp_outputs, output_tensor_group, sizeof(vsi_nn_tensor_t*) * 3 );
}
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB888_PLANAR, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
if (is_input_sep)
{
curr->inputs[0] = inputs[0];
@@ -368,28 +429,7 @@ static vsi_bool op_setup
curr->inputs[1] = NULL;
curr->inputs[2] = NULL;
}
- curr->outputs[0] = output_tensor_group[0]->t;
- curr->outputs[1] = output_tensor_group[1]->t;
- curr->outputs[2] = output_tensor_group[2]->t;
- curr->node->nn_param.pre_process_rgb888_planar.r_mean = mean[0];
- curr->node->nn_param.pre_process_rgb888_planar.g_mean = mean[1];
- curr->node->nn_param.pre_process_rgb888_planar.b_mean = mean[2];
- curr->node->nn_param.pre_process_rgb888_planar.scale = p->norm.scale;
- curr->node->nn_param.pre_process_rgb888_planar.rect.left = p->rect.left;
- curr->node->nn_param.pre_process_rgb888_planar.rect.top = p->rect.top;
- curr->node->nn_param.pre_process_rgb888_planar.rect.width = p->rect.width;
- curr->node->nn_param.pre_process_rgb888_planar.rect.height = p->rect.height;
- curr->node->nn_param.pre_process_rgb888_planar.output_attr.size = size_32bit;
- curr->node->nn_param.pre_process_rgb888_planar.output_attr.dim_num = p->output_attr.dim_num;
- vsi_nn_internal_setup_node(self, curr);
-
- curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 3, 1 );
-
- curr->node->nn_param.concat.axis = axis;
- curr->inputs[0] = tmp_outputs[0]->t;
- curr->inputs[1] = tmp_outputs[1]->t;
- curr->inputs[2] = tmp_outputs[2]->t;
- if (layout == VSI_NN_DEST_LAYOUT_NHWC)
+ if (layout == VSI_NN_DEST_LAYOUT_NHWC && !enable_rgb88_planar_nhwc)
{
curr->outputs[0] = preprocess_tensor->t;
}
@@ -398,27 +438,93 @@ static vsi_bool op_setup
curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT];
}
- vsi_nn_internal_setup_node(self, curr);
+ if (p->reverse_channel)
+ {
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm2.scale[2];
+ curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm2.scale[0];
+ }
+ }
+ else
+ {
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm2.scale[0];
+ curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm2.scale[2];
+ }
+ }
+
+ curr->node->nn_param.pre_process_rgb888_planar.r_mean = mean[0];
+ curr->node->nn_param.pre_process_rgb888_planar.g_mean = mean[1];
+ curr->node->nn_param.pre_process_rgb888_planar.b_mean = mean[2];
+ curr->node->nn_param.pre_process_rgb888_planar.rect.left = p->rect.left;
+ curr->node->nn_param.pre_process_rgb888_planar.rect.top = p->rect.top;
+ curr->node->nn_param.pre_process_rgb888_planar.rect.width = p->rect.width;
+ curr->node->nn_param.pre_process_rgb888_planar.rect.height = p->rect.height;
+ curr->node->nn_param.pre_process_rgb888_planar.output_attr.size = p->output_attr.size;
+ curr->node->nn_param.pre_process_rgb888_planar.output_attr.dim_num = p->output_attr.dim_num;
+ curr->node->nn_param.pre_process_rgb888_planar.reverse_channel = p->reverse_channel;
+ curr->node->nn_param.pre_process_rgb888_planar.enable_rgb88_planar_nhwc = enable_rgb88_planar_nhwc;
+ ret = vsi_nn_internal_setup_node(self, curr);
}
break;
case VSI_NN_SOURCE_FORMAT_IMAGE_YUV444:
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV444, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
if (p->reverse_channel)
{
curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[2];
curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[0];
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_yuv444.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv444.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv444.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_yuv444.r_scale = p->norm2.scale[2];
+ curr->node->nn_param.pre_process_yuv444.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_yuv444.b_scale = p->norm2.scale[0];
+ }
}
else
{
curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[0];
curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[2];
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_yuv444.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv444.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv444.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_yuv444.r_scale = p->norm2.scale[0];
+ curr->node->nn_param.pre_process_yuv444.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_yuv444.b_scale = p->norm2.scale[2];
+ }
}
- curr->node->nn_param.pre_process_yuv444.rgb_scale = p->norm.scale;
curr->node->nn_param.pre_process_yuv444.reverse_channel = p->reverse_channel;
curr->node->nn_param.pre_process_yuv444.rect.left = p->rect.left;
curr->node->nn_param.pre_process_yuv444.rect.top = p->rect.top;
@@ -441,25 +547,50 @@ static vsi_bool op_setup
curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT];
}
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
break;
case VSI_NN_SOURCE_FORMAT_IMAGE_NV21:
case VSI_NN_SOURCE_FORMAT_IMAGE_NV12:
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_NV12, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
if (p->reverse_channel)
{
curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[2];
curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[0];
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_nv12.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_nv12.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_nv12.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_nv12.r_scale = p->norm2.scale[2];
+ curr->node->nn_param.pre_process_nv12.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_nv12.b_scale = p->norm2.scale[0];
+ }
}
else
{
curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[0];
curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[2];
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_nv12.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_nv12.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_nv12.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_nv12.r_scale = p->norm2.scale[0];
+ curr->node->nn_param.pre_process_nv12.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_nv12.b_scale = p->norm2.scale[2];
+ }
}
if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12)
@@ -471,7 +602,6 @@ static vsi_bool op_setup
curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV21;
}
- curr->node->nn_param.pre_process_nv12.rgb_scale = p->norm.scale;
curr->node->nn_param.pre_process_nv12.reverse_channel = p->reverse_channel;
curr->node->nn_param.pre_process_nv12.rect.left = p->rect.left;
curr->node->nn_param.pre_process_nv12.rect.top = p->rect.top;
@@ -493,25 +623,50 @@ static vsi_bool op_setup
curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT];
}
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
break;
case VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422:
case VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422:
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV422, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
if (p->reverse_channel)
{
curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[2];
curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[0];
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_yuv422.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv422.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv422.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_yuv422.r_scale = p->norm2.scale[2];
+ curr->node->nn_param.pre_process_yuv422.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_yuv422.b_scale = p->norm2.scale[0];
+ }
}
else
{
curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[0];
curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1];
curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[2];
+ if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
+ {
+ curr->node->nn_param.pre_process_yuv422.r_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv422.g_scale = p->norm.scale;
+ curr->node->nn_param.pre_process_yuv422.b_scale = p->norm.scale;
+ }
+ else
+ {
+ curr->node->nn_param.pre_process_yuv422.r_scale = p->norm2.scale[0];
+ curr->node->nn_param.pre_process_yuv422.g_scale = p->norm2.scale[1];
+ curr->node->nn_param.pre_process_yuv422.b_scale = p->norm2.scale[2];
+ }
}
if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422)
@@ -523,7 +678,6 @@ static vsi_bool op_setup
curr->node->nn_param.pre_process_yuv422.yuv422_type = 1;
}
- curr->node->nn_param.pre_process_yuv422.rgb_scale = p->norm.scale;
curr->node->nn_param.pre_process_yuv422.reverse_channel = p->reverse_channel;
curr->node->nn_param.pre_process_yuv422.rect.left = p->rect.left;
curr->node->nn_param.pre_process_yuv422.rect.top = p->rect.top;
@@ -544,13 +698,13 @@ static vsi_bool op_setup
curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT];
}
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
break;
default:
{
VSILOGE( "Not support this type!(PRE_PROCESS)\n");
- ret = FALSE;
+ goto final;
}
break;
}
@@ -564,22 +718,24 @@ static vsi_bool op_setup
p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB ||
p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA ||
p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY ||
- p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ||
- p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP
+ (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR && !enable_rgb88_planar_nhwc) ||
+ (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP && !enable_rgb88_planar_nhwc)
)
{
if (layout == VSI_NN_DEST_LAYOUT_NHWC)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.permute.perm = p->perm;
curr->node->nn_param.permute.dim_num = p->dim_num;
curr->inputs[0] = preprocess_tensor->t;
curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT];
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
}
}
+final:
return ret;
} /* op_setup() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c
index c1be23962..2c5e5b77d 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c
@@ -60,7 +60,9 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_bgra.r_mean );
vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_bgra.g_mean );
vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_bgra.b_mean );
- vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_bgra.rgb_scale );
+ vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_bgra.r_scale );
+ vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_bgra.g_scale );
+ vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_bgra.b_scale );
vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_bgra.reverse_channel );
vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_bgra.local.enable_perm );
vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_bgra.local.enable_copy );
@@ -111,6 +113,9 @@ static vsi_bool op_setup
/* TODO: Add code to comput outputs' shape. */
vsi_nn_pre_process_bgra_param * p = NULL;
uint32_t i = 0;
+
+ VSI_UNREFERENCED(inputs);
+
p = (vsi_nn_pre_process_bgra_param *)&(self->nn_param.pre_process_bgra);
if (p->rect.width == 0 || p->rect.height == 0)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c
index d264ee7fa..6bc1f796b 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c
@@ -115,6 +115,9 @@ static vsi_bool op_setup
{
vsi_nn_pre_process_gray_param * p = NULL;
uint32_t i = 0;
+
+ VSI_UNREFERENCED(inputs);
+
p = (vsi_nn_pre_process_gray_param *)&(self->nn_param.pre_process_gray);
if (p->rect.width == 0 || p->rect.height == 0)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c
index 09eb682ff..7fa635a5b 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c
@@ -56,7 +56,9 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_nv12.r_mean );
vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_nv12.g_mean );
vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_nv12.b_mean );
- vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_nv12.rgb_scale );
+ vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_nv12.r_scale );
+ vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_nv12.g_scale );
+ vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_nv12.b_scale );
vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_nv12.reverse_channel );
vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_nv12.local->enable_perm );
vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_nv12.local->enable_copy );
@@ -113,6 +115,9 @@ static vsi_bool op_setup
/* TODO: Add code to comput outputs' shape. */
vsi_nn_pre_process_nv12_param * p = NULL;
uint32_t i = 0;
+
+ VSI_UNREFERENCED(inputs);
+
p = (vsi_nn_pre_process_nv12_param *)&(self->nn_param.pre_process_nv12);
if (p->rect.width == 0 || p->rect.height == 0)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c
index 6d19e4a47..80acd7974 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c
@@ -59,7 +59,9 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean );
vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean );
vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean );
- vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_rgb.rgb_scale );
+ vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_rgb.r_scale );
+ vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_rgb.g_scale );
+ vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_rgb.b_scale );
vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel );
vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm );
vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy );
@@ -116,6 +118,9 @@ static vsi_bool op_setup
/* TODO: Add code to comput outputs' shape. */
vsi_nn_pre_process_rgb_param * p = NULL;
uint32_t i = 0;
+
+ VSI_UNREFERENCED(inputs);
+
p = (vsi_nn_pre_process_rgb_param *)&(self->nn_param.pre_process_rgb);
if (p->rect.width == 0 || p->rect.height == 0)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c
index 13a636d78..3c27ecc19 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c
@@ -47,7 +47,7 @@ typedef struct _pre_process_rgb888_planar_local_data_t {
Declare number of input and output.
*/
#define _INPUT_NUM (3)
-#define _OUTPUT_NUM (3)
+#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
@@ -59,21 +59,35 @@ static vsi_status op_compute
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_kernel_node_t n = NULL;
+ vsi_nn_pre_process_rgb888_planar_param * p = NULL;
+
+ p = (vsi_nn_pre_process_rgb888_planar_param *)&(self->nn_param.pre_process_rgb888_planar);
param = vsi_nn_kernel_param_create();
- vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb888_planar.local->scale_x );
- vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb888_planar.local->scale_y );
- vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb888_planar.rect.left );
- vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb888_planar.rect.top );
- vsi_nn_kernel_param_add_int32( param, "width", self->nn_param.pre_process_rgb888_planar.rect.width );
- vsi_nn_kernel_param_add_int32( param, "height", self->nn_param.pre_process_rgb888_planar.rect.height );
- vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb888_planar.r_mean );
- vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb888_planar.g_mean );
- vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb888_planar.b_mean );
- vsi_nn_kernel_param_add_float32( param, "scale", self->nn_param.pre_process_rgb888_planar.scale );
- vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb888_planar.local->enable_copy );
-
- n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar", inputs, 3, outputs, 3, param );
+ vsi_nn_kernel_param_add_int32( param, "scale_x", p->local->scale_x );
+ vsi_nn_kernel_param_add_int32( param, "scale_y", p->local->scale_y );
+ vsi_nn_kernel_param_add_int32( param, "left", p->rect.left );
+ vsi_nn_kernel_param_add_int32( param, "top", p->rect.top );
+ vsi_nn_kernel_param_add_int32( param, "width", p->rect.width );
+ vsi_nn_kernel_param_add_int32( param, "height", p->rect.height );
+ vsi_nn_kernel_param_add_float32( param, "r_mean", p->r_mean );
+ vsi_nn_kernel_param_add_float32( param, "g_mean", p->g_mean );
+ vsi_nn_kernel_param_add_float32( param, "b_mean", p->b_mean );
+ vsi_nn_kernel_param_add_float32( param, "r_scale", p->r_scale );
+ vsi_nn_kernel_param_add_float32( param, "g_scale", p->g_scale );
+ vsi_nn_kernel_param_add_float32( param, "b_scale", p->b_scale );
+ vsi_nn_kernel_param_add_int32( param, "enable_copy", p->local->enable_copy );
+ vsi_nn_kernel_param_add_int32( param, "reverse", p->reverse_channel );
+
+ if (p->enable_rgb88_planar_nhwc)
+ {
+ n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar_nhwc", inputs, 3, outputs, 1, param );
+ }
+ else
+ {
+ n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar", inputs, 3, outputs, 1, param );
+ }
+
if ( n != NULL )
{
self->n = (vx_node)n;
@@ -97,11 +111,11 @@ static vsi_bool op_check
{
if (inputs[1] == NULL)
{
- BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 1, 3)
- IO_TYPE(D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
- IO_TYPE(D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP)
- IO_TYPE(D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP)
- IO_TYPE(D_U8, D_F16, D_F16, D_F16)
+ BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 1, 1)
+ IO_TYPE(D_U8, D_U8|Q_ASYM)
+ IO_TYPE(D_U8, D_I8|Q_DFP)
+ IO_TYPE(D_U8, D_I16|Q_DFP)
+ IO_TYPE(D_U8, D_F16)
END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR)
if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, 1,
@@ -115,11 +129,11 @@ static vsi_bool op_check
}
else
{
- BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 3, 3)
- IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
- IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP)
- IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP)
- IO_TYPE(D_U8, D_U8, D_U8, D_F16, D_F16, D_F16)
+ BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 3, 1)
+ IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM)
+ IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP)
+ IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP)
+ IO_TYPE(D_U8, D_U8, D_U8, D_F16)
END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR)
if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, self->input.num,
@@ -144,6 +158,9 @@ static vsi_bool op_setup
{
vsi_nn_pre_process_rgb888_planar_param * p = NULL;
uint32_t i = 0, j = 0;
+
+ VSI_UNREFERENCED(inputs);
+
p = (vsi_nn_pre_process_rgb888_planar_param *)&(self->nn_param.pre_process_rgb888_planar);
if (p->rect.width == 0 || p->rect.height == 0)
@@ -163,29 +180,34 @@ static vsi_bool op_setup
}
}
- for (j = 0; j < 3; j++)
+
+ if ( VSI_NN_DIM_AUTO == outputs[j]->attr.dim_num )
{
- if ( VSI_NN_DIM_AUTO == outputs[j]->attr.dim_num )
+ if (p->output_attr.dim_num > 0)
{
- if (p->output_attr.dim_num > 0)
- {
- outputs[j]->attr.dim_num = p->output_attr.dim_num;
- for (i = 0; i < p->output_attr.dim_num; i++)
- {
- outputs[j]->attr.dim_num = p->output_attr.dim_num;
- outputs[j]->attr.size[i] = p->output_attr.size[i];
- }
- }
- else
+ outputs[j]->attr.dim_num = p->output_attr.dim_num;
+ for (i = 0; i < p->output_attr.dim_num; i++)
{
- VSILOGE("output dim num cannot be zero!(PRE_PROCESS_RGB888_PLANAR)\n");
- return FALSE;
+ outputs[j]->attr.size[i] = p->output_attr.size[i];
}
}
+ else
+ {
+ VSILOGE("output dim num cannot be zero!(PRE_PROCESS_RGB888_PLANAR)\n");
+ return FALSE;
+ }
}
- p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]);
- p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]);
+ if (p->enable_rgb88_planar_nhwc)
+ {
+ p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[1]);
+ p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[2]);
+ }
+ else
+ {
+ p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]);
+ p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]);
+ }
p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15)));
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c
index b4220a716..9886be018 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c
@@ -34,7 +34,7 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
+#include "vsi_nn_error.h"
#include "utils/vsi_nn_dtype_util.h"
#include "vsi_nn_internal_node.h"
@@ -48,6 +48,8 @@ static vsi_bool _is_same_type
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+
if(vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
return FALSE;
@@ -63,6 +65,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -73,6 +77,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -85,6 +92,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -95,7 +104,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_bool ret;
+ vsi_bool ret = FALSE;
uint32_t i;
uint32_t axis;
vsi_nn_tensor_attr_t attr;
@@ -112,7 +121,6 @@ static vsi_bool op_setup
return FALSE;
}
- ret = TRUE;
/* output */
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
@@ -125,7 +133,7 @@ static vsi_bool op_setup
VSILOGE( "Error permute axis '%u', the dim is '%u' ",
axis, inputs[0]->attr.dim_num );
ret = FALSE;
- break;
+ goto final;
}
outputs[0]->attr.size[i] = inputs[0]->attr.size[axis];
}
@@ -152,32 +160,35 @@ static vsi_bool op_setup
self->nn_param.pre_process_tensor.local.enable_perm == FALSE)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.reshape2.size = outputs[0]->attr.size;
curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num;
curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT];
curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
else if (self->nn_param.pre_process_tensor.local.enable_data_conv == TRUE &&
self->nn_param.pre_process_tensor.local.enable_perm == FALSE)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT];
curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
else if (self->nn_param.pre_process_tensor.local.enable_data_conv == FALSE &&
self->nn_param.pre_process_tensor.local.enable_perm == TRUE)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.permute.perm = self->nn_param.pre_process_tensor.perm;
curr->node->nn_param.permute.dim_num = self->nn_param.pre_process_tensor.dim_num;
curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT];
curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
else
{
@@ -187,22 +198,26 @@ static vsi_bool op_setup
attr.vtl = use_virtual_tensor;
attr.is_const = FALSE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT];
curr->outputs[0] = output_tensor->t;
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.permute.perm = self->nn_param.pre_process_tensor.perm;
curr->node->nn_param.permute.dim_num = self->nn_param.pre_process_tensor.dim_num;
curr->inputs[0] = output_tensor->t;
curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT];
- vsi_nn_internal_setup_node(self, curr);
+ ret &= vsi_nn_internal_setup_node(self, curr);
}
+final:
return ret;
} /* op_setup() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c
index bcac93c3c..37696ff6c 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c
@@ -56,7 +56,9 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv420.r_mean );
vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv420.g_mean );
vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv420.b_mean );
- vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv420.rgb_scale );
+ vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_yuv420.r_scale );
+ vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_yuv420.g_scale );
+ vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_yuv420.b_scale );
vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv420.reverse_channel );
vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv420.local.enable_perm );
vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv420.local.enable_copy );
@@ -113,6 +115,9 @@ static vsi_bool op_setup
/* TODO: Add code to comput outputs' shape. */
vsi_nn_pre_process_yuv420_param * p = NULL;
uint32_t i = 0;
+
+ VSI_UNREFERENCED(inputs);
+
p = (vsi_nn_pre_process_yuv420_param *)&(self->nn_param.pre_process_yuv420);
if (p->rect.width == 0 || p->rect.height == 0)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c
index b9c4daf33..3922de4c2 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c
@@ -65,7 +65,9 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv422.r_mean );
vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv422.g_mean );
vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv422.b_mean );
- vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv422.rgb_scale );
+ vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_yuv422.r_scale );
+ vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_yuv422.g_scale );
+ vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_yuv422.b_scale );
vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv422.reverse_channel );
vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv422.local->enable_perm );
vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv422.local->enable_copy );
@@ -123,6 +125,9 @@ static vsi_bool op_setup
/* TODO: Add code to comput outputs' shape. */
vsi_nn_pre_process_yuv422_param * p = NULL;
uint32_t i = 0;
+
+ VSI_UNREFERENCED(inputs);
+
p = (vsi_nn_pre_process_yuv422_param *)&(self->nn_param.pre_process_yuv422);
if (p->rect.width == 0 || p->rect.height == 0)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c
index 6a350d16e..baa5cc440 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c
@@ -56,7 +56,9 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv444.r_mean );
vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv444.g_mean );
vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv444.b_mean );
- vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv444.rgb_scale );
+ vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_yuv444.r_scale );
+ vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_yuv444.g_scale );
+ vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_yuv444.b_scale );
vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv444.reverse_channel );
vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv444.local->enable_perm );
vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv444.local->enable_copy );
@@ -113,6 +115,9 @@ static vsi_bool op_setup
/* TODO: Add code to comput outputs' shape. */
vsi_nn_pre_process_yuv444_param * p = NULL;
uint32_t i = 0;
+
+ VSI_UNREFERENCED(inputs);
+
p = (vsi_nn_pre_process_yuv444_param *)&(self->nn_param.pre_process_yuv444);
if (p->rect.width == 0 || p->rect.height == 0)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c
index b66a5cf01..2bdc1362f 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c
@@ -213,6 +213,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
@@ -277,6 +280,8 @@ static vsi_status op_optimize
uint32_t dim;
vx_tensor rois_tmp, score_tmp;
+ VSI_UNREFERENCED(inputs);
+
rois_tmp = NULL, score_tmp = NULL;
if( direction == VSI_NN_OPTIMIZE_BACKWARD )
{
@@ -326,16 +331,20 @@ static vsi_status op_deinit
vsi_nn_node_t * self
)
{
- vx_tensor rois = self->nn_param.proposal.local.rois;
- vx_tensor score = self->nn_param.proposal.local.score;
- if( NULL != self && NULL != self->n )
+ vx_tensor rois = NULL;
+ vx_tensor score = NULL;
+
+ if ( NULL != self && NULL != self->n )
{
- if(rois)
+ rois = self->nn_param.proposal.local.rois;
+ score = self->nn_param.proposal.local.score;
+
+ if (rois)
{
vxReleaseTensor(&rois);
rois = NULL;
}
- if(score)
+ if (score)
{
vxReleaseTensor(&score);
score = NULL;
@@ -343,6 +352,11 @@ static vsi_status op_deinit
vxReleaseNode( &self->n );
self->n = NULL;
}
+ else
+ {
+ return VSI_FAILURE;
+ }
+
return VSI_SUCCESS;
}
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c
index 4ea879fbf..c203fdd6a 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c
@@ -55,6 +55,9 @@ static vsi_status op_compute
VX_CONVERT_POLICY_SATURATE, outputs[0]->t );
*/
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
if( NULL != self->n )
{
status = VSI_SUCCESS;
@@ -69,6 +72,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -80,6 +86,10 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
/* TODO: Add code to comput outputs' shape. */
return TRUE;
} /* op_setup() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c
index a7a549448..dcbb75b04 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c
@@ -36,6 +36,7 @@
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
+#include "vsi_nn_error.h"
#define _ARG_NUM (6)
#define _INPUT_NUM (1)
@@ -209,6 +210,8 @@ static vsi_status op_compute
{
vsi_status status = VSI_FAILURE;
+ VSI_UNREFERENCED(outputs);
+
if ( self->nn_param.reduce.local2->use_internal_node )
{
status = vsi_nn_internal_compute_node( self );
@@ -219,7 +222,7 @@ static vsi_status op_compute
vsi_nn_tensor_t *axis_tensor = NULL;
vsi_nn_tensor_t *axis_tensor2 = NULL;
vsi_nn_tensor_attr_t attr, attr2;
- vx_int32 resolved_dim[4] = {-1, -1, -1, -1};
+ vx_int32 resolved_dim[VSI_NN_MAX_DIM_NUM] = {-1};
vx_int32 resolved_dim_count = 0;
uint32_t i = 0;
vsi_size_t re_sizes[VSI_NN_MAX_DIM_NUM] = {1};
@@ -230,6 +233,9 @@ static vsi_status op_compute
vsi_nn_tensor_t *reshaped_output1 = self->nn_param.reduce.local2->reshaped_output1;
char tensor_name[128];
+ CHECK_PTR_FAIL_GOTO( reshaped_input1, "check tensor pointer.", final );
+ CHECK_PTR_FAIL_GOTO( reshaped_output1, "check tensor pointer.", final );
+
memset(tensor_name, 0, sizeof(tensor_name));
snprintf(tensor_name,
sizeof(tensor_name),
@@ -240,11 +246,20 @@ static vsi_status op_compute
{
VSILOGW("Set uid %u reduce reshaped output name fail",
self->uid);
- return VSI_FAILURE;
+
+ status = VSI_FAILURE;
+ goto final;
}
resolved_dim_count = self->nn_param.reduce.local2->axes_num;
+ if (resolved_dim_count > VSI_NN_MAX_DIM_NUM)
+ {
+ VSILOGE("resolved_dim_count greater than VSI_NN_MAX_DIM_NUM");
+
+ status = VSI_FAILURE;
+ goto final;
+ }
for (i = 0; i < (uint32_t)resolved_dim_count; i++)
{
@@ -313,7 +328,7 @@ static vsi_status op_compute
input_t,
output_t);
}
- else if (3 == resolved_dim[resolved_dim_count - 1] && resolved_dim_count < 3)
+ else if (resolved_dim_count > 0 && 3 == resolved_dim[resolved_dim_count - 1] && resolved_dim_count < 3)
{
if (1 == resolved_dim_count)
{
@@ -349,6 +364,7 @@ static vsi_status op_compute
attr2.size[resolved_dim[0]] = 1;
attr2.vtl = FALSE;
mean_tmp_tensor = vsi_nn_CreateTensor(self->graph, &attr2);
+ CHECK_PTR_FAIL_GOTO( mean_tmp_tensor, "Create tensor fail.", final );
self->nn_param.reduce.local2->reshaped_tmp = mean_tmp_tensor;
re_sizes[resolved_dim[0]] = 1;
memset(&attr, 0, sizeof(attr));
@@ -433,6 +449,8 @@ static vsi_status op_compute
attr2.size[resolved_dim[1]] = 1;
attr2.vtl = FALSE;
mean_tmp_tensor = vsi_nn_CreateTensor(self->graph, &attr2);
+ CHECK_PTR_FAIL_GOTO( mean_tmp_tensor, "Create tensor fail.", final );
+
self->nn_param.reduce.local2->reshaped_tmp = mean_tmp_tensor;
re_sizes[resolved_dim[0]] = 1;
re_sizes[resolved_dim[1]] = 1;
@@ -446,11 +464,8 @@ static vsi_status op_compute
self->graph,
(uint8_t *)&resolved_dim[0],
&attr);
- if( NULL == axis_tensor )
- {
- VSILOGE("Create axis_tensor fail.(reduce)");
- return VSI_FAILURE;
- }
+ CHECK_PTR_FAIL_GOTO( axis_tensor, "Create tensor fail.", final );
+
self->nn_param.reduce.local.axis_tensor = axis_tensor;
status = op_comput_reduce_mean(self,
axis_tensor,
@@ -512,6 +527,7 @@ static vsi_status op_compute
}
}
+final:
return status;
} /* op_compute() */
@@ -523,6 +539,9 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
if ( self->nn_param.reduce.local2->use_internal_node )
{
return vsi_nn_internal_optimize_node(self, direction );
@@ -540,6 +559,10 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -711,7 +734,7 @@ static vsi_bool op_set_reduce_axis(
for (i = 0; i < self->nn_param.reduce.axis_num; i++)
{
vx_int32 current_axis = self->nn_param.reduce.axis[i] < 0 ? \
- inputs[0]->attr.dim_num + self->nn_param.reduce.axis[i] : self->nn_param.reduce.axis[i];
+ (int32_t)inputs[0]->attr.dim_num + self->nn_param.reduce.axis[i] : self->nn_param.reduce.axis[i];
if (current_axis < 0 || current_axis >= (vx_int32)inputs[0]->attr.dim_num)
{
@@ -822,16 +845,20 @@ static vsi_bool op_set_sp_reduce_internal
int32_t axes_num = self->nn_param.reduce.local2->axes_num;
int32_t i = 0, j = 0, index = 0;
vsi_size_t reduce_size = 1;
+ vsi_bool ret = FALSE;
vsi_nn_internal_init_node_wksp( self );
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor);
tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
+ CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final);
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final);
permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode,
inputs[0]->attr.dim_num * sizeof(uint32_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create buffer failed", final);
for ( i = 0; i < axes_num; i++)
{
@@ -862,11 +889,14 @@ static vsi_bool op_set_sp_reduce_internal
vsi_nn_internal_setup_node(self, tmp_inode);
new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes, outputs[0]->attr.dim_num);
+ CHECK_PTR_FAIL_GOTO(new_output, "Create tensor failed", final);
+ self->nn_param.reduce.local2->reshaped_output = new_output;
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 );
-
+ CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final);
new_axis = (int32_t *)vsi_nn_internal_new_node_param(tmp_inode,
axes_num * sizeof(int32_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(new_axis, tmp_inode, "Create buffer failed", final);
for (i = 0; i < axes_num; i++)
{
new_axis[i] = i;
@@ -885,11 +915,10 @@ static vsi_bool op_set_sp_reduce_internal
tmp_inode->node->nn_param.reduce_mean_internal.scale =
1.0f / (float)reduce_size;
}
- vsi_nn_internal_setup_node(self, tmp_inode);
+ ret = vsi_nn_internal_setup_node(self, tmp_inode);
- self->nn_param.reduce.local2->reshaped_output = new_output;
-
- return TRUE;
+final:
+ return ret;
}
static vsi_bool op_set_reduce_internal
@@ -912,6 +941,8 @@ static vsi_bool op_set_reduce_internal
vx_int32 resolved_dim_count = 0;
int32_t * axes = self->nn_param.reduce.local2->axes;
vx_bool is_use_float = vx_false_e;
+ vsi_bool ret = FALSE;
+
resolved_dim_count = self->nn_param.reduce.local2->axes_num;
if ((VSI_NN_OP_REDUCESUM_INTERNAL == type_name) || (VSI_NN_OP_REDUCEPROD_INTERNAL == type_name))
@@ -975,6 +1006,7 @@ static vsi_bool op_set_reduce_internal
}
curr = vsi_nn_internal_new_node( self, type_name, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
op_set_reduce_param_value(&(curr->node->nn_param), type_name,
axes, 1, self->nn_param.reduce.keep_dim);
if (self->nn_param.reduce.local2->reshaped_input)
@@ -1001,9 +1033,11 @@ static vsi_bool op_set_reduce_internal
attr.vtl = use_virtual_tensor;
attr.is_const = FALSE;
tmp_output_tensor[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tmp_output_tensor[0], "Create internal tensor failed", final);
re_sizes[axes[0]] = 1;
curr = vsi_nn_internal_new_node( self, type_name, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
op_set_reduce_param_value(&(curr->node->nn_param), type_name,
&(axes[0]), 1, vx_true_e);
curr->inputs[0] = inputs[POST_PROCESS_INPUT];
@@ -1034,8 +1068,11 @@ static vsi_bool op_set_reduce_internal
re_sizes[axes[1]] = 1;
new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num);
}
+ CHECK_PTR_FAIL_GOTO(new_output, "Reshape tensor failed", final);
+ self->nn_param.reduce.local2->reshaped_output = new_output;
curr = vsi_nn_internal_new_node( self, type_name, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
op_set_reduce_param_value(&(curr->node->nn_param), type_name,
&(axes[1]), 1, vx_true_e);
if (self->nn_param.reduce.local2->reshaped_input)
@@ -1047,7 +1084,6 @@ static vsi_bool op_set_reduce_internal
curr->inputs[0] = tmp_output_tensor[0]->t;
}
curr->outputs[0] = new_output;
- self->nn_param.reduce.local2->reshaped_output = new_output;
vsi_nn_internal_setup_node(self, curr);
}
else if (3 == resolved_dim_count)
@@ -1056,12 +1092,15 @@ static vsi_bool op_set_reduce_internal
attr.vtl = use_virtual_tensor;
attr.is_const = FALSE;
tmp_output_tensor[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tmp_output_tensor[0], "Create internal tensor failed", final);
attr.size[axes[1]] = 1;
tmp_output_tensor[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(tmp_output_tensor[1], "Create internal tensor failed", final);
re_sizes[axes[0]] = 1;
re_sizes[axes[1]] = 1;
curr = vsi_nn_internal_new_node( self, type_name, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
op_set_reduce_param_value(&(curr->node->nn_param), type_name,
&(axes[0]), 1, vx_true_e);
curr->inputs[0] = inputs[POST_PROCESS_INPUT];
@@ -1069,6 +1108,7 @@ static vsi_bool op_set_reduce_internal
vsi_nn_internal_setup_node( self, curr );
curr = vsi_nn_internal_new_node( self, type_name, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
op_set_reduce_param_value(&(curr->node->nn_param), type_name,
&(axes[1]), 1, vx_true_e);
curr->inputs[0] = tmp_output_tensor[0]->t;
@@ -1100,6 +1140,7 @@ static vsi_bool op_set_reduce_internal
}
curr = vsi_nn_internal_new_node( self, type_name, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
op_set_reduce_param_value(&(curr->node->nn_param), type_name,
&(axes[2]), 1, vx_true_e);
if (self->nn_param.reduce.local2->reshaped_input)
@@ -1119,7 +1160,10 @@ static vsi_bool op_set_reduce_internal
VSILOGE("error: resolved_dim_count is %d\n", resolved_dim_count);
return FALSE;
}
- return TRUE;
+
+ ret = TRUE;
+final:
+ return ret;
}
static vsi_bool op_setup
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c
index 4f5022836..74132f149 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c
@@ -57,11 +57,13 @@ static vsi_status op_compute
int32_t axis_num = self->nn_param.reduce_mean_internal.axis_num;
float scale = self->nn_param.reduce_mean_internal.scale;
vsi_enum type = self->nn_param.reduce_mean_internal.type;
+ int32_t *axis = self->nn_param.reduce_mean_internal.axis;
vsi_nn_kernel_param_t * param = NULL;
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "axis_num", axis_num );
vsi_nn_kernel_param_add_float32( param, "scale", scale );
+ vsi_nn_kernel_param_add_str( param, "axis", (const char*)axis );
if (type == VSI_NN_REDUCE_MAX)
{
@@ -95,6 +97,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c
index dd41b6a0e..08e5b9401 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c
@@ -91,6 +91,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -102,6 +105,9 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/* TODO: Add code to comput outputs' shape. */
return TRUE;
} /* op_setup() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c
index 062922637..9efd8fca5 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c
@@ -159,6 +159,8 @@ static vsi_bool op_setup
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool ret = TRUE;
+ VSI_UNREFERENCED(self);
+
in1_rank = inputs[0]->attr.dim_num;
in2_rank = inputs[1]->attr.dim_num;
out_rank = vsi_nn_max( in1_rank, in2_rank );
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c
index 8c40d429a..6ec9d19af 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c
@@ -34,7 +34,7 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
+#include "vsi_nn_error.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_util.h"
@@ -46,6 +46,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -56,6 +58,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -84,7 +89,9 @@ static vsi_bool op_setup
float max_value = 0;
float threshold = 0;
uint32_t max_raw = 0;
- if( NULL == self )
+ vsi_bool ret = FALSE;
+
+ if ( NULL == self )
{
return FALSE;
}
@@ -101,30 +108,35 @@ static vsi_bool op_setup
if (alpha == 0 && max_raw == VSI_NN_FLOAT32_INF && threshold == 0)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
}
else if (alpha == 1.0f && max_value == 1.0f && threshold == -1.0f)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU1, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
}
else if (alpha == 0 && max_value == 6.0f && threshold == 0)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
}
else if (alpha == 0.1 && max_value == VSI_NN_FLOAT32_INF && threshold == 0)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LEAKY_RELU, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
}
else
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU_KERAS_INTERNAL, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
curr->node->nn_param.relu_keras_internal.max_value = max_value;
@@ -132,9 +144,10 @@ static vsi_bool op_setup
curr->node->nn_param.relu_keras_internal.threshold = threshold;
}
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
- return TRUE;
+final:
+ return ret;
}
#ifdef __cplusplus
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c
index 2a77c5c99..96d760e39 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c
@@ -158,8 +158,32 @@ static vsi_status op_compute
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "axis", axis );
- n = vsi_nn_kernel_selector( self->graph, "repeat",
- tmp_inputs, _INPUT_NUM, tmp_output, _OUTPUT_NUM, param );
+
+ if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE)
+ {
+ vsi_nn_tensor_t* temp_tensors = NULL;
+ vsi_nn_tensor_attr_t attr;
+ VSILOGW("repeat is no_range_change operation! \
+ Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!");
+
+ memcpy( &attr, &tmp_output[0]->attr, sizeof(attr));
+ memcpy( &attr.dtype, &tmp_inputs[0]->attr.dtype, sizeof(attr.dtype));
+ attr.is_const = FALSE;
+ attr.vtl = TRUE;
+ temp_tensors = vsi_nn_CreateTensor( self->graph, &attr );
+
+ vsi_nn_kernel_selector( self->graph, "repeat",
+ tmp_inputs, _INPUT_NUM, &temp_tensors, _OUTPUT_NUM, param );
+
+ n = vxTensorCopyNode( self->graph->g, temp_tensors->t, tmp_output[0]->t);
+ vsi_safe_release_tensor(temp_tensors);
+ }
+ else
+ {
+ n = vsi_nn_kernel_selector( self->graph, "repeat",
+ tmp_inputs, _INPUT_NUM, tmp_output, _OUTPUT_NUM, param );
+ }
+
if ( n != NULL )
{
self->n = (vx_node)n;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c
index e1cfdaa69..523eeb46a 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c
@@ -124,7 +124,8 @@ static vsi_bool op_setup
uint32_t i = 0;
for (i = 0; i < self->nn_param.reshape.dim_num; i++)
{
- shape[i] = -1 == self->nn_param.reshape.size[i] ? -1 : (vsi_size_t)self->nn_param.reshape.size[i];
+ shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \
+ (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i];
}
ret = vsi_nn_CalcReshapeTensor(inputs[0],
outputs[0],
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c
index 002b39be5..1a719af73 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c
@@ -43,6 +43,7 @@
#include "vsi_nn_log.h"
#include "vsi_nn_internal_node.h"
#include "kernel/vsi_nn_kernel.h"
+#include "vsi_nn_error.h"
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
@@ -83,7 +84,7 @@ static vsi_status op_compute
}
else
{
- char kernel_name[128];
+ char kernel_name[128] = {0};
vsi_nn_kernel_param_t * param = NULL;
int32_t align_corners = self->nn_param.resize.align_corners;
int32_t half_pixel_centers = self->nn_param.resize.half_pixel_centers;
@@ -156,6 +157,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -171,6 +175,7 @@ static vsi_bool op_setup
float factor = self->nn_param.resize.factor;
vsi_enum layout = self->nn_param.resize.layout;
vsi_nn_internal_node_t* curr = NULL;
+ vsi_bool ret = FALSE;
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
@@ -220,13 +225,14 @@ static vsi_bool op_setup
vsi_nn_internal_init_node_wksp( self );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_INTERNAL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.resize_internal.align_corners = self->nn_param.resize.align_corners;
curr->node->nn_param.resize_internal.factor = self->nn_param.resize.factor;
curr->node->nn_param.resize_internal.half_pixel_centers = self->nn_param.resize.half_pixel_centers;
curr->node->nn_param.resize_internal.layout = self->nn_param.resize.layout;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
else if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num))
{
@@ -234,12 +240,18 @@ static vsi_bool op_setup
vsi_nn_internal_init_node_wksp( self );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
+ }
+ else
+ {
+ ret = TRUE;
}
- return TRUE;
+final:
+ return ret;
} /* op_setup() */
static vsi_status op_deinit
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c
index c05ec675a..d1b499ec7 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c
@@ -34,6 +34,7 @@
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
+#include "vsi_nn_error.h"
/*
Declare number of input and output.
@@ -71,6 +72,9 @@ static vsi_status op_compute
{
vsi_status status = VSI_FAILURE;
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
status = vsi_nn_internal_compute_node( self );
return status;
@@ -102,6 +106,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
@@ -114,6 +121,7 @@ static vsi_bool op_setup
{
float factor = self->nn_param.resize_1d.factor;
vsi_nn_internal_node_t* curr = NULL;
+ vsi_bool ret = FALSE;
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
@@ -135,36 +143,40 @@ static vsi_bool op_setup
{
vsi_nn_internal_init_node_wksp( self );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
else if (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize_1d.type)
{
vsi_nn_internal_init_node_wksp( self );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_1D_BILINEAR_INTERNAL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.resize_1d_bilinear_internal.align_corners = self->nn_param.resize_1d.align_corners;
curr->node->nn_param.resize_1d_bilinear_internal.factor = self->nn_param.resize_1d.factor;
curr->node->nn_param.resize_1d_bilinear_internal.half_pixel_centers = \
self->nn_param.resize_1d.half_pixel_centers;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
else if (VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize_1d.type)
{
vsi_nn_internal_init_node_wksp( self );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_1D_NEAREST_INTERNAL, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.resize_1d_nearest_internal.align_corners = self->nn_param.resize_1d.align_corners;
curr->node->nn_param.resize_1d_nearest_internal.factor = self->nn_param.resize_1d.factor;
curr->node->nn_param.resize_1d_nearest_internal.half_pixel_centers = \
self->nn_param.resize_1d.half_pixel_centers;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
- return TRUE;
+final:
+ return ret;
} /* op_setup() */
static vsi_status op_init
@@ -172,6 +184,8 @@ static vsi_status op_init
vsi_nn_node_t* self
)
{
+ VSI_UNREFERENCED(self);
+
return VSI_SUCCESS;
} /* op_init() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c
index 66ea066ed..5b37e89a8 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c
@@ -135,7 +135,7 @@ static vsi_status op_init
vsi_nn_node_t* self
)
{
-
+ VSI_UNREFERENCED(self);
return VSI_SUCCESS;
} /* op_init() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c
index edddc1a27..b202f8ca3 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c
@@ -134,6 +134,7 @@ static vsi_status op_init
vsi_nn_node_t* self
)
{
+ VSI_UNREFERENCED(self);
return VSI_SUCCESS;
} /* op_init() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_3d.c
new file mode 100644
index 000000000..989bb1b70
--- /dev/null
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_3d.c
@@ -0,0 +1,334 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <string.h>
+#include <stdlib.h>
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_node.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_ops.h"
+#include "vsi_nn_tensor.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "vsi_nn_error.h"
+#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_tensor_util.h"
+
+typedef struct _resize_3d_local_data_t {
+ int32_t placeholder;
+} resize_3d_local_data_t;
+
+/*
+ Declare number of input and output.
+ */
+#define _INPUT_NUM (1)
+#define _OUTPUT_NUM (1)
+
+static vsi_status op_compute
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ vsi_status status = VSI_FAILURE;
+ vsi_nn_tensor_t * reshape_inputs[1] = {NULL};
+ vsi_nn_tensor_t * reshape_outputs[1] = {NULL};
+
+ if ( self->nn_param.resize_3d.lcl_data->use_internal_node )
+ {
+ status = vsi_nn_internal_compute_node( self );
+ }
+ else
+ {
+        char kernel_name[128] = {0};
+ vsi_nn_kernel_param_t * param = NULL;
+ int32_t align_corners = self->nn_param.resize_3d.align_corners;
+ int32_t half_pixel_centers = self->nn_param.resize_3d.half_pixel_centers;
+ vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}};
+ uint32_t new_rank = 4;
+ uint32_t i = 0;
+
+ if (inputs[0]->attr.dim_num > 3)
+ {
+ shapes[0][0] = inputs[0]->attr.size[0];
+ shapes[0][1] = inputs[0]->attr.size[1];
+ shapes[0][2] = inputs[0]->attr.size[2];
+ shapes[1][0] = outputs[0]->attr.size[0];
+ shapes[1][1] = outputs[0]->attr.size[1];
+ shapes[1][2] = outputs[0]->attr.size[2];
+ shapes[0][3] = 1;
+ shapes[1][3] = 1;
+
+ for (i = 3; i < inputs[0]->attr.dim_num; i++)
+ {
+ shapes[0][3] = shapes[0][3] * inputs[0]->attr.size[i];
+ }
+ shapes[1][3] = shapes[0][3];
+
+ reshape_inputs[0] = vsi_nn_reshape_tensor(self->graph, inputs[0], shapes[0], new_rank);
+ reshape_outputs[0] = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes[1], new_rank);
+
+ if (reshape_inputs[0] == NULL || reshape_outputs[0] == NULL)
+ {
+ VSILOGE("reshape tensor failed");
+ status = VSI_FAILURE;
+ goto final;
+ }
+ }
+ else
+ {
+ reshape_inputs[0] = inputs[0];
+ reshape_outputs[0] = outputs[0];
+ }
+
+
+ param = vsi_nn_kernel_param_create();
+
+ vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners );
+ vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers );
+ vsi_nn_kernel_param_add_int32( param, "type", self->nn_param.resize_3d.type );
+
+ switch (self->nn_param.resize_3d.type)
+ {
+ case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR:
+ snprintf(kernel_name, sizeof(kernel_name),
+ "resize_3d_nearest");
+ break;
+ case VSI_NN_INTERPOLATION_BILINEAR:
+ snprintf(kernel_name, sizeof(kernel_name),
+ "resize_3d_bilinear");
+ break;
+ default:
+ break;
+ }
+
+ self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
+ kernel_name, &reshape_inputs[0], 1, &reshape_outputs[0], 1, param );
+
+ if (self->n) {
+ status = VSI_SUCCESS;
+ }
+
+        vsi_nn_kernel_param_release(&param);
+ }
+
+final:
+    /* Release only the temporary tensors created by vsi_nn_reshape_tensor;
+       when rank <= 3 the reshape_* aliases point at the graph's own tensors. */
+    if (inputs[0]->attr.dim_num > 3)
+    {
+        vsi_safe_release_tensor( reshape_inputs[0] );
+        vsi_safe_release_tensor( reshape_outputs[0] );
+    }
+
+ return status;
+} /* op_compute() */
+
+static vsi_bool _is_same_shape
+ (
+ vsi_nn_tensor_t * inputs,
+ vsi_size_t *sizes,
+ uint32_t dims
+ )
+{
+ uint32_t i = 0;
+
+ if (inputs->attr.dim_num != dims)
+ return FALSE;
+
+ for (i = 0; i < dims; i++)
+ {
+ if (sizes[i] != inputs->attr.size[i])
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+static vsi_status op_optimize
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs,
+ vsi_nn_opt_direction_e direction
+ )
+{
+ if ( self->nn_param.resize_3d.lcl_data->use_internal_node )
+ {
+ return vsi_nn_internal_optimize_node(self, direction );
+ }
+ else
+ {
+ int32_t half_pixel_centers = self->nn_param.resize_3d.half_pixel_centers;
+ vsi_size_t * input_size = inputs[0]->attr.size;
+ vsi_size_t * output_size = outputs[0]->attr.size;
+
+ if ( (output_size[0] % input_size[0] == 0) && (output_size[1] % input_size[1] == 0) &&
+ half_pixel_centers == TRUE && self->nn_param.resize_3d.type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR )
+ {
+ self->nn_param.resize_3d.half_pixel_centers = FALSE;
+ }
+
+ return VSI_SUCCESS;
+ }
+} /* op_optimize() */
+
+static vsi_bool op_check
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ BEGIN_IO_TYPE_DECL(RESIZE_3D, 1, 1)
+ IO_TYPE(D_F16, D_U8|Q_ASYM)
+ IO_TYPE(D_F32, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_F16)
+ IO_TYPE(D_F32, D_F32)
+ IO_TYPE(D_BF16, D_BF16)
+ IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM)
+ IO_TYPE(D_U8|Q_ASYM, D_F16)
+ IO_TYPE(D_U8|Q_ASYM, D_F32)
+ IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM)
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM)
+ IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM)
+ END_IO_TYPE_DECL(RESIZE_3D)
+ if (!VALIDATE_OP_IO_TYPES(RESIZE_3D, self, inputs, self->input.num, outputs, self->output.num)) {
+ char* desc = generate_op_io_types_desc(inputs,
+ self->input.num, outputs, self->output.num);
+ VSILOGE("Inputs/Outputs data type not support: %s", desc);
+ destroy_op_io_types_desc(desc);
+ return FALSE;
+ }
+
+ return TRUE;
+} /* op_check() */
+
+static vsi_bool op_setup
+ (
+ vsi_nn_node_t * self,
+ vsi_nn_tensor_t ** inputs,
+ vsi_nn_tensor_t ** outputs
+ )
+{
+ float factor = self->nn_param.resize_3d.factor;
+ vsi_nn_internal_node_t* curr = NULL;
+ uint32_t i = 0;
+ vsi_bool ret = TRUE;
+
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ {
+ outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
+ if (factor != 0)
+ {
+ outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor);
+ outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor);
+ outputs[0]->attr.size[2] = (uint32_t)(inputs[0]->attr.size[2] * factor);
+ }
+ else
+ {
+ outputs[0]->attr.size[0] = self->nn_param.resize_3d.size[0];
+ outputs[0]->attr.size[1] = self->nn_param.resize_3d.size[1];
+ outputs[0]->attr.size[2] = self->nn_param.resize_3d.size[2];
+ }
+ for (i = 3; i < inputs[0]->attr.dim_num; i++)
+ {
+ outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
+ }
+ }
+
+ if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num))
+ {
+        self->nn_param.resize_3d.lcl_data->use_internal_node = TRUE;
+ vsi_nn_internal_init_node_wksp( self );
+ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
+ curr->inputs[0] = inputs[0];
+ curr->outputs[0] = outputs[0];
+ ret = vsi_nn_internal_setup_node(self, curr);
+ }
+
+final:
+ return ret;
+} /* op_setup() */
+
+static vsi_status op_init(vsi_nn_node_t* self) {
+ vsi_status status = VSI_SUCCESS;
+
+ self->nn_param.resize_3d.lcl_data =
+ (vsi_nn_resize_3d_local_data*)malloc(sizeof(vsi_nn_resize_3d_local_data));
+ if (NULL == self->nn_param.resize_3d.lcl_data) {
+ VSILOGE("Create resize_3d local data fail.");
+ status = VSI_FAILURE;
+ goto final;
+ }
+ memset(self->nn_param.resize_3d.lcl_data, 0, sizeof(vsi_nn_resize_3d_local_data));
+
+ self->nn_param.resize_3d.align_corners = FALSE;
+ self->nn_param.resize_3d.half_pixel_centers = FALSE;
+
+
+final:
+ return status;
+} /* op_init() */
+
+static vsi_status op_deinit
+ (
+ vsi_nn_node_t* self
+ )
+{
+ if (self->nn_param.resize_3d.lcl_data->use_internal_node)
+ {
+ vsi_nn_safe_free(self->nn_param.resize_3d.lcl_data);
+ vsi_nn_internal_deinit_node_wksp(self);
+ }
+ else
+ {
+ vsi_nn_safe_free(self->nn_param.resize_3d.lcl_data);
+ vsi_nn_op_common_deinit(self);
+ }
+
+ return VSI_SUCCESS;
+} /* op_deinit() */
+
+__BEGIN_DECLS
+
+/* Registrar */
+DEF_OP_REG
+ (
+ /* op_name */ RESIZE_3D,
+ /* init */ op_init,
+ /* compute */ op_compute,
+ /* deinit */ op_deinit,
+ /* check */ op_check,
+ /* setup */ op_setup,
+ /* optimize */ op_optimize,
+ /* input_num */ _INPUT_NUM,
+ /* output_num */ _OUTPUT_NUM
+ );
+
+__END_DECLS
+
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c
index 50924672f..1a9ad7d77 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c
@@ -36,6 +36,7 @@
#include "vsi_nn_log.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
@@ -183,7 +184,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_bool ret = TRUE;
+ vsi_bool ret = FALSE;
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_internal_init_node_wksp(self);
@@ -201,21 +202,26 @@ static vsi_bool op_setup
attr.vtl = TRUE;
attr.is_const = FALSE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_REVERSE, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = output_tensor->t;
curr->node->nn_param.reverse.axis = self->nn_param.reverse.axis;
curr->node->nn_param.reverse.axis_num = self->nn_param.reverse.axis_num;
- vsi_nn_internal_setup_node(self, curr);
+ ret &= vsi_nn_internal_setup_node(self, curr);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = output_tensor->t;
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret &= vsi_nn_internal_setup_node(self, curr);
}
return ret;
+final:
+ return FALSE;
} /* op_setup() */
#ifdef __cplusplus
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c
index 38df1523b..2632ed652 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c
@@ -89,41 +89,43 @@ static vsi_bool op_check
)
{
uint32_t input_idx = 0;
- do {
+ {
vsi_bool break_early = FALSE;
// input_idx = 0 : inputs[0].shape = shape(batch_size, input_size)
- if (input_idx >= self->input.num) break;
+ if (input_idx >= self->input.num) goto continue_point;
break_early = (inputs[input_idx]->attr.dim_num != 2);
- if (break_early) break;
+ if (break_early) goto continue_point;
input_idx ++;
// input_idx = 1 : inputs[1].shape = shape(num_units, input_size)
- if (input_idx >= self->input.num) break;
+ if (input_idx >= self->input.num) goto continue_point;
break_early = (inputs[input_idx]->attr.dim_num != 2);
- if (break_early) break;
+ if (break_early) goto continue_point;
input_idx ++;
// input_idx = 2 : inputs[2].shape = shape(num_units, num_units)
- if (input_idx >= self->input.num) break;
+ if (input_idx >= self->input.num) goto continue_point;
break_early = (inputs[input_idx]->attr.dim_num != 2);
- if (break_early) break;
+ if (break_early) goto continue_point;
input_idx ++;
// input_idx = 3 : inputs[3].shape = shape(num_units)
- if (input_idx >= self->input.num) break;
+ if (input_idx >= self->input.num) goto continue_point;
break_early = (inputs[input_idx]->attr.dim_num != 1);
- if (break_early) break;
+ if (break_early) goto continue_point;
input_idx ++;
// input_idx = 4 : inputs[4].shape = shape(batch_size, num_units)
- if (input_idx >= self->input.num) break;
+ if (input_idx >= self->input.num) goto continue_point;
break_early = (inputs[input_idx]->attr.dim_num != 2);
- if (break_early) break;
+ if (break_early) goto continue_point;
input_idx ++;
return TRUE;
- } while(0);
+ }
+
+continue_point:
{
BEGIN_IO_TYPE_DECL(RNN, 5, 1)
@@ -155,6 +157,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) {
outputs[0]->attr.size[0] = inputs[4]->attr.size[0];
outputs[0]->attr.size[1] = inputs[4]->attr.size[1];
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c
index a5f82613a..b2c254fd9 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c
@@ -46,6 +46,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -56,6 +58,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -68,6 +73,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -98,6 +105,7 @@ static vsi_bool setup_op_shapes
attr.is_const = TRUE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
inputs[RNNCELL_INPUT_H_STATE] = output_tensor->t;
}
@@ -108,6 +116,7 @@ static vsi_bool setup_op_shapes
memcpy( &attr.dtype, &outputs[RNNCELL_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) );
attr.vtl = TRUE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
outputs[RNNCELL_OUTPUT_H_STATE] = output_tensor->t;
}
@@ -131,7 +140,10 @@ static vsi_bool setup_op_shapes
outputs[RNNCELL_OUTPUT_OUTPUT]->attr.size,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) );
}
+
return TRUE;
+final:
+ return FALSE;
}
static vsi_bool op_setup
@@ -207,6 +219,7 @@ static vsi_bool op_setup
inputs[RNNCELL_INPUT_BIAS_I],
&p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs, "Create internal tensor failed", final);
if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL)
{
aux_input_gate_fc_outputs = vsi_nn_rnn_create_tp_fc(self,
@@ -215,6 +228,7 @@ static vsi_bool op_setup
NULL,
&p->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(aux_input_gate_fc_outputs, "Create internal tensor failed", final);
}
}
else
@@ -225,6 +239,7 @@ static vsi_bool op_setup
&kernel_h, &kernel_w);
input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[RNNCELL_INPUT_INPUT],
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final);
tmp = vsi_nn_rnn_create_nn_fc(self,
input_tensor->t,
@@ -233,9 +248,11 @@ static vsi_bool op_setup
kernel_h, kernel_w,
&p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final);
/* transpose and reshape output */
input_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h,
kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs, "Create internal tensor failed", final);
if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL)
{
/* reshape and transpose input */
@@ -245,6 +262,8 @@ static vsi_bool op_setup
input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self,
inputs[RNNCELL_INPUT_AUX_INPUT],
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final);
+
tmp = vsi_nn_rnn_create_nn_fc(self,
input_tensor->t,
inputs[RNNCELL_INPUT_AUX_INPUT],
@@ -252,10 +271,13 @@ static vsi_bool op_setup
kernel_h, kernel_w,
&p->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final);
+
/* transpose and reshape output */
aux_input_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self,
tmp->t, p->local->multi_batch, kernel_h,
kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(aux_input_gate_fc_outputs, "Create internal tensor failed", final);
}
}
@@ -268,6 +290,7 @@ static vsi_bool op_setup
inputs[RNNCELL_INPUT_BIAS_H],
&p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs, "Create internal tensor failed", final);
}
else
{
@@ -277,6 +300,7 @@ static vsi_bool op_setup
hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self,
inputs[RNNCELL_INPUT_H_STATE],
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(hstate_input_tensor, "Create internal tensor failed", final);
tmp = vsi_nn_rnn_create_nn_fc(self,
hstate_input_tensor->t,
@@ -285,9 +309,12 @@ static vsi_bool op_setup
kernel_h, kernel_w,
&p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final);
+
/* transpose and reshape output */
hstate_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self,
tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs, "Create internal tensor failed", final);
}
input_add_hstate_outputs = vsi_nn_rnn_create_tensor_add(self,
@@ -295,14 +322,22 @@ static vsi_bool op_setup
hstate_gate_fc_outputs->t,
&p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(input_add_hstate_outputs, "Create internal tensor failed", final);
if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL)
{
+ if (aux_input_gate_fc_outputs == NULL ||
+ input_add_hstate_outputs == NULL)
+ {
+ return FALSE;
+ }
+
gate_fc_outputs = vsi_nn_rnn_create_tensor_add(self,
input_add_hstate_outputs->t,
aux_input_gate_fc_outputs->t,
&p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I],
use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(gate_fc_outputs, "Create internal tensor failed", final);
}
else
{
@@ -311,6 +346,7 @@ static vsi_bool op_setup
/* activation */
curr = vsi_nn_internal_new_node( self, vsi_nn_rnn_get_act_op_type(p->activation), 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.tanh.scale_a = 1.0;
curr->node->nn_param.tanh.scale_b = 1.0;
curr->inputs[0] = gate_fc_outputs->t;
@@ -320,12 +356,15 @@ static vsi_bool op_setup
if (outputs[RNNCELL_OUTPUT_H_STATE] != NULL)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = outputs[RNNCELL_OUTPUT_OUTPUT];
curr->outputs[0] = outputs[RNNCELL_OUTPUT_H_STATE];
vsi_nn_internal_setup_node(self, curr);
}
return TRUE;
+final:
+ return FALSE;
} /* op_setup() */
static vsi_status op_deinit
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c
index 12668f0b5..f97dd1c07 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c
@@ -157,6 +157,8 @@ static vsi_status op_optimize
uint32_t dim;
vx_tensor rois_tmp;
+ VSI_UNREFERENCED(outputs);
+
rois_tmp = NULL;
if( direction == VSI_NN_OPTIMIZE_FORWARD && inputs[1]->attr.dim_num == 2 )
{
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c
index 87a714451..6d607b488 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c
@@ -37,7 +37,7 @@
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_link_list.h"
#include "utils/vsi_nn_dtype_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
+#include "vsi_nn_error.h"
#define _INPUT_NUM (3)
#define _OUTPUT_NUM (1)
@@ -49,6 +49,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -59,6 +61,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -71,18 +76,20 @@ static vsi_bool op_setup
)
{
vsi_nn_internal_node_t* curr = NULL;
- vsi_bool ret = TRUE;
+ vsi_bool ret = FALSE;
vsi_nn_internal_init_node_wksp( node );
curr = vsi_nn_internal_new_node( node, VSI_NN_OP_A_TIMES_B_PLUS_C, node->input.num, node->output.num );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->inputs[1] = inputs[1];
curr->inputs[2] = inputs[2];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(node, curr);
+ ret = vsi_nn_internal_setup_node(node, curr);
+final:
return ret;
} /* op_setup() */
@@ -94,6 +101,9 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c
index 99f8e4056..a6e6c8ead 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c
@@ -129,6 +129,8 @@ static vsi_bool op_setup
uint32_t i = 0;
uint32_t indices_dims = inputs[1]->attr.dim_num;
+ VSI_UNREFERENCED(self);
+
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c
index d8c9842e1..462a2cad9 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c
@@ -30,10 +30,11 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
-#include "vsi_nn_error.h"
+#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_tensor_util_prv.h"
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
@@ -75,7 +76,32 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_int32( param, "block_size", block_size );
vsi_nn_kernel_param_add_int32( param, "coord_dim", coord_dim );
vsi_nn_kernel_param_add_int32( param, "idx_num", idx_num );
- n = vsi_nn_kernel_selector( self->graph, "scatter_nd", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
+
+ if (vsi_nn_is_same_data_type(inputs[1], outputs[0]) == FALSE ||
+ vsi_nn_is_same_quant_type(inputs[1], outputs[0]))
+ {
+ n = vsi_nn_kernel_selector( self->graph, "scatter_nd", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
+ }
+ else
+ {
+ vsi_nn_tensor_attr_t attr;
+ vsi_nn_tensor_t* temp_tensors = NULL;
+
+ VSILOGW("scatter_nd is no_range_change operation! \
+ Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!");
+
+ memcpy( &attr, &outputs[0]->attr, sizeof(attr));
+ memcpy( &attr.dtype, &inputs[1]->attr.dtype, sizeof(attr.dtype));
+ attr.is_const = FALSE;
+ attr.vtl = TRUE;
+ temp_tensors = vsi_nn_CreateTensor( self->graph, &attr );
+
+ vsi_nn_kernel_selector( self->graph, "scatter_nd", inputs, _INPUT_NUM, &temp_tensors, _OUTPUT_NUM, param );
+ n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t);
+
+ vsi_safe_release_tensor(temp_tensors);
+ }
+
if ( n != NULL )
{
self->n = (vx_node)n;
@@ -134,6 +160,8 @@ static vsi_bool op_setup
uint32_t i = 0;
vsi_nn_scatter_nd_param * p = &(self->nn_param.scatter_nd);
+ VSI_UNREFERENCED(inputs);
+
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
if (p->shape == NULL)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c
index 63900eb98..e3e19ade7 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c
@@ -141,6 +141,8 @@ static vsi_bool op_setup
/* TODO: Add code to comput outputs' shape. */
uint32_t i = 0;
+ VSI_UNREFERENCED(self);
+
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c
index 485dcd5ef..7efc8c767 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c
@@ -48,68 +48,15 @@ static vsi_status op_compute
)
{
vsi_status status = VSI_FAILURE;
- vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL };
- vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
- vsi_size_t* shapes_ptr[_IO_NUM];
- vsi_size_t* shapes_in[_INPUT_NUM];
- vsi_size_t rank_in[_INPUT_NUM];
- uint32_t new_rank = 0;
- int32_t i = 0;
- vsi_bool ret = FALSE;
- vsi_nn_context_t ctx = NULL;
if ( NULL == self )
{
return VSI_FAILURE;
}
- ctx = self->graph->ctx;
-
- for (i = 0; i < _IO_NUM; i++)
- {
- shapes_ptr[i] = shapes[i];
- }
-
- for (i = 0; i < _INPUT_NUM; i++)
- {
- shapes_in[i] = inputs[i]->attr.size;
- rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num;
- }
-
- ret = vsi_nn_kernel_optimize_broadcast_shape(
- (const vsi_size_t**)shapes_in, rank_in, _INPUT_NUM,
- outputs[0]->attr.size, outputs[0]->attr.dim_num,
- shapes_ptr, shapes[_INPUT_NUM], &new_rank);
-
- if ( ret && !ctx->config.support_stream_processor )
- {
- for (i = 0; i < _INPUT_NUM; i++)
- {
- reshape_tensors[i] = vsi_nn_reshape_tensor( self->graph,
- inputs[i], shapes[i], new_rank );
- }
-
- for (i = 0; i < _OUTPUT_NUM; i++)
- {
- reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( self->graph,
- outputs[i], shapes[i + _INPUT_NUM], new_rank );
- }
-
- self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select",
- &reshape_tensors[0], _INPUT_NUM,
- &reshape_tensors[_INPUT_NUM], _OUTPUT_NUM, NULL );
-
- for (i = 0; i < _IO_NUM; i++)
- {
- vsi_safe_release_tensor( reshape_tensors[i] );
- }
- }
- else
- {
- self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select",
- inputs, _INPUT_NUM,
- outputs, _OUTPUT_NUM, NULL );
- }
+ self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select",
+ inputs, _INPUT_NUM,
+ outputs, _OUTPUT_NUM, NULL );
if ( self->n )
{
@@ -247,6 +194,8 @@ static vsi_bool op_setup
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool ret = TRUE;
+ VSI_UNREFERENCED(self);
+
in0_rank = inputs[0]->attr.dim_num;
in1_rank = inputs[1]->attr.dim_num;
in2_rank = inputs[2]->attr.dim_num;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c
index 500e6761e..dc54ba7ad 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c
@@ -100,6 +100,14 @@ static vsi_bool op_check
IO_TYPE(D_I32, D_I32, D_F32)
IO_TYPE(D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM)
+ IO_TYPE(D_I8|Q_DFP, D_NONE, D_I8|Q_DFP)
+ IO_TYPE(D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM)
+ IO_TYPE(D_I16|Q_DFP, D_NONE, D_I16|Q_DFP)
+ IO_TYPE(D_I16|Q_ASYM, D_NONE, D_I16|Q_ASYM)
+ IO_TYPE(D_I16|Q_SYM, D_NONE, D_I16|Q_SYM)
+ IO_TYPE(D_I16|Q_SYM, D_NONE, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_NONE, D_F16)
+ IO_TYPE(D_F16, D_NONE, D_U8|Q_ASYM)
IO_TYPE(D_I32, D_NONE, D_U8|Q_ASYM)
IO_TYPE(D_I32, D_NONE, D_BOOL8)
END_IO_TYPE_DECL(SEQUENCE_MASK)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c
index bb41e98ad..f922b8d16 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c
@@ -37,6 +37,7 @@
#include "kernel/vsi_nn_kernel.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
#define _ARG_NUM (3)
#define _INPUT_NUM (1)
@@ -136,6 +137,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
if (self->input.num > 1)
{
return VSI_SUCCESS;
@@ -153,9 +156,10 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_nn_slice_param * p;
+ vsi_nn_slice_param * p = NULL;
vsi_nn_internal_node_t* curr = NULL;
- uint32_t i;
+ uint32_t i = 0;
+ vsi_bool ret = FALSE;
if (self->nn_param.slice.dims == 0)
{
@@ -187,6 +191,7 @@ static vsi_bool op_setup
}
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims;
curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num;
curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims;
@@ -199,9 +204,10 @@ static vsi_bool op_setup
curr->node->nn_param.strided_slice.new_axis_mask = 0;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
- return TRUE;
+final:
+ return ret;
} /* op_setup() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c
index c81639929..27431a73f 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c
@@ -37,6 +37,7 @@
#include "utils/vsi_nn_math.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_tensor_util_prv.h"
+#include "vsi_nn_error.h"
static vsi_status op_compute
(
@@ -45,6 +46,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -123,6 +126,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
if (VSI_NN_OPTIMIZE_BACKWARD == direction)
{
return VSI_SUCCESS;
@@ -174,7 +179,9 @@ static vsi_bool op_setup
)
{
vsi_nn_internal_node_t* curr = NULL;
- if( NULL == self )
+ vsi_bool ret = FALSE;
+
+ if ( NULL == self )
{
return FALSE;
}
@@ -202,13 +209,15 @@ static vsi_bool op_setup
vsi_nn_internal_init_node_wksp(self);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_SOFTMAX_INTERNAL, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
curr->node->nn_param.softmax_internal.beta = self->nn_param.softmax.beta;
curr->node->nn_param.softmax_internal.axis = self->nn_param.softmax.axis;
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
- return TRUE;
+final:
+ return ret;
}
#ifdef __cplusplus
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c
index 0dbe88c87..0d85eb13e 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c
@@ -45,6 +45,8 @@ static vsi_bool _need_split_softmax
)
{
vsi_bool ret = FALSE;
+ VSI_UNREFERENCED(self);
+
if(inputs[0]->attr.dim_num == 2 && inputs[0]->attr.size[1] > MAX_SOFTMAX_BATCH)
{
ret = TRUE;
@@ -250,6 +252,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
//TODO: Check tensor shapes.
return TRUE;
} /* op_check() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c
index d6e201e5b..71615e740 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c
@@ -35,8 +35,7 @@
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_math.h"
-#include "libnnext/vsi_nn_vxkernel.h"
-#include "libnnext/vx_lib_nnext.h"
+#include "vsi_nn_error.h"
#include "vsi_nn_test.h"
#include "utils/vsi_nn_constraint_check.h"
@@ -103,6 +102,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
if (self->nn_param.space2depth.block_size[0] != self->nn_param.space2depth.block_size[1])
{
return vsi_nn_internal_optimize_node(self, direction );
@@ -142,12 +143,13 @@ static vsi_bool op_set_space2depth_internal
vsi_nn_op_t type_name
)
{
- vsi_bool retn = TRUE;
+ vsi_bool retn = FALSE;
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_internal_init_node_wksp( self );
curr = vsi_nn_internal_new_node( self, type_name, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.space2depth_internal.block_size_x =
self->nn_param.space2depth.block_size[0];
curr->node->nn_param.space2depth_internal.block_size_y =
@@ -156,6 +158,7 @@ static vsi_bool op_set_space2depth_internal
curr->outputs[0] = outputs[0];
retn = vsi_nn_internal_setup_node(self, curr);
+final:
return retn;
}
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c
index 9810b2c09..65dc6de93 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c
@@ -37,6 +37,7 @@
#include "utils/vsi_nn_link_list.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
static vsi_status op_compute
(
@@ -45,6 +46,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -178,9 +181,9 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
- vsi_bool ret;
- uint32_t i, num;
- vsi_size_t average;
+ vsi_bool ret = FALSE;
+ uint32_t i = 0, num = 0;
+ vsi_size_t average = 1;
vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 };
uint32_t axis = self->nn_param.split.axis;
@@ -189,8 +192,6 @@ static vsi_bool op_setup
vsi_nn_split_param * p = NULL;
vsi_nn_internal_node_t* curr = NULL;
- ret = TRUE;
- average = 1;
/* compute the output tensor number */
num = (uint32_t)(self->output.num - 1);
while ( NULL == outputs[num] )
@@ -237,6 +238,7 @@ static vsi_bool op_setup
p->lcl_data->end_dims[j] = (int32_t)end[j];
}
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims;
curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num;
curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims;
@@ -249,10 +251,12 @@ static vsi_bool op_setup
curr->node->nn_param.strided_slice.new_axis_mask = 0;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[i];
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
}
return ret;
+final:
+ return FALSE;
} /* op_setup() */
static vsi_status op_init
@@ -309,28 +313,12 @@ static vsi_status op_deinit
p = &(self->nn_param.split);
- if (p->lcl_data->begin_dims)
- {
- free(p->lcl_data->begin_dims);
- p->lcl_data->begin_dims = NULL;
- }
-
- if (p->lcl_data->end_dims)
- {
- free(p->lcl_data->end_dims);
- p->lcl_data->end_dims = NULL;
- }
-
- if (p->lcl_data->stride_dims)
- {
- free(p->lcl_data->stride_dims);
- p->lcl_data->stride_dims = NULL;
- }
-
- if (p->lcl_data)
+ if (p && p->lcl_data)
{
- free(p->lcl_data);
- p->lcl_data = NULL;
+ vsi_nn_safe_free(p->lcl_data->begin_dims);
+ vsi_nn_safe_free(p->lcl_data->end_dims);
+ vsi_nn_safe_free(p->lcl_data->stride_dims);
+ vsi_nn_safe_free(p->lcl_data);
}
vsi_nn_internal_deinit_node_wksp( self );
@@ -346,6 +334,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
}
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c
index 3609aad4f..4e0a5e566 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c
@@ -35,6 +35,7 @@
#include "kernel/vsi_nn_kernel.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_internal_node.h"
+#include "vsi_nn_error.h"
/*
Declare number of input and output.
@@ -49,6 +50,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -106,6 +109,7 @@ static vsi_bool op_setup
vsi_bool shouldSqueeze[VSI_NN_MAX_DIM_NUM] = {FALSE};
uint32_t numDimsSqueezed = 0;
vsi_nn_internal_node_t* curr = NULL;
+ vsi_bool ret = FALSE;
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
@@ -122,7 +126,7 @@ static vsi_bool op_setup
{
int32_t rank = self->nn_param.squeeze.axis[i];
- rank = rank < 0 ? rank + inputs[0]->attr.dim_num : rank;
+ rank = rank < 0 ? rank + (int32_t)inputs[0]->attr.dim_num : rank;
if ( !shouldSqueeze[rank] )
{
@@ -145,13 +149,15 @@ static vsi_bool op_setup
vsi_nn_internal_init_node_wksp( self );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.reshape2.size = outputs[0]->attr.size;
curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
- return TRUE;
+final:
+ return ret;
} /* op_setup() */
static vsi_status op_deinit
@@ -172,6 +178,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
}
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c
index 9b59d9920..d59c6f5d1 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c
@@ -37,7 +37,7 @@
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_link_list.h"
#include "utils/vsi_nn_dtype_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
+#include "vsi_nn_error.h"
#define _ARG_NUM (1)
#define _INPUT_NUM VSI_NN_STACK_MAX_INPUTS
@@ -53,6 +53,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -63,6 +65,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -84,7 +89,7 @@ static vsi_bool op_setup
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_tensor_t *output_rs = NULL;
vsi_nn_stack_lcl_data * data = NULL;
- vsi_bool ret = TRUE;
+ vsi_bool ret = FALSE;
vx_int8 is_scalar = vsi_nn_GetTensorIsScalar(inputs[0]);
vsi_nn_internal_init_node_wksp( node );
@@ -122,10 +127,12 @@ static vsi_bool op_setup
if (1 == node->input.num)
{
curr = vsi_nn_internal_new_node( node, VSI_NN_OP_RESHAPE2, 1, 1);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num;
curr->node->nn_param.reshape2.size = outputs[0]->attr.size;
+ ret = vsi_nn_internal_setup_node(node, curr);
goto final;
}
@@ -133,17 +140,13 @@ static vsi_bool op_setup
input_shape[1] = block_num;
curr = vsi_nn_internal_new_node( node, VSI_NN_OP_CONCAT, node->input.num, node->output.num );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
for (i = 0; i < node->input.num; i++)
{
vsi_nn_tensor_t *input_rs = NULL;
/* Malloc ptr */
data = (vsi_nn_stack_lcl_data *)malloc( sizeof(vsi_nn_stack_lcl_data) );
- if( NULL == data )
- {
- VSILOGE( "Create stack local data fail." );
- ret = FALSE;
- goto final;
- }
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(data, curr, "Create buffer failed", final);
memset( data, 0, sizeof(vsi_nn_stack_lcl_data) );
input_rs = vsi_nn_reshape_tensor(node->graph, inputs[i], input_shape, 2);
@@ -171,16 +174,18 @@ static vsi_bool op_setup
/* Malloc ptr */
data = (vsi_nn_stack_lcl_data *)malloc( sizeof(vsi_nn_stack_lcl_data) );
- if( NULL == data )
- {
- VSILOGE( "Create stack local data fail." );
- ret = FALSE;
- goto final;
- }
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(data, curr, "Create buffer failed", final);
memset( data, 0, sizeof(vsi_nn_stack_lcl_data) );
output_rs = vsi_nn_reshape_tensor(node->graph, outputs[0], output_shape, 2);
- data->src_in = output_rs;
+ if (output_rs == NULL)
+ {
+ vsi_nn_internal_release_node(&curr);
+ VSILOGD("Create reshape tensor failed\n");
+ vsi_nn_safe_free(data);
+ goto final;
+ }
+ data->src_in = output_rs;
/* Store node, ptr */
vsi_nn_LinkListPushStart(
(vsi_nn_link_list_t **)&node->nn_param.stack.lcl_data,
@@ -188,10 +193,9 @@ static vsi_bool op_setup
curr->outputs[0] = output_rs;
curr->node->nn_param.concat.axis = axis;
+ ret = vsi_nn_internal_setup_node(node, curr);
final:
- vsi_nn_internal_setup_node(node, curr);
-
return ret;
} /* op_setup() */
@@ -203,6 +207,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c
index 1cf2891ad..ae43c05c8 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c
@@ -749,6 +749,8 @@ static vsi_status op_optimize
vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_bool is_same_quant_type = FALSE;
vsi_bool is_same_shape = TRUE;
+ vsi_size_t input_elements = 0;
+ vsi_size_t output_elements = 0;
/* Only forward run stride_slice's optimize */
if ( direction == VSI_NN_OPTIMIZE_BACKWARD )
@@ -775,38 +777,49 @@ static vsi_status op_optimize
VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid);
- if ( NULL == inputs[0]->t )
- {
- vsi_nn_TensorReinit( self->graph, inputs[0] );
- }
-
- /* Create tensor from view */
- memcpy( start, (vsi_size_t*)start_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM );
- memcpy( end, (vsi_size_t*)stop_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM );
- in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, inputs[0]);
- if ( NULL == in_view_tensor )
- {
- VSILOGE( "Create tensor %d from view fail.", i );
- status = VSI_FAILURE;
- goto OnError;
- }
-
self->nn_param.strided_slice.lcl2_data->is_optimized = TRUE;
is_same_quant_type = _is_same_quant(inputs, outputs);
- if ( NULL != outputs[0]->t || is_same_quant_type == FALSE)
+ input_elements = vsi_nn_GetElementNum( inputs[0] );
+ output_elements = vsi_nn_GetElementNum( outputs[0] );
+ if (NULL != outputs[0]->t && NULL == inputs[0]->t &&
+ is_same_quant_type && input_elements == output_elements)
{
- VSILOGI( "stride slice copy tensor.");
- // Copy old tensor values to the new address.
- status = copy_tensor_to_view( self, in_view_tensor, outputs[0], shape, is_same_shape);
- if ( VSI_FAILURE == status )
- {
- goto OnError;
- }
+ inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t,
+ (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num,
+ sizeof(inputs[0]->attr.size[0]) );
}
else
{
- outputs[0]->t = in_view_tensor;
+ if ( NULL == inputs[0]->t )
+ {
+ vsi_nn_TensorReinit( self->graph, inputs[0] );
+ }
+ /* Create tensor from view */
+ memcpy( start, (vsi_size_t*)start_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM );
+ memcpy( end, (vsi_size_t*)stop_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM );
+ in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, inputs[0]);
+ if ( NULL == in_view_tensor )
+ {
+ VSILOGE( "Create tensor %d from view fail.", i );
+ status = VSI_FAILURE;
+ goto OnError;
+ }
+
+ if ( NULL != outputs[0]->t || is_same_quant_type == FALSE)
+ {
+ VSILOGI( "stride slice copy tensor.");
+ // Copy old tensor values to the new address.
+ status = copy_tensor_to_view( self, in_view_tensor, outputs[0], shape, is_same_shape);
+ if ( VSI_FAILURE == status )
+ {
+ goto OnError;
+ }
+ }
+ else
+ {
+ outputs[0]->t = in_view_tensor;
+ }
}
OnError:
@@ -841,32 +854,32 @@ static vsi_status op_deinit
vsi_nn_safe_free( params->end_dims );
vsi_nn_safe_free( params->stride_dims );
- if (lcl2_data->cp_node)
+ if (lcl2_data && lcl2_data->cp_node)
{
vxReleaseNode( &lcl2_data->cp_node );
}
- if (lcl2_data->src_tensor)
+ if (lcl2_data && lcl2_data->src_tensor)
{
vxReleaseTensor( &lcl2_data->src_tensor );
}
- if (lcl2_data->dst_tensor && !lcl2_data->is_same_shape)
+ if (lcl2_data && lcl2_data->dst_tensor && !lcl2_data->is_same_shape)
{
vxReleaseTensor( &lcl2_data->dst_tensor );
}
- if (lcl2_data->begin_dims)
+ if (lcl2_data && lcl2_data->begin_dims)
{
free(lcl2_data->begin_dims);
}
- if (lcl2_data->end_dims)
+ if (lcl2_data && lcl2_data->end_dims)
{
free(lcl2_data->end_dims);
}
- if (lcl2_data->stride_dims)
+ if (lcl2_data && lcl2_data->stride_dims)
{
free(lcl2_data->stride_dims);
}
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c
index b8b4c1e53..080183652 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c
@@ -31,6 +31,7 @@
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_prv.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
static vsi_status _create_local_tensor
(
@@ -129,6 +130,7 @@ static vsi_status op_compute
attr.is_const = TRUE;
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
bias_tensor = vsi_nn_CreateTensor(self->graph, &attr);
+ CHECK_PTR_FAIL_GOTO( bias_tensor, "Create tensor fail.", final );
param.bias = bias_tensor->t;
}
@@ -145,6 +147,7 @@ static vsi_status op_compute
status = VSI_SUCCESS;
}
+final:
if (bias_tensor != NULL) vsi_nn_ReleaseTensor(&bias_tensor);
return status;
} /* op_compute() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c
index 812cea379..61a541c79 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c
@@ -63,6 +63,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c
index 78f350858..ff15f81de 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c
@@ -49,7 +49,7 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
- vsi_status status = VX_FAILURE;
+ vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_tensor_add_mean_stddev_norm_param * p = NULL;
float eps;
@@ -113,6 +113,8 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(node);
+
/* TODO: Add code to comput outputs' shape. */
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c
index 3098b6cf8..82f104a58 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c
@@ -141,6 +141,8 @@ static vsi_bool op_setup
vsi_nn_tensorstackconcat_param *p = NULL;
int32_t axis = 0;
+ VSI_UNREFERENCED(outputs);
+
if ( NULL == self )
{
return ret;
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c
index 647396fdb..b6fb26ec7 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c
@@ -41,6 +41,30 @@
Declare number of input and output.
*/
+static vsi_bool _is_supported_axis(vsi_size_t* multiples, vsi_size_t multiples_num)
+{
+ vsi_size_t i = 0;
+
+ if ( multiples_num < 4)
+ {
+ return TRUE;
+ }
+ else if ( multiples_num > 4)
+ {
+ return FALSE;
+ }
+
+ for ( i = 3; i < multiples_num; i++)
+ {
+ if (multiples[i] > 1)
+ {
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
static vsi_status _tile_op_compute
(
const char * kernel_name,
@@ -49,18 +73,100 @@ static vsi_status _tile_op_compute
vsi_nn_tensor_t ** outputs
)
{
- vsi_status status = VSI_FAILURE;
+ vsi_status status = VSI_FAILURE;
+ vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
+ vsi_size_t new_rank = 0;
+ vsi_bool ret = FALSE;
+ vsi_size_t* multiples = (vsi_size_t*)self->nn_param.tile.multiples;
+ vsi_nn_tensor_t* temp_tensors[2] = { NULL };
+ vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
+ vsi_nn_tensor_attr_t attr;
+
+ if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE)
+ {
+ VSILOGW("tile is no_range_change operation! \
+ Insert DataConvert Operation when the quantization parameters\
+ of input and output are inconsistent!");
- self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
- kernel_name,
- &inputs[0], 1,
- &outputs[0], 1, NULL );
+ memcpy( &attr, &outputs[0]->attr, sizeof(attr));
+ memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype));
+ attr.is_const = FALSE;
+ attr.vtl = TRUE;
+ temp_tensors[1] = vsi_nn_CreateTensor( self->graph, &attr );
+ }
+ else
+ {
+ temp_tensors[1] = outputs[0];
+ }
- if( self->n )
+ ret = vsi_nn_kernel_optimize_tile_shape(
+ inputs[0]->attr.size, inputs[0]->attr.dim_num,
+ multiples, inputs[0]->attr.dim_num,
+ temp_tensors[1]->attr.size, temp_tensors[1]->attr.dim_num,
+ shapes[0], shapes[1], shapes[2], &new_rank );
+
+ if (ret)
+ {
+ if (_is_supported_axis(shapes[1], new_rank) == FALSE)
+ {
+ reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\
+ shapes[0], (vsi_size_t)new_rank );
+ reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, temp_tensors[1],\
+ shapes[2], (vsi_size_t)new_rank );
+ if (reshape_tensors[0] == NULL || reshape_tensors[1] == NULL)
+ {
+ VSILOGE("reshape tensor failed!");
+ status = VSI_FAILURE;
+ goto final;
+ }
+
+ memcpy( &attr, &reshape_tensors[0]->attr, sizeof(attr));
+ attr.is_const = FALSE;
+ attr.vtl = TRUE;
+ attr.size[0] = reshape_tensors[1]->attr.size[0];
+ attr.size[1] = reshape_tensors[1]->attr.size[1];
+
+ temp_tensors[0] = vsi_nn_CreateTensor( self->graph, &attr );
+
+ self->n = (vx_node)vsi_nn_kernel_selector(
+ self->graph, kernel_name, &reshape_tensors[0], 1, &temp_tensors[0], 1, NULL);
+ self->n = (vx_node)vsi_nn_kernel_selector(
+ self->graph, kernel_name, &temp_tensors[0], 1, &reshape_tensors[1], 1, NULL);
+
+ }
+ else
+ {
+ reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\
+ shapes[0], (vsi_size_t)new_rank );
+ reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, temp_tensors[1],\
+ shapes[2], (vsi_size_t)new_rank );
+ if (reshape_tensors[0] == NULL || reshape_tensors[1] == NULL)
+ {
+ VSILOGE("reshape tensor failed!");
+ status = VSI_FAILURE;
+ goto final;
+ }
+
+ self->n = (vx_node)vsi_nn_kernel_selector( self->graph, kernel_name,\
+ &reshape_tensors[0], 1, &reshape_tensors[1], 1, NULL );
+ }
+ }
+
+ if ( self->n )
{
status = VSI_SUCCESS;
}
+final:
+ vsi_safe_release_tensor(reshape_tensors[0]);
+ vsi_safe_release_tensor(reshape_tensors[1]);
+ vsi_safe_release_tensor(temp_tensors[0]);
+ if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE)
+ {
+ self->n = vxTensorCopyNode( self->graph->g, temp_tensors[1]->t, outputs[0]->t);
+ vsi_safe_release_tensor(temp_tensors[1]);
+ }
+
return status;
} /* _tile_op_compute() */
@@ -71,7 +177,7 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
- /*TODO: Check tensor shapes. */
+ /*TODO: Check tensor shapes. */
vsi_nn_tile_param * p;
BEGIN_IO_TYPE_DECL(TILE, 1, 1)
@@ -88,6 +194,8 @@ static vsi_bool op_check
IO_TYPE(D_I32, D_I32)
IO_TYPE(D_U32, D_U32)
IO_TYPE(D_F32, D_F32)
+ IO_TYPE(D_F32, D_U8|Q_ASYM)
+ IO_TYPE(D_F16, D_U8|Q_ASYM)
END_IO_TYPE_DECL(TILE)
if (!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) {
char* desc = generate_op_io_types_desc(inputs,
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c
index d797af2cd..ff8c0e0fd 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c
@@ -36,10 +36,59 @@
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
+#include "vsi_nn_error.h"
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (2)
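+/* Adds a vxTensorPermuteNode to the graph. When output_tensor is NULL, a new
+ * tensor with the permuted shape is created and returned (the caller releases
+ * it); otherwise the given tensor is used as the node output and NULL is
+ * returned. */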
+vsi_nn_tensor_t* _create_permute_node
+ (
+ vsi_nn_node_t* self,
+ vsi_nn_tensor_t* input_tensor,
+ vsi_nn_tensor_t* output_tensor,
+ uint32_t* perm,
+ uint32_t dim_num,
+ vsi_bool use_virtual_tensor
+ )
+{
+ vsi_nn_tensor_t* tensor0 = NULL;
+ vsi_nn_tensor_t *output = NULL;
+
+ if (output_tensor)
+ {
+ output = output_tensor;
+ }
+ else
+ {
+ uint32_t i = 0;
+ vsi_nn_tensor_attr_t attr;
+ memcpy(&attr, &input_tensor->attr, sizeof(attr));
+ attr.vtl = use_virtual_tensor;
+ for ( i = 0; i < dim_num; i++ )
+ {
+ attr.size[i] = input_tensor->attr.size[perm[i]];
+ }
+ tensor0 = vsi_nn_CreateTensor( self->graph, &attr );
+ CHECK_PTR_FAIL_GOTO( tensor0, "Create tensor fail.", final );
+ output = tensor0;
+ }
+ self->n = vxTensorPermuteNode(
+ self->graph->g,
+ input_tensor->t,
+ output->t,
+ perm,
+ dim_num
+ );
+ if (self->n == NULL)
+ {
+ vsi_safe_release_tensor(tensor0);
+ }
+
+final:
+ return tensor0;
+}
+
static vsi_status op_compute
(
vsi_nn_node_t * self,
@@ -49,18 +98,122 @@ static vsi_status op_compute
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t * param = NULL;
+ vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
+ vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
+ uint32_t rank_in = 0;
+ uint32_t rank_out = 0;
+ int32_t new_axis0 = 0;
+ int32_t new_axis1 = 0;
+ int32_t axis = self->nn_param.topk.axis;
+ int32_t top_k = self->nn_param.topk.k;
+ vsi_nn_tensor_t * in_tensor = NULL;
+ vsi_nn_tensor_t * out0_tensor = NULL;
+ vsi_nn_tensor_t * out1_tensor = NULL;
+ vsi_bool ret = FALSE;
+
+ ret = vsi_nn_kernel_optimize_softmax_shape(
+ inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
+ shapes[0], &rank_in, &new_axis0);
+
+ ret = vsi_nn_kernel_optimize_softmax_shape(
+ outputs[0]->attr.size, outputs[0]->attr.dim_num, axis,
+ shapes[1], &rank_out, &new_axis1);
param = vsi_nn_kernel_param_create();
- vsi_nn_kernel_param_add_int32( param, "top_k", self->nn_param.topk.k );
+ vsi_nn_kernel_param_add_int32( param, "top_k", top_k );
+
+ if (ret)
+ {
+ uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0};
+ uint32_t perm_out[VSI_NN_MAX_DIM_NUM] = {0};
+ vsi_nn_tensor_t* input_tensor = NULL;
+ vsi_nn_tensor_t* outputs_tensor[2] = {NULL};
+
+ reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph,
+ inputs[0], shapes[0], rank_in );
+ reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+ outputs[0], shapes[1], rank_in );
+ reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph,
+ outputs[1], shapes[1], rank_in );
+
+ axis = new_axis0;
+
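+ /* The topk kernel is fed with the reduction axis as dimension 0: for any
+  * other axis, permute the input so that axis becomes dim 0, run topk into
+  * temporary tensors, then permute both outputs back further below. */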
+ if (axis != 0)
+ {
+ uint32_t i = 0;
+ uint32_t index = 0;
+
+ vsi_nn_tensor_attr_t attr0, attr1;
+ memcpy(&attr0, &reshape_tensors[1]->attr, sizeof(attr0));
+ memcpy(&attr1, &reshape_tensors[2]->attr, sizeof(attr1));
+
+ attr0.vtl = TRUE;
+ attr1.vtl = TRUE;
+ attr0.size[index] = (vsi_size_t)top_k;
+ attr1.size[index] = (vsi_size_t)top_k;
+ perm_in[index ++] = (uint32_t)axis;
+ for ( i = 0; i < rank_in; i++ )
+ {
+ if ((int32_t)i == axis)
+ continue;
+ attr0.size[index] = shapes[1][i];
+ attr1.size[index] = shapes[1][i];
+ perm_in[index ++] = i;
+ }
+
+ perm_out[axis] = 0;
+ for ( i = 1, index = 0; i < rank_in; i++ )
+ {
+ if ((int32_t)index == axis)
+ {
+ index ++;
+ }
+ perm_out[index ++] = i;
+ }
+
+ out0_tensor = vsi_nn_CreateTensor( self->graph, &attr0 );
+ CHECK_PTR_FAIL_GOTO( out0_tensor, "Create tensor fail.", final );
+ out1_tensor = vsi_nn_CreateTensor( self->graph, &attr1 );
+ CHECK_PTR_FAIL_GOTO( out1_tensor, "Create tensor fail.", final );
+
+ in_tensor = _create_permute_node(self, reshape_tensors[0], NULL, perm_in, rank_in, TRUE);
+ CHECK_PTR_FAIL_GOTO( in_tensor, "Create internal tensor fail.", final );
+
+ input_tensor = in_tensor;
+ outputs_tensor[0] = out0_tensor;
+ outputs_tensor[1] = out1_tensor;
+ }
+ else
+ {
+ input_tensor = reshape_tensors[0];
+ outputs_tensor[0] = reshape_tensors[1];
+ outputs_tensor[1] = reshape_tensors[2];
+ }
+
+ self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk",
+ &input_tensor, _INPUT_NUM,
+ outputs_tensor, _OUTPUT_NUM, param );
- self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk",
- inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
+ if (axis != 0)
+ {
+ _create_permute_node(self, outputs_tensor[0], reshape_tensors[1], perm_out, rank_in, TRUE);
+ _create_permute_node(self, outputs_tensor[1], reshape_tensors[2], perm_out, rank_in, TRUE);
+ }
+ }
- if( self->n )
+ if ( self->n )
{
status = VSI_SUCCESS;
}
+final:
+ vsi_safe_release_tensor( reshape_tensors[0] );
+ vsi_safe_release_tensor( reshape_tensors[1] );
+ vsi_safe_release_tensor( reshape_tensors[2] );
+ vsi_safe_release_tensor( in_tensor );
+ vsi_safe_release_tensor( out0_tensor );
+ vsi_safe_release_tensor( out1_tensor );
+
return status;
} /* op_compute() */
@@ -107,29 +260,38 @@ static vsi_bool op_setup
/* TODO: Add code to compute outputs' shape. */
uint32_t i;
- if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
vsi_nn_topk_param * p;
p = &(self->nn_param.topk);
+
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
- outputs[0]->attr.size[0] = p->k;
- for (i = 1; i < inputs[0]->attr.dim_num; i++)
+ outputs[0]->attr.size[p->axis] = p->k;
+ for (i = 0; i < inputs[0]->attr.dim_num; i++)
{
+ if ((int32_t)i == p->axis)
+ {
+ continue;
+ }
outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
}
}
- if( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num )
+ if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num )
{
vsi_nn_topk_param * p;
p = &(self->nn_param.topk);
outputs[1]->attr.dim_num = inputs[0]->attr.dim_num;
- outputs[1]->attr.size[0] = p->k;
- for (i = 1; i < inputs[0]->attr.dim_num; i++)
+ outputs[1]->attr.size[p->axis] = p->k;
+ for (i = 0; i < inputs[0]->attr.dim_num; i++)
{
+ if ((int32_t)i == p->axis)
+ {
+ continue;
+ }
outputs[1]->attr.size[i] = inputs[0]->attr.size[i];
}
}
@@ -137,6 +299,17 @@ static vsi_bool op_setup
return TRUE;
} /* op_setup() */
+static vsi_status op_init
+ (
+ vsi_nn_node_t * self
+ )
+{
+ vsi_status status = VSI_SUCCESS;
+ self->nn_param.topk.axis = 0;
+
+ return status;
+} /* op_init() */
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -144,7 +317,7 @@ extern "C" {
DEF_OP_REG
(
/* op_name */ TOPK,
- /* init */ NULL,
+ /* init */ op_init,
/* compute */ op_compute,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c
index a6d526633..ece932e6e 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c
@@ -35,9 +35,9 @@
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
-#include "libnnext/vsi_nn_vxkernel.h"
#include "vsi_nn_internal_node.h"
#include "vsi_nn_rnn_helper.h"
+#include "vsi_nn_error.h"
static vsi_bool setup_op_shapes
(
@@ -80,6 +80,7 @@ static vsi_bool setup_op_shapes
attr.is_const = TRUE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
inputs[RNN_INPUT_H_STATE] = output_tensor->t;
}
@@ -91,6 +92,7 @@ static vsi_bool setup_op_shapes
attr.vtl = use_virtual_tensor;
attr.is_const = FALSE;
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
outputs[RNN_OUTPUT_H_STATE] = output_tensor->t;
}
@@ -112,6 +114,8 @@ static vsi_bool setup_op_shapes
}
return TRUE;
+final:
+ return FALSE;
}
static vsi_status op_compute
@@ -121,6 +125,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -131,6 +137,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -143,6 +152,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -168,6 +179,8 @@ static vsi_bool op_setup
vsi_size_t batch_size = 0;
vsi_size_t time_step = 0;
uint32_t i = 0;
+ vsi_bool ret = FALSE;
+ vsi_status status = VSI_FAILURE;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
@@ -193,21 +206,28 @@ static vsi_bool op_setup
/* transpose to time_major */
output_tensor = vsi_nn_rnn_transpose_time_major(self,
inputs[RNN_INPUT_INPUT], NULL, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final);
input_tensor = output_tensor->t;
}
/* split input tensor */
split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( split_output_tensors, "Create buffer fail.", final );
memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
rnncell_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step *
sizeof(vsi_nn_tensor_t **));
+ CHECK_PTR_FAIL_GOTO( rnncell_reshape_output_tensors, "Create buffer fail.", final );
memset( rnncell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **));
- vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors,
+ (uint32_t)time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
- vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor);
+ CHECK_STATUS_FAIL_GOTO(status, final);
last_step_h_state = inputs[RNN_INPUT_H_STATE];
+
for( i = 0; i < time_step; i++ )
{
vsi_nn_tensor_t* reshape_output = NULL;
@@ -217,26 +237,30 @@ static vsi_bool op_setup
/* reshape for split output */
output_tensor = vsi_nn_rnn_reshape_split_output(self,
split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor);
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
reshape_output = output_tensor->t;
/* rnncell output */
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[RNN_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
rnncell_out0 = output_tensor->t;
/* rnncell output h_state */
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[RNN_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
rnncell_out1 = output_tensor->t;
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation;
if ( reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ||
reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 )
{
- int32_t k = 0;
+ size_t k = 0;
for (k = 0; k < _cnt_of_array( curr_param->internal_dtype ); k++)
{
if (curr_param->internal_dtype[k].vx_type == VSI_NN_TYPE_NONE)
@@ -274,6 +298,7 @@ static vsi_bool op_setup
vsi_nn_internal_init_tensor_attr(&attr,
&outputs[RNN_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor);
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final );
tensor = output_tensor->t;
}
@@ -281,6 +306,7 @@ static vsi_bool op_setup
if (outputs[RNN_OUTPUT_H_STATE] != NULL)
{
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = last_step_h_state;
curr->outputs[0] = outputs[RNN_OUTPUT_H_STATE];
vsi_nn_internal_setup_node(self, curr);
@@ -288,13 +314,14 @@ static vsi_bool op_setup
/* concat rnncell output, the rnn's output is 3-dims */
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.concat.axis = 2;
for( i = 0; i < time_step; i++ )
{
curr->inputs[i] = rnncell_reshape_output_tensors[i];
}
curr->outputs[0] = tensor;
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
if( !curr_param->time_major )
{
@@ -303,10 +330,11 @@ static vsi_bool op_setup
tensor, outputs[RNN_OUTPUT_OUTPUT], use_virtual_tensor);
}
+final:
vsi_nn_safe_free( split_output_tensors );
vsi_nn_safe_free( rnncell_reshape_output_tensors );
- return TRUE;
+ return ret;
} /* op_setup() */
static vsi_status op_deinit
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c
index 7e57e3223..35d84a5f8 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c
@@ -34,8 +34,8 @@
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
-#include "libnnext/vsi_nn_vxkernel.h"
#include "vsi_nn_internal_node.h"
+#include "vsi_nn_error.h"
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (VSI_NN_UNSTACK_MAX_OUTPUTS)
@@ -47,6 +47,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -58,6 +60,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
@@ -68,6 +72,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(self);
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
@@ -94,6 +101,7 @@ static vsi_bool op_setup
uint32_t i = 0, j = 0;
uint32_t rank = inputs[0]->attr.dim_num;
int8_t is_scalar = (rank - 1) == 0 ? TRUE : FALSE;
+ vsi_bool ret = FALSE;
vsi_nn_internal_init_node_wksp( self );
@@ -172,10 +180,13 @@ static vsi_bool op_setup
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor);
input_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final);
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
reshape_input_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_input_size, curr, "Create internal buffer failed", final);
reshape_input_size[0] = block_size;
reshape_input_size[1] = tensor_num;
reshape_input_size[2] = block_num;
@@ -186,23 +197,28 @@ static vsi_bool op_setup
curr->outputs[0] = input_tensor->t;
vsi_nn_internal_setup_node( self, curr );
+ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, tensor_num );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
slices = (uint32_t *)vsi_nn_internal_new_node_param(curr,
tensor_num * sizeof(uint32_t));
- curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, tensor_num );
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(slices, curr, "Create internal buffer failed", final);
curr->node->nn_param.split.axis = 1;
curr->node->nn_param.split.slices = slices;
curr->node->nn_param.split.slices_num = tensor_num;
curr->inputs[0] = input_tensor->t;
output_tensors = (vsi_nn_internal_tensor_t**)malloc(tensor_num * sizeof(vsi_nn_internal_tensor_t*));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( output_tensors, curr, "Create tensor fail.", final );
+
for (i = 0; i < tensor_num; i++)
{
slices[i] = 1;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
vsi_nn_internal_init_tensor_attr(&attr, &outputs[i]->attr.dtype, use_virtual_tensor);
output_tensors[i] = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
+ CHECK_PTR_FAIL_GOTO(output_tensors[i], "Create internal tensor failed", final);
curr->outputs[i] = output_tensors[i]->t;
}
- vsi_nn_internal_setup_node( self, curr );
+ ret = vsi_nn_internal_setup_node( self, curr );
for (i = 0; i < tensor_num; i++)
{
@@ -210,10 +226,12 @@ static vsi_bool op_setup
output_size = (vsi_size_t *)vsi_nn_internal_new_node_param(curr,
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
+ CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(output_size, curr, "Create internal buffer failed", final);
memcpy(output_size, outputs[i]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.reshape2.size = output_size;
curr->node->nn_param.reshape2.dim_num = outputs[i]->attr.dim_num;
curr->inputs[0] = output_tensors[i]->t;
@@ -221,9 +239,10 @@ static vsi_bool op_setup
vsi_nn_internal_setup_node( self, curr );
}
+final:
vsi_nn_safe_free(output_tensors);
- return TRUE;
+ return ret;
} /* op_setup() */
static vsi_status op_deinit
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c
index 1923b26a6..36bbdbc34 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c
@@ -35,7 +35,6 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "ops/vsi_nn_op_upsample.h"
-#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "utils/vsi_nn_constraint_check.h"
@@ -144,17 +143,20 @@ static vsi_status op_compute
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
uint32_t new_rank = 0;
- vsi_bool ret;
+ vsi_bool ret = FALSE;
vsi_nn_kernel_param_t * param = NULL;
- int32_t scale_x = (int32_t)self->nn_param.upsample.scale[0];
- int32_t scale_y = (int32_t)self->nn_param.upsample.scale[1];
+ int32_t scale_x = 0;
+ int32_t scale_y = 0;
if( NULL == self )
{
return VSI_FAILURE;
}
- param =vsi_nn_kernel_param_create();
+ scale_x = (int32_t)self->nn_param.upsample.scale[0];
+ scale_y = (int32_t)self->nn_param.upsample.scale[1];
+
+ param = vsi_nn_kernel_param_create();
ret = vsi_nn_upsample_optimize_shape(self,
(vsi_ssize_t*)inputs[0]->attr.size, (vsi_ssize_t*)inputs[1]->attr.size,
@@ -164,7 +166,7 @@ static vsi_status op_compute
vsi_nn_kernel_param_add_int32( param, "scale_x", scale_x );
vsi_nn_kernel_param_add_int32( param, "scale_y", scale_y );
- if( ret )
+ if ( ret )
{
reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph,
inputs[0], shapes[0], new_rank );
@@ -180,7 +182,7 @@ static vsi_status op_compute
vsi_nn_ReleaseTensor( &reshape_tensors[2] );
}
- if( self->n )
+ if ( self->n )
{
status = VSI_SUCCESS;
}
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c
index 6bb917586..4b7dd3f61 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c
@@ -35,6 +35,7 @@
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_error.h"
typedef struct _upsamplescale_local_data_t {
int32_t placeholder;
@@ -56,8 +57,8 @@ static vsi_status op_compute
)
{
vsi_status status = VSI_FAILURE;
- int32_t stride = self->nn_param.upsamplescale.stride;
- float scale = self->nn_param.upsamplescale.scale;
+ int32_t stride = 0;
+ float scale = 0;
vsi_nn_kernel_param_t * param = NULL;
if( NULL == self )
@@ -65,12 +66,15 @@ static vsi_status op_compute
return VSI_FAILURE;
}
+ stride = self->nn_param.upsamplescale.stride;
+ scale = self->nn_param.upsamplescale.scale;
+
if (stride == 1 || vsi_nn_abs(scale - 1.0f) == _EPSILON)
{
return vsi_nn_internal_compute_node( self );
}
- param =vsi_nn_kernel_param_create();
+ param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "stride", stride );
vsi_nn_kernel_param_add_float32( param, "scale", scale );
@@ -82,7 +86,7 @@ static vsi_status op_compute
vsi_nn_kernel_param_release( &param );
- if( self->n )
+ if ( self->n )
{
status = VSI_SUCCESS;
}
@@ -141,6 +145,9 @@ static vsi_status op_optimize
int32_t stride = self->nn_param.upsamplescale.stride;
float scale = self->nn_param.upsamplescale.scale;
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
+
if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON)
{
return vsi_nn_internal_optimize_node( self, direction );
@@ -163,30 +170,34 @@ static vsi_bool op_setup
float scale = self->nn_param.upsamplescale.scale;
int32_t i = 0;
vsi_nn_internal_node_t* curr = NULL;
+ vsi_bool ret = FALSE;
vsi_nn_internal_init_node_wksp(self);
if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
else if (stride == 1)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LINEAR, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.linear.a = scale;
curr->node->nn_param.linear.b = 0;
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
else if (vsi_nn_abs(scale - 1.0f) == _EPSILON)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESIZE, 0, 0);
+ CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.resize.type = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR;
curr->node->nn_param.resize.align_corners = FALSE;
curr->node->nn_param.resize.half_pixel_centers = FALSE;
@@ -195,7 +206,7 @@ static vsi_bool op_setup
curr->inputs[0] = inputs[0];
curr->outputs[0] = outputs[0];
- vsi_nn_internal_setup_node(self, curr);
+ ret = vsi_nn_internal_setup_node(self, curr);
}
else
{
@@ -206,9 +217,12 @@ static vsi_bool op_setup
outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
}
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
+
+ ret = TRUE;
}
- return TRUE;
+final:
+ return ret;
} /* op_setup() */
static vsi_status op_init
@@ -216,6 +230,8 @@ static vsi_status op_init
vsi_nn_node_t* self
)
{
+ VSI_UNREFERENCED(self);
+
return VSI_SUCCESS;
} /* op_init() */
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c
index a8a2a7e0b..f4dcb531e 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c
@@ -44,6 +44,8 @@ static vsi_status op_compute
vsi_nn_tensor_t ** outputs
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
@@ -69,6 +71,8 @@ static vsi_status op_optimize
vsi_nn_opt_direction_e direction
)
{
+ VSI_UNREFERENCED(inputs);
+ VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */
diff --git a/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c b/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c
index 7d1b9cf09..6e0ec8d03 100644
--- a/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c
+++ b/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c
@@ -348,7 +348,7 @@ static vx_status resize_binlinear
}
}
- return VX_SUCCESS;
+ return VSI_SUCCESS;
}
#endif
@@ -455,14 +455,15 @@ static void _convolve_same
float *input,
uint32_t input_size,
double *kernel,
- uint32_t kernel_size,
+ int32_t kernel_size,
float *output
)
{
- uint32_t pad,pad_input_size;
- uint32_t i,k,offset;
+ uint32_t pad = 0, pad_input_size = 0;
+ uint32_t i = 0, offset = 0;
+ int32_t k = 0;
float *pad_input = NULL;
- double sum;
+ double sum = 0;
uint32_t pad_input_sizef,input_sizef;
if(NULL == input || NULL == kernel || NULL == output)
@@ -536,6 +537,9 @@ static void set_cols
)
{
uint32_t w;
+
+ VSI_UNREFERENCED(height);
+
if(NULL == data || cols == NULL)
{
return ;
@@ -947,6 +951,7 @@ static vsi_nn_con_candidate_t *_get_connection_candidate
{
con_candidate = (vsi_nn_con_candidate_t *)
vsi_nn_LinkListNewNode(sizeof(vsi_nn_con_candidate_t), _init_candidate);
+ CHECK_PTR_FAIL_GOTO( con_candidate, "null point.", final );
sum++;
con_candidate->data.i = i;
@@ -963,6 +968,8 @@ static vsi_nn_con_candidate_t *_get_connection_candidate
}
*candidate_sum = sum;
+
+final:
return con_candidate_list;
}
@@ -1276,6 +1283,8 @@ static vsi_nn_subset_t *_compute_subset
vsi_nn_subset_t *subset_list = NULL, *subset = NULL;
uint32_t *deleteIdx = NULL;
+ VSI_UNREFERENCED(all_connection_num);
+
if(NULL == all_connection ||
NULL == candidate ||
NULL == special_k ||
@@ -1319,6 +1328,8 @@ static vsi_nn_subset_t *_compute_subset
{
sig_subset= (vsi_nn_subset_t *)
vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)subset_list, j);
+ CHECK_PTR_FAIL_GOTO( sig_subset, "null point.", final );
+
if(sig_subset->data.idx[indexA] == partAs[i] ||
sig_subset->data.idx[indexB] == partBs[i])
{
@@ -1338,6 +1349,8 @@ static vsi_nn_subset_t *_compute_subset
int32_t ii = partBs[i];
sig_connect = (vsi_nn_connection_t *)
vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i);
+ CHECK_PTR_FAIL_GOTO( sig_connect, "get point fail.", final );
+
sig_subset->data.idx[indexB] = (float)ii;
sig_subset->data.idx[20 - 1] += 1;
sig_subset->data.idx[20 - 2] +=
@@ -1362,6 +1375,8 @@ static vsi_nn_subset_t *_compute_subset
vsi_nn_subset_t *j2_iter = j2_subset;
sig_connect = (vsi_nn_connection_t *)
vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i);
+ CHECK_PTR_FAIL_GOTO( sig_connect, "get point fail.", final );
+
for(ii=0; ii<(20-2); ii++)
{
j1_iter->data.idx[ii] += j2_iter->data.idx[ii] + 1;
@@ -1380,6 +1395,8 @@ static vsi_nn_subset_t *_compute_subset
int32_t ii = partBs[i];
sig_connect = (vsi_nn_connection_t *)
vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i);
+ CHECK_PTR_FAIL_GOTO( sig_connect, "get point fail.", final );
+
sum = candidate[ii].score + sig_connect->data.score;
j1_subset->data.idx[indexB] = (float)ii;
j1_subset->data.idx[20 - 1] += 1;
@@ -1413,7 +1430,7 @@ static vsi_nn_subset_t *_compute_subset
subset = (vsi_nn_subset_t *)
vsi_nn_LinkListNewNode(sizeof(vsi_nn_subset_t), _init_subset);
-
+ CHECK_PTR_FAIL_GOTO( subset, "null point.", final );
memcpy(&subset->data, row, sizeof(float) * 20);
vsi_nn_LinkListPushEnd(
@@ -1433,6 +1450,7 @@ static vsi_nn_subset_t *_compute_subset
memset(deleteIdx, -1, sizeof(uint32_t) * num);
subset = subset_list;
+ CHECK_PTR_FAIL_GOTO( subset, "null point.", final );
for(i=0,j=0; idata.idx[20 - 1];
@@ -1445,28 +1463,13 @@ static vsi_nn_subset_t *_compute_subset
}
for(i=0; idata.idx[i]);
- }
- subset = (vsi_nn_subset_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)subset);
- n++;
- }
- #endif
-
final:
if(deleteIdx)free(deleteIdx);
return subset_list;
@@ -1499,6 +1502,7 @@ static vsi_nn_connection_t **_compute_all_connetion
score_mid = (float *)malloc(sizeof(float) * height * width * score_mid_depth);
CHECK_PTR_FAIL_GOTO( score_mid, "Create buffer fail.", final );
connection_all = (vsi_nn_connection_t **)malloc(sizeof(vsi_nn_connection_t *) * mapIdx_len);
+ CHECK_PTR_FAIL_GOTO( connection_all, "Create buffer fail.", final );
special_k = (int32_t *)malloc(sizeof(int32_t) * mapIdx_len);
CHECK_PTR_FAIL_GOTO( special_k, "Create buffer fail.", final );
@@ -1836,6 +1840,7 @@ vsi_status vsi_nn_CMUPose_Post_Process
_fill_paf_avg(net_out, config, paf_avg);
all_peaks = _compute_all_peaks(heatmap_avg, config, &peak_counter, &peak_list_num);
+ CHECK_PTR_FAIL_GOTO( all_peaks, "Create buffer fail.", final );
#if 0
for(n=0; nnode_num; i++)
{
node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)i );
//printf("i[%u] op[%s]\n", i, vsi_nn_OpGetName(node->op));
- if(node->op == VSI_NN_OP_PROPOSAL)
+ if (node && node->op == VSI_NN_OP_PROPOSAL)
{
memcpy(&param->iminfo, &node->nn_param.proposal.im_info,
sizeof(vsi_nn_proposal_im_info));
tensor = vsi_nn_GetTensor(graph,node->output.tensors[0]);
+ CHECK_PTR_FAIL_GOTO( tensor, "Get tensor fail.", final );
+
param->rois_num = (uint32_t)tensor->attr.size[1];
}
}
@@ -164,6 +165,7 @@ static vsi_status _fill_fasterrcnn_param
param->classes_num = VSI_NN_FASTERRCNN_CLASSES_NUM;
param->classes = FASTER_RCNN_CLASSES;
+final:
return status;
} /* _fill_fasterrcnn_param() */
@@ -572,6 +574,7 @@ static vsi_status _fasterrcnn_post_process
{
box = (vsi_nn_fasterrcnn_box_t *)
vsi_nn_LinkListNewNode(sizeof(vsi_nn_fasterrcnn_box_t), _init_box);
+ CHECK_PTR_FAIL_GOTO( box, "Create box fail.", final );
box->score = dets[keep[k]*5+4];
box->class_id = i;
box->x1 = dets[keep[k]*5+0];
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c b/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c
index 85d862d23..27a3c45c7 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c
@@ -34,8 +34,11 @@ static vsi_nn_binary_tree_t * _new_node
node = (vsi_nn_binary_tree_t *)malloc(
sizeof( vsi_nn_binary_tree_t ) );
+ if (node)
+ {
+ memset( node, 0, sizeof( vsi_nn_binary_tree_t ) );
+ }
- memset( node, 0, sizeof( vsi_nn_binary_tree_t ) );
return node;
} /* _new_node() */
@@ -181,7 +184,7 @@ void vsi_nn_BinaryTreeRemoveNode
vsi_nn_binary_tree_key_t key
)
{
- if( NULL == root && NULL != *root )
+ if ( NULL == root )
{
return;
}
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c
index 4ce42c95e..d696e8cd5 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c
@@ -465,6 +465,7 @@ static _op_param_gen_t s_op_gen[] =
/* INVERSE_SIGMOID */ NULL,
/* GRID_SAMPLE */ NULL,
/* LPNORM */ NULL,
+ /* RESIZE_3D */ NULL,
};
_compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );
@@ -548,6 +549,10 @@ void vsi_nn_GenGraphCCode
node_id = i;
}
node = vsi_nn_GetNode( graph, node_id );
+ if (node == NULL)
+ {
+ continue;
+ }
_write_code( "node[%u] = vsi_nn_AppendNode( graph, %#x, NULL );",
i, node->op );
for( j = 0; j < node->input.num; j ++ )
@@ -567,7 +572,7 @@ void vsi_nn_GenGraphCCode
}
}
// write node params
- if( node->op < _cnt_of_array( s_op_gen ) )
+ if( node->op < (vsi_nn_op_t)_cnt_of_array( s_op_gen ) )
{
if( NULL != s_op_gen[node->op] )
{
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c
index 95f5cc7fb..22ab7bb47 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c
@@ -77,6 +77,8 @@ static const char* _get_qtype_name(vsi_nn_qnt_type_e type)
case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: return "ASYM";
case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: return "SYM";
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: return "SYMM PC";
+ case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: return "FP8";
+ case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: return "FP8 PC";
default:
VSILOGE("Unknown quant type: %d\n", type);
break;
@@ -162,7 +164,9 @@ vsi_bool validate_op_io_types
{
vsi_bool matched = FALSE;
- if(self && self->attr.enable_op_constraint_check) {
+ VSI_UNREFERENCED(name);
+
+ if(self && self->attr.enable_op_constraint_check && op_constraint_reg) {
uint32_t i = 0;
int32_t j = 0;
int32_t reg_tensor_num = op_constraint_reg->reg_input_num + op_constraint_reg->reg_output_num;
@@ -218,14 +222,20 @@ char* generate_op_io_types_desc
char* desc = NULL;
for(i = 0; i < inputs_num; i++) {
- if(inputs[i]) {
+ if (inputs[i] &&
+ _get_qtype_name(inputs[i]->attr.dtype.qnt_type) &&
+ _get_dtype_name(inputs[i]->attr.dtype.vx_type))
+ {
total_sz += snprintf(NULL, 0, "%s %s, ",
_get_qtype_name(inputs[i]->attr.dtype.qnt_type),
_get_dtype_name(inputs[i]->attr.dtype.vx_type));
}
}
for(i = 0; i < outputs_num; i++) {
- if(outputs[i]) {
+ if (outputs[i] &&
+ _get_qtype_name(outputs[i]->attr.dtype.qnt_type) &&
+ _get_dtype_name(outputs[i]->attr.dtype.vx_type))
+ {
total_sz += snprintf(NULL, 0, "%s %s, ",
_get_qtype_name(outputs[i]->attr.dtype.qnt_type),
_get_dtype_name(outputs[i]->attr.dtype.vx_type));
@@ -234,17 +244,24 @@ char* generate_op_io_types_desc
total_sz += 1; /* terminator */
desc = (char*)malloc(sizeof(char) * total_sz);
+ CHECK_PTR_FAIL_GOTO( desc, "Create buffer fail.", final );
memset(desc, 0x00, sizeof(char) * total_sz);
for(i = 0; i < inputs_num; i++) {
- if(inputs[i] && total_sz >= used_sz) {
+ if (inputs[i] && total_sz >= used_sz &&
+ _get_qtype_name(inputs[i]->attr.dtype.qnt_type) &&
+ _get_dtype_name(inputs[i]->attr.dtype.vx_type))
+ {
used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ",
_get_qtype_name(inputs[i]->attr.dtype.qnt_type),
_get_dtype_name(inputs[i]->attr.dtype.vx_type));
}
}
for(i = 0; i < outputs_num; i++) {
- if(outputs[i] && total_sz >= used_sz) {
+ if (outputs[i] && total_sz >= used_sz &&
+ _get_qtype_name(outputs[i]->attr.dtype.qnt_type) &&
+ _get_dtype_name(outputs[i]->attr.dtype.vx_type))
+ {
used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ",
_get_qtype_name(outputs[i]->attr.dtype.qnt_type),
_get_dtype_name(outputs[i]->attr.dtype.vx_type));
@@ -255,6 +272,7 @@ char* generate_op_io_types_desc
desc[used_sz - 2] = '\0';
}
+final:
return desc;
}
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c b/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c
index f64464962..dfabeed95 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c
@@ -4,17 +4,22 @@
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
void * vsi_nn_dlopen( const char *file, int mode )
{
+ VSI_UNREFERENCED(file);
+ VSI_UNREFERENCED(mode);
return NULL;
}
int vsi_nn_dlclose( void *handle )
{
+ VSI_UNREFERENCED(handle);
return -1;
}
__declspec(noinline)
void* vsi_nn_dlsym( void *handle, const char *name )
{
+ VSI_UNREFERENCED(handle);
+ VSI_UNREFERENCED(name);
return NULL;
}
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c
index 18575b716..ac4aa2ab1 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c
@@ -116,6 +116,92 @@ static VSI_INLINE_API void _convert_float_to_bfloat16
}
} /* _convert_float_to_bfloat16 */
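+
+/* FP8 support: the E4M3 and E5M2 helpers below convert quantized buffers
+ * element-wise through fp8_e4m3_to_fp32 / fp8_e5m2_to_fp32 and their
+ * fp32_to_fp8_* counterparts, scaled by the tensor scale; they are dispatched
+ * from the asymm convert entry points further down in this file. */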
+static VSI_INLINE_API vsi_bool _convert_quant_float8_e4m3_to_float
+ (
+ const uint8_t * buffer,
+ size_t size,
+ const float scale,
+ float * out_buffer
+ )
+{
+ uint32_t i = 0;
+ if( !buffer || !out_buffer )
+ {
+ return FALSE;
+ }
+ for( i = 0; i < size; i ++ )
+ {
+ out_buffer[i] = fp8_e4m3_to_fp32( (uint8_t)buffer[i], scale );
+ }
+
+ return TRUE;
+} /* _convert_quant_float8_e4m3_to_float */
+
+static VSI_INLINE_API vsi_bool _convert_float_to_quant_float8_e4m3
+ (
+ const float * buffer,
+ size_t size,
+ const float scale,
+ uint8_t * out_buffer
+ )
+{
+ uint32_t i = 0;
+ if( !buffer || !out_buffer )
+ {
+ return FALSE;
+ }
+ for( i = 0; i < size; i ++ )
+ {
+ out_buffer[i] = fp32_to_fp8_e4m3( buffer[i], scale );
+ }
+
+ return TRUE;
+} /* _convert_float_to_quant_float8_e4m3 */
+
+static VSI_INLINE_API vsi_bool _convert_quant_float8_e5m2_to_float
+ (
+ const uint8_t * buffer,
+ size_t size,
+ const float scale,
+ float * out_buffer
+ )
+{
+ uint32_t i = 0;
+
+ if( !buffer || !out_buffer )
+ {
+ return FALSE;
+ }
+
+ for( i = 0; i < size; i ++ )
+ {
+ out_buffer[i] = fp8_e5m2_to_fp32( (uint8_t)buffer[i], scale );
+ }
+
+ return TRUE;
+} /* _convert_quant_float8_e5m2_to_float */
+
+static VSI_INLINE_API vsi_bool _convert_float_to_quant_float8_e5m2
+ (
+ const float * buffer,
+ size_t size,
+ const float scale,
+ uint8_t * out_buffer
+ )
+{
+ uint32_t i = 0;
+ if( !buffer || !out_buffer )
+ {
+ return FALSE;
+ }
+ for( i = 0; i < size; i ++ )
+ {
+ out_buffer[i] = fp32_to_fp8_e5m2( buffer[i], scale );
+ }
+
+ return TRUE;
+} /* _convert_float_to_quant_float8_e5m2 */
+
#define DEF_DTYPE_CONVERT_QUANTIZE( SRC_NAME, SRC_DTYPE, ROUND, MIN, MAX ) \
vsi_bool vsi_nn_dtype_convert_quantize_##SRC_NAME##_to_float \
( \
@@ -177,6 +263,15 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel
int8_t * out_buffer
)
{
+ VSI_UNREFERENCED(size);
+ VSI_UNREFERENCED(shape);
+ VSI_UNREFERENCED(rank);
+ VSI_UNREFERENCED(scale);
+ VSI_UNREFERENCED(scale_size);
+ VSI_UNREFERENCED(zero_point);
+ VSI_UNREFERENCED(zero_point_size);
+ VSI_UNREFERENCED(channel_dim);
+
if( !buffer || !out_buffer )
{
return FALSE;
@@ -195,6 +290,15 @@ vsi_bool vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float
float * out_buffer
)
{
+ VSI_UNREFERENCED(size);
+ VSI_UNREFERENCED(shape);
+ VSI_UNREFERENCED(rank);
+ VSI_UNREFERENCED(scale);
+ VSI_UNREFERENCED(scale_size);
+ VSI_UNREFERENCED(zero_point);
+ VSI_UNREFERENCED(zero_point_size);
+ VSI_UNREFERENCED(channel_dim);
+
if( !buffer || !out_buffer )
{
return FALSE;
@@ -270,6 +374,12 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm
case I8:
return vsi_nn_dtype_convert_float_to_quantize_symm8(
buffer, size, scale, zero_point, (int8_t*)out_buffer );
+ case FP8_E4M3:
+ return _convert_float_to_quant_float8_e4m3(
+ buffer, size, scale, (uint8_t*)out_buffer );
+ case FP8_E5M2:
+ return _convert_float_to_quant_float8_e5m2(
+ buffer, size, scale, (uint8_t*)out_buffer );
case I16:
return vsi_nn_dtype_convert_float_to_quantize_symm16(
buffer, size, scale, zero_point, (int16_t*)out_buffer );
@@ -423,6 +533,12 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float
case U8:
return vsi_nn_dtype_convert_quantize_asymm8_to_float(
(const uint8_t *)buffer, size, scale, zero_point, out_buffer );
+ case FP8_E4M3:
+ return _convert_quant_float8_e4m3_to_float(
+ (const uint8_t *)buffer, size, scale, out_buffer );
+ case FP8_E5M2:
+ return _convert_quant_float8_e5m2_to_float(
+ (const uint8_t *)buffer, size, scale, out_buffer );
case U16:
return vsi_nn_dtype_convert_quantize_asymm16_to_float(
(const uint16_t*)buffer, size, scale, zero_point, out_buffer);
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c
index 6547f463a..07249e7c4 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c
@@ -408,12 +408,15 @@ vsi_bool vsi_nn_QuantCheck
VSILOGE("input_fl[%d] + weight_fl[%d] != bias_fl[%d]",
input->attr.dtype.fl,
weight->attr.dtype.fl,
- bias->attr.dtype.fl);
+ bias ? bias->attr.dtype.fl : 0);
}
break;
case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC:
case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC:
- if (weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC)
+ case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8:
+ if (weight->attr.dtype.qnt_type ==
+ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC ||
+ weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8)
{
ret = vsi_nn_QuantAffinePerchannelCheck(input, weight, bias);
if(ret == FALSE)
@@ -429,7 +432,7 @@ vsi_bool vsi_nn_QuantCheck
VSILOGE("input_scale[%.12lf] * weight_scale[%.12lf] != bias_scale[%.12lf]",
input->attr.dtype.scale,
weight->attr.dtype.scale,
- bias->attr.dtype.scale);
+ bias ? bias->attr.dtype.scale : 0);
}
}
break;
@@ -468,6 +471,7 @@ vsi_bool vsi_nn_DtypeCompare
break;
case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC:
case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC:
+ case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8:
{
const float diff = (float)1e-5;
if (dtype0->zero_point != dtype1->zero_point)
@@ -484,6 +488,7 @@ vsi_bool vsi_nn_DtypeCompare
}
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC:
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC:
+ case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8:
{
const float diff = (float)1e-5;
int32_t i = 0;
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c b/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c
index b576fc1e6..8a8288d86 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c
@@ -47,7 +47,11 @@ static _binary_tree_t * _new_node
node = (_binary_tree_t *)malloc(
sizeof( _binary_tree_t ) );
- memset( node, 0, sizeof( _binary_tree_t ) );
+ if (node)
+ {
+ memset( node, 0, sizeof( _binary_tree_t ) );
+ }
+
return node;
} /* _new_node() */
@@ -395,6 +399,7 @@ void vsi_nn_hashmap_add
{
iter = (vsi_nn_hashmap_item_t *)vsi_nn_LinkListNewNode(
sizeof( vsi_nn_hashmap_item_t ), NULL );
+ VSI_ASSERT( iter );
key_size = strlen( hash_key ) + 1;
iter->hash_key = (char*)malloc( sizeof(char) * key_size );
VSI_ASSERT( iter->hash_key );
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_link_list.c b/src/tim/vx/internal/src/utils/vsi_nn_link_list.c
index 053e6e9b5..a2401aaf3 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_link_list.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_link_list.c
@@ -27,6 +27,7 @@
#include "vsi_nn_prv.h"
#include "utils/vsi_nn_link_list.h"
#include "vsi_nn_types.h"
+#include "vsi_nn_error.h"
static vsi_nn_link_list_t * _walk_to_start
(
@@ -239,6 +240,7 @@ vsi_nn_link_list_t * vsi_nn_LinkListNewNode
)
{
vsi_nn_link_list_t *node = (vsi_nn_link_list_t *)malloc(sz);
+ CHECK_PTR_FAIL_GOTO( node, "Create node fail.", final );
memset(node, 0, sz);
if(init)
@@ -246,6 +248,7 @@ vsi_nn_link_list_t * vsi_nn_LinkListNewNode
init(node);
}
+final:
return node;
} /* vsi_nn_LinkListNewNode() */
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_math.c b/src/tim/vx/internal/src/utils/vsi_nn_math.c
index b2aae0586..260646da9 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_math.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_math.c
@@ -360,8 +360,11 @@ struct r123array4x32 _philox4x32round(struct r123array4x32 ctr, struct r123array
uint32_t hi1;
uint32_t lo0 = mulhilo32(PHILOX_M4x32_0, ctr.v[0], &hi0);
uint32_t lo1 = mulhilo32(PHILOX_M4x32_1, ctr.v[2], &hi1);
- struct r123array4x32 out = {{hi1^ctr.v[1]^key.v[0], lo1,
- hi0^ctr.v[3]^key.v[1], lo0}};
+ struct r123array4x32 out = { { 0, 0, 0, 0 } };
+ out.v[0] = hi1^ctr.v[1]^key.v[0];
+ out.v[1] = lo1;
+ out.v[2] = hi0^ctr.v[3]^key.v[1];
+ out.v[3] = lo0;
return out;
}
diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c
index e6a766feb..82d1aaaf1 100644
--- a/src/tim/vx/internal/src/utils/vsi_nn_util.c
+++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c
@@ -306,7 +306,7 @@ vsi_size_t vsi_nn_GetStrideSizeBySize
type_bits = vsi_nn_TypeGetBits( type);
stride[0] = type_bits / BITS_PER_BYTE;
total_bytes = stride[0];
- if( type_bits < BITS_PER_BYTE )
+ if( type_bits < BITS_PER_BYTE && type_bits != 0 )
{
total_bytes = 1;
if( size[0] % (BITS_PER_BYTE / type_bits) == 0 )
@@ -375,6 +375,8 @@ float vsi_nn_DataAsFloat32
val = (float)((int8_t*)data)[0];
break;
case VSI_NN_TYPE_UINT8:
+ case VSI_NN_TYPE_FLOAT8_E4M3:
+ case VSI_NN_TYPE_FLOAT8_E5M2:
val = (float)data[0];
break;
case VSI_NN_TYPE_INT16:
@@ -600,6 +602,8 @@ void vsi_nn_ComputePadWithPadType
vsi_size_t * out_pad
)
{
+ VSI_UNREFERENCED(in_dim_num);
+ VSI_UNREFERENCED(rounding);
vsi_nn_compute_padding(in_shape, ksize, stride, NULL, pad_type, out_pad);
} /* vsi_nn_ComputePadWithPadType() */
@@ -651,6 +655,8 @@ void vsi_nn_ComputePadWithPadTypeForConv1D
vsi_size_t * out_pad
)
{
+ VSI_UNREFERENCED(in_dim_num);
+ VSI_UNREFERENCED(rounding);
vsi_nn_compute_padding_conv1d(in_shape, ksize, stride, NULL, pad_type, out_pad);
} /* vsi_nn_ComputePadWithPadTypeForConv1D() */
@@ -708,9 +714,10 @@ vsi_bool vsi_nn_CreateTensorGroup
vsi_size_t end[VSI_NN_MAX_DIM_NUM];
vsi_nn_tensor_attr_t attr;
- if( NULL == graph || NULL == in_tensor
+ if ( NULL == graph || NULL == in_tensor
|| NULL == out_tensors || 0 == group_number
- || 0 == in_tensor->attr.size[axis] )
+ || axis >= VSI_NN_MAX_DIM_NUM ||
+ 0 == in_tensor->attr.size[axis] )
{
VSILOGW( "Create tensor group fail." );
return FALSE;
@@ -733,13 +740,14 @@ vsi_bool vsi_nn_CreateTensorGroup
end[2] = in_tensor->attr.size[2];
end[3] = in_tensor->attr.size[3];
end[axis] = 0;
-
for( i = 0; i < group_number; i ++ )
{
start[axis] = end[axis];
end[axis] += sz;
#ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT
- if ( attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC )
+ if (attr.dtype.qnt_type ==
+ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC ||
+ attr.dtype.qnt_type == VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8)
{
attr.dtype.scales = in_tensor->attr.dtype.scales + sz * i;
attr.dtype.scale_dim = (int32_t)sz;
@@ -835,6 +843,7 @@ int32_t vsi_nn_Mkdir
int32_t mode
)
{
+ VSI_UNREFERENCED(mode);
if(NULL == path)
{
return -1;
@@ -906,6 +915,10 @@ uint8_t * vsi_nn_MallocAlignedBuffer
sz = sizeof(aligned_header) + mem_size +
align_start_size + align_block_size + END_GUARD_SIZE;
raw_addr = (uint8_t *)malloc( sz * sizeof( uint8_t ) );
+ if (raw_addr == NULL)
+ {
+ return NULL;
+ }
memset(raw_addr, 0, sizeof( uint8_t ) * sz);
p = raw_addr + sizeof(aligned_header);
@@ -1175,6 +1188,7 @@ vsi_bool vsi_nn_is_same_quant_type(
break;
case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC:
case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC:
+ case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8:
{
const float diff = (float)1e-5;
if (src_dtype->zero_point != dst_dtype->zero_point)
@@ -1190,6 +1204,7 @@ vsi_bool vsi_nn_is_same_quant_type(
}
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC:
case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC:
+ case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8:
{
const float diff = (float)1e-5;
int32_t i = 0;
@@ -1340,6 +1355,7 @@ float vsi_nn_get_tensor_scale
break;
case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC:
case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC:
+ case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8:
scale = tensor->attr.dtype.scale;
break;
default:
@@ -1359,6 +1375,7 @@ int32_t vsi_nn_get_tensor_zero_point
switch (tensor->attr.dtype.qnt_type)
{
case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC:
+ case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8:
zero_point = 0;
break;
case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC:
@@ -1408,6 +1425,14 @@ void vsi_nn_get_tensor_clamp_min_max
*clampMin = - zero_point;
*clampMax = 65535 - zero_point;
}
+ else if (vx_type == VSI_NN_TYPE_FLOAT8_E4M3) {
+ *clampMin = -448;
+ *clampMax = 448;
+ }
+ else if (vx_type == VSI_NN_TYPE_FLOAT8_E5M2) {
+ *clampMin = -57344;
+ *clampMax = 57344;
+ }
else
{
uint32_t f32_min = 0xff800000;
diff --git a/src/tim/vx/internal/src/vip/virtual_device.cpp b/src/tim/vx/internal/src/vip/virtual_device.cpp
index 88a146a83..2efa849cc 100644
--- a/src/tim/vx/internal/src/vip/virtual_device.cpp
+++ b/src/tim/vx/internal/src/vip/virtual_device.cpp
@@ -30,7 +30,7 @@ namespace vip {
Device::Device(uint32_t id) {
id_ = id;
graphqueue_ = std::make_unique<GraphQueue> ();
- worker_ = std::make_unique<Worker> ();;
+ worker_ = std::make_unique<Worker> ();
ThreadInit();
}
@@ -63,6 +63,9 @@ bool Device::ThreadExit() {
bool Device::GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data) {
bool status = false;
+ idle_mtx_.lock();
+ submit_num_++;
+ idle_mtx_.unlock();
status = graphqueue_->Submit(graph, func, data);
return status;
}
@@ -72,8 +75,10 @@ bool Device::GraphRemove(const vsi_nn_graph_t* graph) {
}
void Device::WaitThreadIdle() {
- ThreadExit();
- ThreadInit();
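+ // Instead of tearing the worker thread down and recreating it, block until
+ // every graph submitted through GraphSubmit() has been handled; submit_num_
+ // is decremented in HandleQueue(), which signals cv_.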
+ std::unique_lock<std::mutex> lock(idle_mtx_);
+ while (submit_num_ > 0) {
+ cv_.wait(lock);
+ }
}
Worker::Worker() {
@@ -108,6 +113,11 @@ void Device::HandleQueue() {
break;
}
worker_->Handle(item); // run graph
+
+ idle_mtx_.lock();
+ submit_num_--;
+ idle_mtx_.unlock();
+ cv_.notify_one();
}
}
diff --git a/src/tim/vx/internal/src/vip/virtual_device_private.h b/src/tim/vx/internal/src/vip/virtual_device_private.h
index ed4c6bb68..b0e39a0cc 100644
--- a/src/tim/vx/internal/src/vip/virtual_device_private.h
+++ b/src/tim/vx/internal/src/vip/virtual_device_private.h
@@ -28,8 +28,8 @@
#include
#include
#include