diff --git a/prebuilt-sdk/x86_64_linux/VERSION b/prebuilt-sdk/x86_64_linux/VERSION index 79d5c1795..91123ad1e 100644 --- a/prebuilt-sdk/x86_64_linux/VERSION +++ b/prebuilt-sdk/x86_64_linux/VERSION @@ -1 +1 @@ -6.4.14_CL650117A_D650117_A648302_R647402_T648811_O646970 \ No newline at end of file +6.4.15_CL690884A_D690855_A690484_R690194_T690259_O688896 \ No newline at end of file diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h index 48f824f65..c49800a9f 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_api.h @@ -1340,6 +1340,21 @@ VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeCallback(vx_node node, vx_nodecom */ VX_API_ENTRY vx_nodecomplete_f VX_API_CALL vxRetrieveNodeCallback(vx_node node); +/*! \brief Assigns a callback to a node. + * If a callback already exists in this node, this function must return an error + * and the user may clear the callback by passing a NULL pointer as the callback. + * \param [in] node The reference to the node. + * \param [in] callback The callback to associate with completion of this + * specific node. + * \warning This must be used with extreme caution as it can \e ruin + * optimizations in the power/performance efficiency of a graph. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Callback assigned; any other value indicates failure. + * \retval VX_ERROR_INVALID_REFERENCE node is not a valid \ref vx_node reference. + * \ingroup group_node_callback + */ +VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeQueryCallback(vx_node node, vx_nodequery_f callback); + /*! \brief Sets the node target to the provided value. A success invalidates the graph * that the node belongs to (\ref vxVerifyGraph must be called before the next execution) * \param [in] node The reference to the \ref vx_node object. 
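The new vxAssignNodeQueryCallback entry point follows the same assign/clear contract as vxAssignNodeCallback. Below is a minimal, hedged usage sketch; the helper name is hypothetical, and the vx_nodequery_f typedef is not shown in this hunk, so the callback is treated here as an opaque value.

```c
#include <VX/vx.h>

/* Hypothetical helper: register a query callback on a node, clearing any
 * previously registered callback if the initial assignment is rejected. */
static vx_status attach_query_callback(vx_node node, vx_nodequery_f callback)
{
    vx_status status = vxAssignNodeQueryCallback(node, callback);
    if (status != VX_SUCCESS)
    {
        /* Per the documentation above, assignment fails when a callback already
         * exists; passing NULL clears it, after which the assignment can be retried. */
        if (vxAssignNodeQueryCallback(node, NULL) == VX_SUCCESS)
        {
            status = vxAssignNodeQueryCallback(node, callback);
        }
    }
    return status;
}
```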
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h index d35396074..8a2ac76b1 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h @@ -503,6 +503,40 @@ enum vx_kernel_e { VX_KERNEL_NN_BATCH_GEMM_RELU_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x33, + VX_KERNEL_NN_FUSED_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x34, + + VX_KERNEL_NN_CONVOLUTION_RELU_POOLING_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x35, + + VX_KERNEL_NN_LAYER_NORMALIZATION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x36, + + VX_KERNEL_NN_INSTANCE_NORMALIZATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x37, + + VX_KERNEL_NN_GROUP_NORMALIZATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x38, + + VX_KERNEL_NN_LOGICAL_OPS_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x39, + + VX_KERNEL_NN_LOGICAL_NOT_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x40, + + VX_KERNEL_NN_RELATIONAL_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x41, + + VX_KERNEL_NN_TENSOR_REDUCE_MAX = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x42, + + VX_KERNEL_NN_MAXIMUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x43, + + VX_KERNEL_NN_MINIMUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x44, + + VX_KERNEL_NN_TENSOR_SELECT_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x45, + + VX_KERNEL_NN_REDUCE_SUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x46, + + VX_KERNEL_NN_GRU_CELL_ACTIVATION_Z_H_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x47, + + VX_KERNEL_NN_GRU_CELL_H_TIMES_ACTIVATION_R_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x48, + + VX_KERNEL_NN_GRU_CELL_RESET_AFTER_ACTIVATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x49, + + VX_KERNEL_NN_LSTM_ACTIVATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x50, + VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */ }; diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h index f3f019113..ec5d069ed 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h @@ -214,7 +214,7 @@ VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can suppor 1: support */ #ifndef VX_STREAM_PROCESSOR_SUPPORT -#define VX_STREAM_PROCESSOR_SUPPORT 0 +#define VX_STREAM_PROCESSOR_SUPPORT 1 #endif /* @@ -258,5 +258,144 @@ VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can suppor #define VX_ACTIVATION_EXT2_SUPPORT 1 #endif +/* + VX_TENSORVIEW_ON_ANY_DIM is used to declare that ovxlib can do optimization for all concat node(all dimision) to tensor view if possiable, not only channel. + [value] + 0: disable + 1: enable +*/ +#ifndef VX_TENSORVIEW_ON_ANY_DIM +#define VX_TENSORVIEW_ON_ANY_DIM 0 +#endif + +/* +VX_DEPTH2SPACE_CRD_MODE_SUPPORT is used to declare that SPACE2DEPTH can support CRD mode + [value] + 0: not support + 1: support +*/ +#ifndef VX_DEPTH2SPACE_CRD_MODE_SUPPORT +#define VX_DEPTH2SPACE_CRD_MODE_SUPPORT 1 +#endif + +/* + VX_LAYER_NORMALIZATION_VX_SUPPORT is used to declare driver support layer normalization layer. 
+ [value] + 0: not support + 1: support +*/ +#ifndef VX_LAYER_NORMALIZATION_VX_SUPPORT +#define VX_LAYER_NORMALIZATION_VX_SUPPORT 1 +#endif + +/* + VX_INSTANCE_NORMALIZATION_VX_SUPPORT is used to declare that the driver supports the instance normalization layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_INSTANCE_NORMALIZATION_VX_SUPPORT +#define VX_INSTANCE_NORMALIZATION_VX_SUPPORT 1 +#endif + +/* + VX_GROUP_NORMALIZATION_VX_SUPPORT is used to declare that the driver supports the group normalization layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_GROUP_NORMALIZATION_VX_SUPPORT +#define VX_GROUP_NORMALIZATION_VX_SUPPORT 1 +#endif + +/* + VX_LOGICAL_VX_SUPPORT is used to declare that the driver supports the logical operation layers. + [value] + 0: not support + 1: support +*/ +#ifndef VX_LOGICAL_VX_SUPPORT +#define VX_LOGICAL_VX_SUPPORT 1 +#endif + +/* + VX_RELATIONAL_OPS_VX_SUPPORT is used to declare that the driver supports the relational operation layers. + [value] + 0: not support + 1: support +*/ +#ifndef VX_RELATIONAL_OPS_VX_SUPPORT +#define VX_RELATIONAL_OPS_VX_SUPPORT 1 +#endif + +/* + VX_REDUCE_MAX_VX_SUPPORT is used to declare that the driver supports the reduce max layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_REDUCE_MAX_VX_SUPPORT +#define VX_REDUCE_MAX_VX_SUPPORT 1 +#endif + +/* + VX_REDUCE_MEAN_VX_SUPPORT is used to declare that the driver supports the reduce mean layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_REDUCE_MEAN_VX_SUPPORT +#define VX_REDUCE_MEAN_VX_SUPPORT 1 +#endif + +/* + VX_REDUCE_SUM_VX_SUPPORT is used to declare that the driver supports the reduce sum layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_REDUCE_SUM_VX_SUPPORT +#define VX_REDUCE_SUM_VX_SUPPORT 1 +#endif + +/* + VX_MAX_MIN_IMUM_VX_SUPPORT is used to declare that the driver supports the maximum and minimum layers. + [value] + 0: not support + 1: support +*/ +#ifndef VX_MAX_MIN_IMUM_VX_SUPPORT +#define VX_MAX_MIN_IMUM_VX_SUPPORT 1 +#endif + +/* + VX_TENSOR_SELECT_VX_SUPPORT is used to declare that the driver supports the tensor select layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_TENSOR_SELECT_VX_SUPPORT +#define VX_TENSOR_SELECT_VX_SUPPORT 1 +#endif + +/* + VX_GRU_CELL_VX_SUPPORT is used to declare that the driver supports the GRU cell layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_GRU_CELL_VX_SUPPORT +#define VX_GRU_CELL_VX_SUPPORT 1 +#endif + +/* + VX_LSTM_ACTIVATION_SUPPORT is used to declare that the driver supports the LSTM activation layer. + [value] + 0: not support + 1: support +*/ +#ifndef VX_LSTM_ACTIVATION_SUPPORT +#define VX_LSTM_ACTIVATION_SUPPORT 1 +#endif #endif /* __VX_KHR_COMPATIBLE_H__ */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h index a43a37ec2..49472870d 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h @@ -395,6 +395,17 @@ enum vx_tensor_lifetime_type_e VX_TENSOR_LIFE_TIME_DYNAMIC, }; +/*! \brief Specifies the depth-to-space mode + * \ingroup group_cnn + */ +enum vx_nn_depth_to_space_mode_e +{ + /*! \brief DCR(default) for depth-column-row order re-arrangement */ + VX_NN_DEPTH_TO_SPACE_DCR = 0x0, + /*! \brief CRD for column-row-depth order re-arrangement */ + VX_NN_DEPTH_TO_SPACE_CRD, +}; + typedef struct _vx_nn_convolution_3d_params_t { vx_int32 padding_w_left; /*!< \brief Number of elements added at each side in the left of w dimension of the input. 
*/ @@ -972,6 +983,16 @@ typedef struct _vx_nn_mean_params_t vx_int32 keep_dims; /*!< \brief Keep dims, if positive, retains reduced dims with length 1 */ } vx_nn_mean_params_t; +/*! \brief Input parameter for reducesum layer +* \ingroup group_cnn +*\version 0.5 +*/ +typedef struct _vx_nn_sum_params_t +{ + vx_tensor axis; /*!< \brief 1D axis tensor of reduce dims */ + vx_int32 keep_dims; /*!< \brief Keep dims, if positive, retains reduced dims with length 1 */ +} vx_nn_sum_params_t; + /*! \brief Input parameter for tensor squeeze layer * \ingroup group_cnn *\version 0.5 @@ -1254,6 +1275,12 @@ typedef struct _vx_nn_reorg_params_ext2_t vx_int32 *axis; } vx_nn_reorg_params_ext2_t; +typedef struct _vx_nn_reorg_params_ext3_t +{ + vx_nn_reorg_params_ext2_t base; /*!< \brief vx_nn_reorg_params \ref vx_nn_reorg_params_t */ + vx_enum mode; /*!< \brief [Optional] Only for DEPH2SPACE */ +} vx_nn_reorg_params_ext3_t; + /*! \brief [Graph] Creates a Reorgnization Layer Node, Enhancement of vxReorgLayer, Support both DEPTH to SPACE and SPACE to DEPTH. * \param [in] graph The reference to the parent graph. * \param [in] input The input tensor data to reorg. @@ -1911,6 +1938,21 @@ VX_API_ENTRY vx_node VX_API_CALL vxRPNLayer( vx_tensor score_output ); +/*! \brief Input parameters for a lstm activation operation. + * \ingroup group_cnn + * \version 0.3 + */ +typedef struct _vx_nn_lstm_activation_params_t +{ + vx_int32 is_ln; + vx_int32 is_cifg; + vx_int32 is_proj; + vx_int32 is_hybrid; + vx_int32 is_peephole; + vx_int32 recurrent_activation; + vx_float32 forget_bias; +} vx_nn_lstm_activation_params_t; + /*! \brief Input parameters for a lstm operation. * \ingroup group_cnn * \version 0.3 @@ -2115,6 +2157,28 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorMeanNode( vx_size size_of_mean_param, vx_tensor outputs); +/*! \brief [Graph] Creates sum layer node. +* \details +* Computes the sum of elements across dimensions of a tensor. +* +* \param [in] graph The handle to the graph. +* \param [in] input A n-D tensor, specifying the input. +* \param [in] sum_params paraments \ref vx_nn_sum_params_t . +* \param [in] size_of_sum_param [static] The size of the vx_nn_mean_params_t. +* \param [out] output A n-D tensor of the same type as input. +* \return vx_node. +* \returns A node reference \ref vx_node. Any possible errors preventing a +* successful creation should be checked using \ref vxGetStatus. +* \ingroup group_tensor +* \version 0.5 +*/ +VX_API_ENTRY vx_node VX_API_CALL vxReduceSumNode( + vx_graph graph, + vx_tensor inputs, + const vx_nn_sum_params_t *sum_params, + vx_size size_of_sum_param, + vx_tensor outputs); + /*! \brief [Graph] Creates squeeze layer node. * \details * Remove dimensions of size 1 from the input tensor. @@ -2287,6 +2351,282 @@ VX_API_ENTRY vx_node VX_API_CALL vxConv3dLayer(vx_graph graph, vx_tensor inputs, */ VX_API_ENTRY vx_node VX_API_CALL vxDeconv3dLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_deconvolution_3d_params_t *convolution_params, vx_size size_of_deconv_params, vx_tensor outputs); +/*! \brief [Graph] Creates a layer Normalization Node. + * \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1. + * \param [in] graph The handle to the graph. + * \param [in] eps [static] Float 32. 
Small value to add to the variance estimate so that we don't divide by zero.(default is 1e-5) + * \param [in] axis [static] The axis on which we need do normalize. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxLayerNormalizationLayer( + vx_graph graph, + vx_float32 eps, + vx_int32 axis, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer instance normalization Node. + * \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1. + * \param [in] graph The handle to the graph. + * \param [in] eps [static] Float 32. Small value to add to the variance estimate so that we don't divide by zero.(default is 1e-5) + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxInstanceNormalizationLayer( + vx_graph graph, + vx_float32 eps, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer instance normalization Node. + * \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1. + * \param [in] graph The handle to the graph. + * \param [in] eps [static] Float 32. Small value to add to the variance estimate so that we don't divide by zero.(default is 1e-5) + * \param [in] group_num [static] Int 32. Number of groups for GN + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxGroupNormalizationLayer( + vx_graph graph, + vx_float32 eps, + vx_int32 group_num, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer logical ops Node. + * \details Return the truth value of x AND, XOR,OR y element-wise. + * \param [in] graph The handle to the graph. + * \param [in] ops_type [static] Int 32. Operation Type + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. 
+ * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxLogicalOpsLayer( + vx_graph graph, + vx_int32 ops_type, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer logical not Node. + * \details Return the truth value of not x element-wise. + * \param [in] graph The handle to the graph. + * \param [in] input [static] The input tensor data. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxLogicalNotLayer( + vx_graph graph, + vx_tensor input, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer relational Node. + * \param [in] graph The handle to the graph. + * \param [in] ops_type [static] Int 32. Operation Type + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxRelationalLayer( + vx_graph graph, + vx_int32 ops_type, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Computes the max of elements across dimensions of input tensor. +* \param [in] graph The handle to the graph. +* \param [in] in input tensor data, +* \param [in] axis [static] used to determine max across which dimension(dimension 0 means width, etc). If not given, compute the sum across all dimensions. +* \param [in] keep_dim [static] means if keep the dimesion count. +* \param [out] out output tensor data. +* \ingroup group_tensor +* \return vx_node. +* \retval 0 Node could not be created. +* \retval * Node handle. +* \version 0.3 +*/ +VX_API_ENTRY vx_node VX_API_CALL vxTensorReduceMaxNode( + vx_graph graph, + vx_tensor inputs, + vx_tensor axis, + vx_bool keep_dims, + vx_tensor outputs); + +/*! \brief [Graph] Creates a layer minumum Node. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxMinimumLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer maximum Node. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxMaximumLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer select Node. + * \param [in] graph The handle to the graph. 
+ * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [out] output [static] The output tensor data. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxTensorSelectLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor output + ); + +/*! \brief [Graph] Creates a layer gru cell activation z h Node. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [in] recurrent_activation [static] recurrent activation type. + * \param [in] activation [static] activation type. + * \param [out] output_list [static] The output tensor data. + * \param [out] output_count [static] The output tensor number. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxGruCellActivationZHLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_int32 recurrent_activation, + vx_int32 activation, + vx_tensor* output_list, + vx_uint32 output_count + ); + +/*! \brief [Graph] Creates a layer gru cell h times activation r Node. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [in] recurrent_activation [static] recurrent activation type. + * \param [out] output_list [static] The output tensor data. + * \param [out] output_count [static] The output tensor number. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxGruCellHTimeActivationRLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_int32 recurrent_activation, + vx_tensor* output_list, + vx_uint32 output_count + ); + +/*! \brief [Graph] Creates a layer gru cell reset after activationNode. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [in] recurrent_activation [static] recurrent activation type. + * \param [in] activation [static] activation type. + * \param [out] output_list [static] The output tensor data. + * \param [out] output_count [static] The output tensor number. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxGruCellResetAfterActivationLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_int32 recurrent_activation, + vx_int32 activation, + vx_tensor* output_list, + vx_uint32 output_count + ); + +/*! \brief [Graph] Creates a layer lstm activation Node. + * \param [in] graph The handle to the graph. + * \param [in] input_list [static] The input tensor data. + * \param [in] input_count [static] The input tensor number. + * \param [in] lstm_activation_param \ref vx_nn_lstm_activation_params_t . 
+ * \param [out] output_list [static] The output tensor data. + * \param [out] output_count [static] The output tensor number. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxLSTMActivationLayer( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + const vx_nn_lstm_activation_params_t * lstm_activation_param, + vx_tensor* output_list, + vx_uint32 output_count + ); #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h index 6570e1d81..e824d55a7 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h @@ -242,6 +242,48 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext7_t vx_bool isSub; } vx_nn_convolution_relu_pooling_params_ext7_t, * vx_nn_convolution_relu_pooling_params_ext7; +typedef struct _vx_nn_fused_sp_params_t +{ + vx_enum multi_sp_kernel_type; + /*!*/ + vx_scalar mul_scale; + /*!*/ + union + { + struct + { + vx_scalar linear_a, linear_b; + } linear; + struct + { + vx_scalar tanh_a, tanh_b; + float a_v, b_v; + } tanh_linear; + struct + { + vx_scalar hsigmoid_a, hsigmoid_b; + } hsigmoid; + struct + { + vx_scalar clip_a, clip_b; + } clip; + struct + { + vx_scalar scalar_a, scalar_b, scalar_c, scalar_d; + } params; + } scalar_params; + /*!*/ +} vx_nn_fused_sp_params_t, * vx_nn_fused_sp_params; + +typedef struct _vx_nn_convolution_relu_pooling_params_sp_ext_t +{ + vx_nn_convolution_relu_pooling_params_ext4_t ext4; /*!< \brief convolution relu pooling params \ref vx_nn_convolution_relu_pooling_params_ext_t */ + vx_object_array inputs_list; + vx_object_array outputs_list; + vx_nn_fused_sp_params_t sp_param; + +} vx_nn_convolution_relu_pooling_params_sp_ext_t, * vx_nn_convolution_relu_pooling_params_sp_ext; + /*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and Pooling Layer Node, this fucntion match kronos NN Extension 1.2 verion. * \details This function implement Convolutional Network Convolution and Activation(Relu) and Pooling layer. * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined, @@ -1129,6 +1171,48 @@ VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmReluPoolingLayer(vx_graph graph, const vx_nn_gemm_relu_pooling_params merge_param, vx_tensor output); +/*! \brief Create a fuse stream process node. + * \param [in] graph The handle to the graph. + * \param [in] input_list input tensor list. + * \param [in] input_count input tensor number. + * \param [in] output_list output tensor list. + * \param [in] output_count output tensor number. + * \param [in] params the parameters for multi streamprocessor merging. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation + * should be checked using \ref vxGetStatus + * \ingroup group_vision_function_sp + */ +VX_API_ENTRY vx_node VX_API_CALL vxFusedSpNode( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor* output_list, + vx_uint32 output_count, + const vx_nn_fused_sp_params_t * params + ); + +/*! \brief Create a conv fuse stream process node. + * \param [in] graph The handle to the graph. 
+ * \param [in] inputs input tensor. + * \param [in] weights_biases [static] Point to WeightBiasesParameter data, vx_weights_biases_parameter is an opaque reference. + * \param [in] convolution_relu_pooling_params [static] Pointer to parameters of type \ref vx_nn_convolution_relu_pooling_params_t + * \param [in] size_of_convolution_relu_pooling_params [static] Size in bytes of convolution_relu_pooling_params. + * \param [in] outputs output tensor. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation + * should be checked using \ref vxGetStatus + * \ingroup group_vision_function_sp + */ +VX_API_ENTRY vx_node VX_API_CALL vxConvSpNode( + vx_graph graph, + vx_tensor inputs, + vx_weights_biases_parameter weights_biases, + const vx_nn_convolution_relu_pooling_params_t * convolution_relu_pooling_params, + vx_size size_of_convolution_relu_pooling_params, + vx_tensor outputs +); + #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h index 36df37487..38d2223a4 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h @@ -345,16 +345,6 @@ VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINST( vx_context context ); -/*! \brief Creates an internal reference to a spinst data. - * \param [in] context The reference to the implementation context. - * \return A spinst data reference. - * \Any possible errors preventing a successful creation should be checked using \ref vxGetStatus. - * \ingroup group_object_spinst - */ -VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINSTInternal( - vx_context context - ); - /*! \brief Releases a reference to a external spinst object. * The object may not be garbage collected until its total reference count is zero. * \param [in] spinst_obj The pointer to the spinst data to release. @@ -368,19 +358,6 @@ VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINST( vx_spinst *spinst_obj ); -/*! \brief Releases a reference to a internal spinst object. - * The object may not be garbage collected until its total reference count is zero. - * \param [in] spinst_obj The pointer to the spinst data to release. - * \post After returning from this function the reference is zeroed. - * \return A \ref vx_status_e enumeration. - * \retval VX_SUCCESS No errors; all other values indicate failure - * \retval * An error occurred. See \ref vx_status_e. - * \ingroup group_object_spinst - */ -VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINSTInternal( - vx_spinst *spinst_obj - ); - /*! \brief Add a instruction to spinst object. * \param [in] spinst_obj The reference to the spinst object. * \param [in] inst_unit_array The units of one instruction. Use a \ref vx_spinst_unit_param. diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h index 6f75ea9db..eefa39ce5 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h @@ -477,6 +477,8 @@ enum vx_type_e { VX_TYPE_SPINST = 0x81B,/*!< \brief A \ref vx_spinst. */ VX_TYPE_INT4 = 0x81C,/*!< \brief A \ref signed 4bits tensor.. */ VX_TYPE_UINT4 = 0x81D,/*!< \brief A \ref unsigned 4bits tensor.. */ + VX_TYPE_FLOAT8_E4M3 = 0x81E,/*!< \brief A \ref vx_float8_e4m3. */ + VX_TYPE_FLOAT8_E5M2 = 0x81F,/*!< \brief A \ref vx_float8_e5m2. */ }; /*! \brief The enumeration of all status codes. 
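Both FP8 additions, VX_TYPE_FLOAT8_E4M3 and VX_TYPE_FLOAT8_E5M2, are byte-sized element types (the ovxlib dtype helpers later in this patch report 1 byte / 8 bits for them). The sketch below shows how host code might account for the new enums when sizing buffers; element_size_bytes is a hypothetical helper, not part of the SDK headers.

```c
#include <stddef.h>
#include <VX/vx_types.h>

/* Hypothetical helper: host-side element size for a vx_type_e value, with the
 * new FP8 formats (1 sign + 4/5 exponent + 3/2 mantissa bits) handled as one byte. */
static size_t element_size_bytes(vx_enum type)
{
    switch (type)
    {
        case VX_TYPE_FLOAT8_E4M3:
        case VX_TYPE_FLOAT8_E5M2:
        case VX_TYPE_INT8:
        case VX_TYPE_UINT8:
            return 1;
        case VX_TYPE_INT16:
        case VX_TYPE_UINT16:
            return 2;
        case VX_TYPE_FLOAT32:
        case VX_TYPE_INT32:
        case VX_TYPE_UINT32:
            return 4;
        default:
            return 0; /* types not covered by this sketch */
    }
}
```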
@@ -803,6 +805,8 @@ enum vx_convert_policy_e { VX_CONVERT_POLICY_WRAP = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x0, /*! \brief Results are saturated to the bit depth of the output operand. */ VX_CONVERT_POLICY_SATURATE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x1, + /*! \brief Results preserve infinity and nan value. */ + VX_CONVERT_POLICY_INF = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_CONVERT_POLICY) + 0x0, }; /*! \brief Based on the VX_DF_IMAGE definition. diff --git a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so index 0e2036813..40b91d016 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so and b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libCLC.so b/prebuilt-sdk/x86_64_linux/lib/libCLC.so index 9c8839038..a50839e36 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libCLC.so and b/prebuilt-sdk/x86_64_linux/lib/libCLC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so index 96a5ab43d..201f51c15 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so and b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libGAL.so b/prebuilt-sdk/x86_64_linux/lib/libGAL.so index 06525dac1..fa303327d 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libGAL.so and b/prebuilt-sdk/x86_64_linux/lib/libGAL.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so index 1566bab34..fee4a57db 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so and b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 index 71f33843a..b8a0d961d 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so index 9b7e0caf8..cfa02ae3a 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libVSC.so b/prebuilt-sdk/x86_64_linux/lib/libVSC.so index 1bafe16b3..e482f3097 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libVSC.so and b/prebuilt-sdk/x86_64_linux/lib/libVSC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so b/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so index 628f663a4..0deaff134 100644 Binary files a/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so and b/prebuilt-sdk/x86_64_linux/lib/libvdtproxy.so differ diff --git a/src/tim/vx/internal/include/custom/custom_node_type.def b/src/tim/vx/internal/include/custom/custom_node_type.def index 90d772799..c5ef3e04a 100644 --- a/src/tim/vx/internal/include/custom/custom_node_type.def +++ b/src/tim/vx/internal/include/custom/custom_node_type.def @@ -6,3 +6,6 @@ DEF_NODE_TYPE(custom_ainr_denoise_postprocess) DEF_NODE_TYPE(custom_warp_affine) DEF_NODE_TYPE(custom_warp_perspective) DEF_NODE_TYPE(custom_sample) +DEF_NODE_TYPE(custom_tiny_yolov4_postprocess) +DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence) +DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box) diff --git a/src/tim/vx/internal/include/custom/custom_ops.def b/src/tim/vx/internal/include/custom/custom_ops.def index 00504392c..2074b8f30 100644 --- 
a/src/tim/vx/internal/include/custom/custom_ops.def +++ b/src/tim/vx/internal/include/custom/custom_ops.def @@ -6,3 +6,6 @@ DEF_OP(CUSTOM_AINR_DENOISE_POSTPROCESS) DEF_OP(CUSTOM_WARP_AFFINE) DEF_OP(CUSTOM_WARP_PERSPECTIVE) DEF_OP(CUSTOM_SAMPLE) +DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS) +DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE) +DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX) diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h new file mode 100644 index 000000000..5234d56d6 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_H +#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_param +{ + struct _custom_tiny_yolov4_postprocess_local_data_t* local; + // Add parameters here +} vsi_nn_custom_tiny_yolov4_postprocess_param; +_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_param, local) == 0, \ + vsi_nn_custom_tiny_yolov4_postprocess_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h new file mode 100644 index 000000000..854c3a9e1 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX_H +#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_box_param +{ + struct _custom_tiny_yolov4_postprocess_box_local_data_t* local; + // Add parameters here + float bias_0; + float bias_1; +} vsi_nn_custom_tiny_yolov4_postprocess_box_param; +_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_box_param, local) == 0, \ + vsi_nn_custom_tiny_yolov4_postprocess_box_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h new file mode 100644 index 000000000..181595289 --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_H +#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_confidence_param +{ + struct _custom_tiny_yolov4_postprocess_confidence_local_data_t* local; + // Add parameters here +} vsi_nn_custom_tiny_yolov4_postprocess_confidence_param; +_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_confidence_param, local) == 0, \ + vsi_nn_custom_tiny_yolov4_postprocess_confidence_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h index 815a064fc..adf769f7f 100644 --- a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_warp_affine.h @@ -38,6 +38,7 @@ typedef struct _vsi_nn_custom_warp_affine_param const float *matrix; vsi_enum type; int32_t size[2]; + vsi_enum rgb_type; } vsi_nn_custom_warp_affine_param; _compiler_assert(offsetof(vsi_nn_custom_warp_affine_param, local) == 0, \ vsi_nn_custom_warp_affine_h ); diff --git a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h index 8976be307..eb23a2055 100644 --- a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h +++ b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h @@ -31,5 +31,8 @@ #include "custom/ops/vsi_nn_op_custom_warp_affine.h" #include "custom/ops/vsi_nn_op_custom_warp_perspective.h" #include "custom/ops/vsi_nn_op_custom_sample.h" +#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h" +#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h" +#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h" #endif diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 82d843fc5..0753df06d 100755 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -193,3 +193,4 @@ DEF_OP(REVERSESEQUENCE) DEF_OP(INVERSE_SIGMOID) DEF_OP(GRID_SAMPLE) DEF_OP(LPNORM) +DEF_OP(RESIZE_3D) diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def old mode 100755 new mode 100644 index de3332709..a47559a3a --- a/src/tim/vx/internal/include/internal/internal_ops.def +++ b/src/tim/vx/internal/include/internal/internal_ops.def @@ -20,4 +20,3 @@ DEF_OP(SPACE2DEPTH_INTERNAL) DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R) DEF_OP(GRUCELL_ACTIVATION_Z_H) DEF_OP(REDUCE_MEAN_INTERNAL) -DEF_OP(BILINEAR_GRID_SAMPLE) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index c118e137f..5150b0e4a 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -79,6 +79,8 @@ typedef enum BOOL8, I4, U4, + FP8_E4M3, + FP8_E5M2, } VSI_PUBLIC_TYPE vsi_nn_kernel_dtype_e; typedef enum @@ -89,6 +91,8 @@ typedef enum VSI_NN_KERNEL_QUANT_ASYMM_PERCHANNEL, VSI_NN_KERNEL_QUANT_SYMM, VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL, + VSI_NN_KERNEL_QUANT_FLOAT8, + VSI_NN_KERNEL_QUANT_FLOAT8_PERCHANNEL, VSI_NN_KERNEL_QUANT_TYPE_NUM } vsi_nn_kernel_quant_type_e; @@ -522,6 +526,10 
@@ static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype return BF16; case VSI_NN_TYPE_FLOAT32: return F32; + case VSI_NN_TYPE_FLOAT8_E4M3: + return FP8_E4M3; + case VSI_NN_TYPE_FLOAT8_E5M2: + return FP8_E5M2; default: VSILOGE("error data type %d", dtype); break; @@ -579,6 +587,8 @@ static VSI_INLINE_API size_t vsi_nn_kernel_dtype_get_bytes case I8: case U8: case BOOL8: + case FP8_E4M3: + case FP8_E5M2: return sizeof(int8_t); case I16: case U16: @@ -611,6 +621,8 @@ static VSI_INLINE_API vsi_size_t vsi_nn_kernel_dtype_get_bits case I8: case U8: case BOOL8: + case FP8_E4M3: + case FP8_E5M2: return 8; case I16: case U16: @@ -879,7 +891,7 @@ static VSI_INLINE_API void vsi_nn_kernel_tensor_attr_get_stride shape = attr->shape->data; type_bits = vsi_nn_kernel_dtype_get_bits( attr->dtype ); - if ( type_bits < BITS_PER_BYTE ) + if ( type_bits < BITS_PER_BYTE && type_bits != 0) { vsi_size_t i; diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h index cfecfd1fe..c834d040e 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h @@ -91,4 +91,21 @@ vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, vsi_size_t max_size ); +vsi_bool vsi_nn_kernel_optimize_matrixmul_broadcast_shape + ( + const vsi_size_t * shape_x, + const vsi_size_t * shape_y, + const vsi_size_t * shape_output, + vsi_size_t rank_x, + vsi_size_t rank_y, + vsi_size_t rank_out, + vsi_size_t* out_shape_x, + vsi_size_t* out_shape_y, + vsi_size_t* out_shape_output, + uint32_t* new_rank, + uint32_t* cross_flg, + uint32_t* size_axis_inner_outer, + uint32_t* strides_axis_inner_outer + ); + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h index 3f614139a..749a432e7 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h @@ -82,6 +82,12 @@ typedef struct _vsi_nn_pre_process_param vsi_nn_pre_process_type_e type; + struct + { + float mean[3]; + float scale[3]; + } norm2; + vsi_nn_pre_process_lcl_data *local; } vsi_nn_pre_process_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h index d01fba846..d2772b5c1 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h @@ -65,6 +65,10 @@ typedef struct _vsi_nn_pre_process_bgra_param vsi_bool reverse_channel; + float r_scale; + float g_scale; + float b_scale; + /* pre process rgb layer local data structure */ vsi_nn_pre_process_bgra_lcl_data local; } vsi_nn_pre_process_bgra_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h index aa8fc820f..34c5a6de6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h @@ -70,6 +70,10 @@ typedef struct _vsi_nn_pre_process_nv12_param vsi_nn_pre_process_nv12_lcl_data* local; vsi_nn_nv_type nv_type; + + float r_scale; + float g_scale; + float b_scale; } vsi_nn_pre_process_nv12_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h 
b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h index da52fa0d2..9e05a5966 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h @@ -76,6 +76,9 @@ typedef struct _vsi_nn_pre_process_rgb_param vsi_bool reverse_channel; + float r_scale; + float g_scale; + float b_scale; /* pre process rgb layer local data structure */ vsi_nn_pre_process_rgb_lcl_data local; } vsi_nn_pre_process_rgb_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h index f384e4fb3..171df70c3 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb888_planar.h @@ -53,6 +53,15 @@ typedef struct _vsi_nn_pre_process_rgb888_planar_param float g_mean; float b_mean; float scale; + + + vsi_bool reverse_channel; + vsi_bool enable_rgb88_planar_nhwc; + + float r_scale; + float g_scale; + float b_scale; + } vsi_nn_pre_process_rgb888_planar_param; _compiler_assert(offsetof(vsi_nn_pre_process_rgb888_planar_param, local) == 0, \ vsi_nn_pre_process_rgb888_planar_h ); diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h index 998de5ee2..2ceabcb75 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h @@ -66,6 +66,11 @@ typedef struct _vsi_nn_pre_process_yuv420_param float rgb_scale; vsi_bool reverse_channel; + + float r_scale; + float g_scale; + float b_scale; + /* local data must be the first. */ vsi_nn_pre_process_yuv420_lcl_data local; } vsi_nn_pre_process_yuv420_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h index b516e6016..1ca45170c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h @@ -71,6 +71,10 @@ typedef struct _vsi_nn_pre_process_yuv422_param float rgb_scale; vsi_bool reverse_channel; + + float r_scale; + float g_scale; + float b_scale; } vsi_nn_pre_process_yuv422_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h index c4391773e..7b2658968 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h @@ -66,6 +66,10 @@ typedef struct _vsi_nn_pre_process_yuv444_param float rgb_scale; vsi_bool reverse_channel; + + float r_scale; + float g_scale; + float b_scale; /* local data must be the first. 
*/ vsi_nn_pre_process_yuv444_lcl_data* local; } vsi_nn_pre_process_yuv444_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_3d.h similarity index 76% rename from src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h rename to src/tim/vx/internal/include/ops/vsi_nn_op_resize_3d.h index d04c589a9..0771a71f0 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_3d.h @@ -22,8 +22,8 @@ * *****************************************************************************/ -#ifndef _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H -#define _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H +#ifndef _VSI_NN_OP_RESIZE_3D_H +#define _VSI_NN_OP_RESIZE_3D_H #include "vsi_nn_types.h" @@ -31,17 +31,19 @@ extern "C" { #endif +typedef struct _vsi_nn_resize_3d_local_data { + vsi_bool use_internal_node; +} vsi_nn_resize_3d_local_data; -typedef struct _vsi_nn_bilinear_grid_sample_param +typedef struct _vsi_nn_resize_3d_param { - struct _bilinear_grid_sample_local_data_t* local; - vsi_bool align_corners; - vsi_nn_pad_mode_e padding_mode; - int32_t const_val; -} vsi_nn_bilinear_grid_sample_param; - -_compiler_assert(offsetof(vsi_nn_bilinear_grid_sample_param, local) == 0, \ - vsi_nn_bilinear_grid_sample_h ); + vsi_nn_resize_3d_local_data* lcl_data; + vsi_enum type; + float factor; + int32_t size[3]; + vsi_bool align_corners; + vsi_bool half_pixel_centers; +} vsi_nn_resize_3d_param; #ifdef __cplusplus } diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h index 7ab6ff2dd..bccc0b5e5 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h @@ -33,6 +33,7 @@ extern "C" { typedef struct _vsi_nn_topk_param { uint32_t k; + int32_t axis; } vsi_nn_topk_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h index d7e598395..6446cd046 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h @@ -52,7 +52,9 @@ enum { D_BF16 = VSI_NN_TYPE_BFLOAT16, D_BOOL8 = VSI_NN_TYPE_BOOL8, D_I4 = VSI_NN_TYPE_INT4, - D_U4 = VSI_NN_TYPE_UINT4 + D_U4 = VSI_NN_TYPE_UINT4, + D_F8_E4M3 = VSI_NN_TYPE_FLOAT8_E4M3, + D_F8_E5M2 = VSI_NN_TYPE_FLOAT8_E5M2 }; /* short alias for qtype */ @@ -63,6 +65,8 @@ enum { Q_ASYM = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC << Q_SHIFT, Q_SYM_PC = VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC << Q_SHIFT, Q_SYM = VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC << Q_SHIFT, + Q_FP8 = VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 << Q_SHIFT, + Q_FP8_PC = VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 << Q_SHIFT, }; typedef struct { diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index ab63a3c70..367ff88fb 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -27,6 +27,7 @@ #include "vsi_nn_types.h" #include "vsi_nn_math.h" #include "vsi_nn_tensor.h" +#include "vsi_nn_log.h" #ifdef __cplusplus extern "C" { @@ -78,6 +79,8 @@ static VSI_INLINE_API vsi_bool type_is_signed case VSI_NN_TYPE_FLOAT32: case VSI_NN_TYPE_FLOAT64: case VSI_NN_TYPE_BFLOAT16: + case VSI_NN_TYPE_FLOAT8_E4M3: + case VSI_NN_TYPE_FLOAT8_E5M2: ret = TRUE; break; default: @@ -93,9 +96,14 @@ 
static VSI_INLINE_API uint32_t type_get_bytes { switch( type ) { + case VSI_NN_TYPE_INT4: + case VSI_NN_TYPE_UINT4: + return 0; case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_UINT8: case VSI_NN_TYPE_BOOL8: + case VSI_NN_TYPE_FLOAT8_E4M3: + case VSI_NN_TYPE_FLOAT8_E5M2: return 1; case VSI_NN_TYPE_INT16: case VSI_NN_TYPE_UINT16: @@ -111,7 +119,8 @@ static VSI_INLINE_API uint32_t type_get_bytes case VSI_NN_TYPE_FLOAT64: return 8; default: - return 0; + VSILOGE("unsupported type: %d", type); + return 1; } } /* type_get_bytes() */ @@ -128,6 +137,8 @@ static VSI_INLINE_API uint32_t type_get_bits case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_UINT8: case VSI_NN_TYPE_BOOL8: + case VSI_NN_TYPE_FLOAT8_E4M3: + case VSI_NN_TYPE_FLOAT8_E5M2: return 8; case VSI_NN_TYPE_INT16: case VSI_NN_TYPE_UINT16: @@ -143,7 +154,8 @@ static VSI_INLINE_API uint32_t type_get_bits case VSI_NN_TYPE_FLOAT64: return 64; default: - return 0; + VSILOGE("unsupported type: %d", type); + return 1; } } /* type_get_bits() */ @@ -236,6 +248,7 @@ static VSI_INLINE_API float affine_to_fp32 ) { float data; + VSI_UNREFERENCED(type); data = ( (float)val - zero_point ) * scale; return data; } /* affine_to_fp32() */ @@ -279,6 +292,7 @@ static VSI_INLINE_API float dfp_to_fp32 ) { float result; + VSI_UNREFERENCED(type); if( fl > 0 ) { result = (float)val * ( 1.0f / ( (float) ( (int64_t)1 << fl ) ) ); @@ -440,6 +454,139 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne return out; } /* fp32_to_bfp16_rtne */ +#define FLOAT_BIAS_EXPONENT 127 +#define FLOAT_EXPONENT_SIZE 8 +#define FLOAT_MANTISSA_SIZE 23 +#define FLOAT8_E4M3_BIAS_EXPONENT 7 +#define FLOAT8_E4M3_EXPONENT_SIZE 4 +#define FLOAT8_E4M3_MANTISSA_SIZE 3 +#define FLOAT8_E5M2_BIAS_EXPONENT 15 +#define FLOAT8_E5M2_EXPONENT_SIZE 5 +#define FLOAT8_E5M2_MANTISSA_SIZE 2 + +static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) { + float fp8_f32 = in / scale; + int32_t fp8_i32 = *((int32_t*)&fp8_f32); + //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 3) - 1)); + int32_t eps = 1 << (23 - 3 - 1); + fp8_i32 += eps; + //fp8_i32 &= mask; + { + int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; + int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff; + int expShiftValue = FLOAT8_E4M3_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT; + int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7; + + exp = (exp + expShiftValue) & 0xF; + + return (uint8_t)(sign << 7 | exp << 3 | mantissa); + } +} /* fp32_to_fp8_e4m3() */ + +static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) { + float fp8_f32 = in / scale; + int32_t fp8_i32 = *((int32_t*)&fp8_f32); + //int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 2) - 1)); + int32_t eps = 1 << (23 - 2 - 1); + fp8_i32 += eps; + //fp8_i32 &= mask; + { + int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; + int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff; + int expShiftValue = FLOAT8_E5M2_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT; + int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x3; + + exp = (exp + expShiftValue) & 0x1F; + + return (uint8_t)(sign << 7 | exp << 2 | mantissa); + } +} /* fp32_to_fp8_e5m2() */ + +static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) { + float val_fp32; + + uint32_t signOut = 0; + uint32_t exponentOut = 0; + uint32_t mantissaOut = 0; + uint32_t out_u = 0; + + uint32_t signIn; + uint32_t exponentIn; + uint32_t mantissaIn; + int expShiftValue = FLOAT_BIAS_EXPONENT - 
FLOAT8_E4M3_BIAS_EXPONENT; + + signIn = (in >> (FLOAT8_E4M3_EXPONENT_SIZE + FLOAT8_E4M3_MANTISSA_SIZE)) & 0x1; + exponentIn = (in >> FLOAT8_E4M3_MANTISSA_SIZE) & 0xF; + mantissaIn = in & 0x7; + + signOut = signIn; + + if (exponentIn == 0 && mantissaIn == 0) + { + goto final; + } + + if (exponentIn == 0xf && mantissaIn == 0x7) + { + exponentOut = 0xff; + mantissaOut = 0x400000; + goto final; + } + + exponentOut = (exponentIn + expShiftValue) & 0xff; + mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff; + + +final: + out_u = signOut << 31 | exponentOut << 23 | mantissaOut; + val_fp32 = *((float*)&out_u); + + return val_fp32 * scale; +} /* fp8_e4m3_to_fp32() */ + +static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) { + float val_fp32; + + uint32_t signOut = 0; + uint32_t exponentOut = 0; + uint32_t mantissaOut = 0; + uint32_t out_u = 0; + + uint32_t signIn; + uint32_t exponentIn; + uint32_t mantissaIn; + int expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E5M2_BIAS_EXPONENT; + + signIn = (in >> 7) & 0x1; + exponentIn = (in >> 2) & 0x1F; + mantissaIn = in & 0x3; + + signOut = signIn; + + if (exponentIn == 0 && mantissaIn == 0) + { + goto final; + } + + if (exponentIn == 0x1f && mantissaIn == 0x3) + { + exponentOut = 0xff; + mantissaOut = 0x400000; + goto final; + } + + + exponentOut = (exponentIn + expShiftValue) & 0xff; + mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff; + + + final: + out_u = signOut << 31 | exponentOut << 23 | mantissaOut; + val_fp32 = *((float*)&out_u); + + return val_fp32 * scale; +} /* fp8_e5m2_to_fp32() */ + static VSI_INLINE_API vsi_status dtype_to_float32 ( uint8_t *src, @@ -458,6 +605,12 @@ static VSI_INLINE_API vsi_status dtype_to_float32 case VSI_NN_TYPE_BFLOAT16: *dst = bfp16_to_fp32( *(int16_t *)src ); break; + case VSI_NN_TYPE_FLOAT8_E4M3: + *dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale); + break; + case VSI_NN_TYPE_FLOAT8_E5M2: + *dst = fp8_e5m2_to_fp32(*(int8_t *)src, src_dtype->scale); + break; case VSI_NN_TYPE_INT4: case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_INT8: @@ -511,6 +664,12 @@ static VSI_INLINE_API vsi_status float32_to_dtype case VSI_NN_TYPE_BFLOAT16: *(int16_t *)dst = fp32_to_bfp16_rtne( src ); break; + case VSI_NN_TYPE_FLOAT8_E4M3: + *(int8_t *)dst = fp32_to_fp8_e4m3(src, dst_dtype->scale); + break; + case VSI_NN_TYPE_FLOAT8_E5M2: + *(int8_t *)dst = fp32_to_fp8_e5m2(src, dst_dtype->scale); + break; case VSI_NN_TYPE_INT4: case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_INT8: diff --git a/src/tim/vx/internal/include/utils/vsi_nn_link_list.h b/src/tim/vx/internal/include/utils/vsi_nn_link_list.h index 7e6afb2ea..2c800a152 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_link_list.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_link_list.h @@ -30,7 +30,7 @@ extern "C"{ #endif -#define vsi_nn_LinkListInitRoot(n) do{n = NULL;} while (0); +#define vsi_nn_LinkListInitRoot(n) {n = NULL;} typedef struct _vsi_nn_link_list { diff --git a/src/tim/vx/internal/include/utils/vsi_nn_math.h b/src/tim/vx/internal/include/utils/vsi_nn_math.h index b8a6d2a9a..924ddf004 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_math.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_math.h @@ -53,12 +53,13 @@ extern "C" { #define DEFINE_ARRAY_TYPE( NAME, TYPE ) \ typedef struct { \ size_t size; \ - TYPE data[0]; \ + TYPE *data; \ } vsi_##NAME##_array_t; \ static VSI_INLINE_API vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \ - vsi_##NAME##_array_t * 
array = (vsi_##NAME##_array_t *)malloc( \ - sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \ + vsi_##NAME##_array_t * array = NULL; \ + array = (vsi_##NAME##_array_t *)malloc( sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \ if (array == NULL) return NULL; \ + array->data = (TYPE *)(((TYPE**)(&(array->data))) + 1); \ array->size = size; \ return array; \ } \ diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index f939592b0..128e7d0c5 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -50,14 +50,23 @@ extern "C" { free( _PTR ); _PTR = NULL; } #define vsi_safe_release_tensor(_t) if(_t){vsi_nn_ReleaseTensor(&(_t)); _t = NULL;} - -#define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe) +#if (defined(_WIN32) || defined(__WIN32__) || defined(WIN32)) + #if defined(_WIN64) + #define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe) + #else + #define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffe) + #endif +#else + #define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe) +#endif #define FOREACH_ARGS(_args, _next, _arg_type) \ while(((_arg_type)((size_t)END_OF_VARIADIC_ARGUMENTS)) != (_next = va_arg(_args, _arg_type))) #define BITS_PER_BYTE 8 +#define VSI_UNREFERENCED( param ) ( ( void ) ( param ) ) + #define VSI_NN_STRINGIZE(X) VSI_NN_DO_STRINGIZE(X) #define VSI_NN_DO_STRINGIZE(X) #X diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 75e5ab7e1..777cf5c04 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -78,6 +78,7 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_asymi8_to_u8; int32_t enable_dataconvert_optimize; int32_t enable_stream_processor; + int32_t enable_rgb88_planar_nhwc; } vsi_nn_runtime_option_t; /** diff --git a/src/tim/vx/internal/include/vsi_nn_error.h b/src/tim/vx/internal/include/vsi_nn_error.h index 7b55aa507..bc9eca8b6 100644 --- a/src/tim/vx/internal/include/vsi_nn_error.h +++ b/src/tim/vx/internal/include/vsi_nn_error.h @@ -31,33 +31,42 @@ #define VSI_ASSERT( cond ) assert(cond) #define VSI_CHECK_PTR( pointer, msg, retval ) \ - do { \ + { \ if( pointer == NULL ) { \ VSILOGD("%s",msg); \ VSI_ASSERT(FALSE); \ } \ - } while(0) + } -#define CHECK_STATUS_FAIL_GOTO( stat, lbl ) do {\ +#define CHECK_STATUS_FAIL_GOTO( stat, lbl ) {\ if( VSI_SUCCESS != stat ) {\ VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\ goto lbl;\ }\ -} while(0) +} -#define CHECK_STATUS( stat ) do {\ +#define CHECK_STATUS( stat ) {\ if( VSI_SUCCESS != stat ) {\ VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\ }\ -} while(0) +} #define CHECK_PTR_FAIL_GOTO( pointer, msg, lbl ) \ - do { \ + { \ if( pointer == NULL ) { \ VSILOGD("CHECK POINTER %s", msg); \ goto lbl; \ } \ - } while(0) + } + +#define CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( pointer, node, msg, lbl ) \ + { \ + if( pointer == NULL ) { \ + vsi_nn_internal_release_node(&node);\ + VSILOGD("CHECK POINTER %s", msg); \ + goto lbl; \ + } \ + } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index 01ec04c29..e93d1af19 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -1,26 +1,3 @@ -/**************************************************************************** -* -* Copyright (c) 2019 
Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the Software), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ /*****Auto generated header file, Please DO NOT modify manually!*****/ #ifndef _VSI_NN_FEATURE_CONFIG_H #define _VSI_NN_FEATURE_CONFIG_H @@ -42,5 +19,6 @@ #if defined(VX_TENSORVIEW_ON_ANY_DIM) && VX_TENSORVIEW_ON_ANY_DIM #define VSI_CONCAT_ENHANCE_SUPPORT #endif +#define VSI_CREATE_TENSOR_FROM_VIEW_SUPPORT #endif diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index 175687096..8504791f8 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -361,6 +361,27 @@ OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle uint8_t * data ); +/** + * Add a new tensor from a view + * Create a new tensor from a view of an existing tensor and add it to the graph. + * + * @param[in] graph Graph handle. + * @param[in] id Required, the id of the parent tensor on which to create the view. + * @param[in] start The start coordinates for each dim, 0-based non-negative integers. + * NULL means the view starts at index 0 of each dim. + * @param[in] end The end coordinates for each dim, 0-based non-negative integers. + * NULL means the view extends to the end of each dim. For a given idx, end[idx] + * should be greater than start[idx]. + * @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise. + */ +OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromView + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t id, + vsi_size_t* start, + vsi_size_t* end + ); + /** * Attach tensor to graph * Attach an exist tensor to graph.
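A minimal usage sketch of the vsi_nn_AddTensorFromView API declared above. The graph handle, the parent_id, and the 26x26x255 parent shape are hypothetical placeholders (the shape mirrors the tiny yolov4 feature maps handled later in this patch), and the end coordinates are assumed to be exclusive, following the requirement that end[idx] be greater than start[idx]:

/* Assumes a 3-D parent tensor of shape {26, 26, 255} (W, H, C) already added to the graph. */
vsi_size_t start[3] = { 0, 0, 4 };     /* begin the view at channel 4 */
vsi_size_t end[3]   = { 26, 26, 85 };  /* assumed exclusive upper bounds, i.e. channels 4..84 */
vsi_nn_tensor_id_t view_id = vsi_nn_AddTensorFromView( graph, parent_id, start, end );
if ( VSI_NN_TENSOR_ID_NA == view_id )
{
    VSILOGE( "Create tensor from view fail" );
}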
diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 37032f473..5cadddb3e 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -206,8 +206,8 @@ #include "ops/vsi_nn_op_maxunpool.h" #include "ops/vsi_nn_op_reversesequence.h" #include "ops/vsi_nn_op_grid_sample.h" -#include "ops/vsi_nn_op_bilinear_grid_sample.h" #include "ops/vsi_nn_op_lpnorm.h" +#include "ops/vsi_nn_op_resize_3d.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" #include "ops/vsi_nn_op_inverse_sigmoid.h" @@ -402,8 +402,8 @@ typedef union _vsi_nn_nn_param vsi_nn_reversesequence_param reversesequence; vsi_nn_inverse_sigmoid_param inverse_sigmoid; vsi_nn_grid_sample_param gridsample; - vsi_nn_bilinear_grid_sample_param bilinear_grid_sample; vsi_nn_lpnorm_param lpnorm; + vsi_nn_resize_3d_param resize_3d; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 227b17f3a..59292cd0d 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -48,6 +48,7 @@ typedef enum VSI_NN_PREPROCESS_IMAGE_RESIZE_BILINEAR, VSI_NN_PREPROCESS_IMAGE_RESIZE_NEAREST, VSI_NN_PREPROCESS_DTYPE_CONVERT, + VSI_NN_PREPROCESS_MEANS_AND_SCALES, } vsi_nn_preprocess_type_e; /** @@ -150,8 +151,25 @@ typedef struct float scale; }vsi_nn_process_mean_and_scale_t; +/** + * Process per-channel means and scales parameter structure + */ +typedef struct +{ + /** Mean value for each channel */ + float* channel_mean; + /** Channel length */ + int32_t channel_len; + /** Scale value */ + float* scale; + /** Scale length */ + int32_t scale_len; +}vsi_nn_process_means_and_scales_t; + typedef vsi_nn_process_mean_and_scale_t vsi_nn_preprocess_mean_and_scale_t; +typedef vsi_nn_process_means_and_scales_t vsi_nn_preprocess_means_and_scales_t; typedef vsi_nn_process_mean_and_scale_t vsi_nn_postprocess_mean_and_scale_t; +typedef vsi_nn_process_means_and_scales_t vsi_nn_postprocess_means_and_scales_t; /** * Process permute parameter structure diff --git a/src/tim/vx/internal/include/vsi_nn_rnn_helper.h b/src/tim/vx/internal/include/vsi_nn_rnn_helper.h index 4bef7b942..14f359338 100644 --- a/src/tim/vx/internal/include/vsi_nn_rnn_helper.h +++ b/src/tim/vx/internal/include/vsi_nn_rnn_helper.h @@ -154,7 +154,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major vsi_bool use_virtual_tensor ); -void vsi_nn_rnn_split_input_tensor +vsi_status vsi_nn_rnn_split_input_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t * input, @@ -163,7 +163,7 @@ void vsi_nn_rnn_split_input_tensor vsi_bool use_virtual_tensor ); -void vsi_nn_rnn_data_check_aligned +vsi_status vsi_nn_rnn_data_check_aligned ( vsi_nn_node_t * self, vsi_nn_tensor_t ** input, diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h index 5b7bdb940..d6ed09045 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor.h @@ -82,6 +82,10 @@ typedef enum VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC = 0x4, /** affine perchannel asymmetric */ VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC = 0x5, + /** float8 */ + VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6, + /** perchannel float8 */ + VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7, /** undefined type */ VSI_NN_QNT_TYPE_NA = 0xff, } vsi_nn_qnt_type_e; diff --git
a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h index 4b997f319..14bb0d62b 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -734,13 +734,15 @@ vsi_status vsi_nn_copy_tensor_veiw_patch /** * OVXLIB internal tensor util api * A wrapper api for OpenVX vxCopyTensorPatch - * Allows the application to copy whole tensor patch from/into an tensor object. + * Allows the application to copy a partial or whole tensor patch from/into a tensor object. * * @param[in] tensor OpenVX Tensor handle. * @param[in] attr OVXLIB Tensor attr. * @param[in] user_ptr The address of the memory location where to store the requested data. * @param[in] usage This declares the effect of the copy with regard to the tensor object * support VX_READ_ONLY or VX_WRITE_ONLY + * @param[in] start The start coordinates for each dim. NULL means copy from index 0 of each dim. + * @param[in] end The end coordinates for each dim. NULL means copy to the end of each dim. * @return VSI_SUCCESS on success, or error core otherwise. */ vsi_status vsi_nn_copy_tensor_patch @@ -748,7 +750,9 @@ vsi_status vsi_nn_copy_tensor_patch vx_tensor tensor, vsi_nn_tensor_attr_t *attr, void * user_ptr, - vsi_enum usage + vsi_enum usage, + vsi_size_t* start, + vsi_size_t* end ); /** diff --git a/src/tim/vx/internal/include/vsi_nn_test.h b/src/tim/vx/internal/include/vsi_nn_test.h index 8f5df6e6a..59bafe198 100644 --- a/src/tim/vx/internal/include/vsi_nn_test.h +++ b/src/tim/vx/internal/include/vsi_nn_test.h @@ -31,26 +31,26 @@ extern "C"{ #endif -#define TEST_CHECK_TENSOR_ID( id, lbl ) do {\ +#define TEST_CHECK_TENSOR_ID( id, lbl ) {\ if( VSI_NN_TENSOR_ID_NA == id ) {\ VSILOGE("CHECK TENSOR ID %d", __LINE__);\ goto lbl;\ }\ - } while(0) + } -#define TEST_CHECK_PTR( ptr, lbl ) do {\ +#define TEST_CHECK_PTR( ptr, lbl ) {\ if( NULL == ptr ) {\ VSILOGE("CHECK PTR %d", __LINE__);\ goto lbl;\ }\ -} while(0) +} -#define TEST_CHECK_STATUS( stat, lbl ) do {\ +#define TEST_CHECK_STATUS( stat, lbl ) {\ if( VSI_SUCCESS != stat ) {\ VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\ goto lbl;\ }\ -} while(0) +} #if defined(__cplusplus) } diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h index 6238e4f2d..380057b94 100644 --- a/src/tim/vx/internal/include/vsi_nn_types.h +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -191,6 +191,16 @@ typedef enum VSI_NN_TYPE_BFLOAT16 = VX_TYPE_BFLOAT16, #else VSI_NN_TYPE_BFLOAT16 = 0x81A, +#endif +#ifdef VSI_NN_TYPE_FLOAT8_E4M3_SUPPORT + VSI_NN_TYPE_FLOAT8_E4M3 = VX_TYPE_FLOAT8_E4M3, +#else + VSI_NN_TYPE_FLOAT8_E4M3 = 0X81E, +#endif +#ifdef VSI_NN_TYPE_FLOAT8_E5M2_SUPPORT + VSI_NN_TYPE_FLOAT8_E5M2 = VX_TYPE_FLOAT8_E5M2, +#else + VSI_NN_TYPE_FLOAT8_E5M2 = 0X81F, #endif VSI_NN_TYPE_VDATA = VX_TYPE_USER_STRUCT_START + 0x1, @@ -268,6 +278,11 @@ typedef enum _vsi_nn_roi_align_type_e VSI_NN_ROI_ALIGN } vsi_nn_roi_align_type_e; +typedef enum _vsi_nn_custom_warp_affine_type_e { + VSI_NN_WARP_AFFINE_TYPE_NONE = 0, + VSI_NN_WARP_AFFINE_TYPE_RGB +} vsi_nn_custom_warp_affine_type_e; + /** Deprecated */ typedef uint32_t vsi_nn_size_t; diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 280f0cc4c..399e72e01 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define
VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 74 +#define VSI_NN_VERSION_PATCH 84 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess.c b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess.c new file mode 100644 index 000000000..6d6ceb98c --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess.c @@ -0,0 +1,578 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _custom_tiny_yolov4_postprocess_local_data_t { + vx_int32 begin_dims[6][VSI_NN_MAX_DIM_NUM]; + vx_int32 end_dims[6][VSI_NN_MAX_DIM_NUM]; + vx_int32 stride_dims[VSI_NN_MAX_DIM_NUM]; +} custom_tiny_yolov4_postprocess_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (4) +#define _OUTPUT_NUM (2) + +static vsi_nn_internal_tensor_t *_create_internal_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t * tensor = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + memcpy( &attr.dtype, &input->attr.dtype, sizeof( attr.dtype ) ); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + return tensor; +} /* _create_internal_tensor() */ + +static vsi_nn_internal_tensor_t *_create_sigmoid_internal_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t * tensor = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + memcpy( &attr.dtype, &input->attr.dtype, sizeof( attr.dtype ) ); + if (attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC || + attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC) + { + attr.dtype.scale = 0.00390625; + attr.dtype.zero_point = 0; + } + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + return tensor; +} /* _create_sigmoid_internal_tensor() */ + +static vsi_nn_internal_tensor_t *_create_output_internal_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * output + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t * tensor = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + memcpy( &attr.dtype, &output->attr.dtype, sizeof( attr.dtype ) ); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + return tensor; +} /* _create_output_internal_tensor() */ + +static vsi_nn_internal_tensor_t *_create_strided_slice_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + int32_t begin_mask, + int32_t end_mask, + int32_t index + ) +{ + vsi_nn_custom_tiny_yolov4_postprocess_param * p = NULL; + vsi_nn_internal_tensor_t * tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + p = (vsi_nn_custom_tiny_yolov4_postprocess_param *)&(self->nn_param.custom_tiny_yolov4_postprocess); + + tensor = _create_internal_tensor(self, input); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->node->nn_param.strided_slice.begin_dims = p->local->begin_dims[index]; + curr->node->nn_param.strided_slice.begin_dims_num = input->attr.dim_num; + curr->node->nn_param.strided_slice.end_dims = p->local->end_dims[index]; + curr->node->nn_param.strided_slice.end_dims_num = input->attr.dim_num; + curr->node->nn_param.strided_slice.stride_dims = p->local->stride_dims; + curr->node->nn_param.strided_slice.stride_dims_num = input->attr.dim_num; + curr->node->nn_param.strided_slice.begin_mask = begin_mask; + curr->node->nn_param.strided_slice.end_mask = end_mask; + curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->node->nn_param.strided_slice.new_axis_mask = 0; + curr->inputs[0] = input; + curr->outputs[0] = tensor->t; + vsi_nn_internal_setup_node( self, curr ); + +final: + return tensor; +} /* _create_strided_slice() */ + +static vsi_nn_internal_tensor_t *_create_sigmoid_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input + ) +{ + vsi_nn_internal_tensor_t * tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + + tensor = 
_create_sigmoid_internal_tensor(self, input); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SIGMOID, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = input; + curr->outputs[0] = tensor->t; + vsi_nn_internal_setup_node( self, curr ); + +final: + return tensor; +} /* _create_sigmoid_op() */ + +static vsi_nn_internal_tensor_t *_create_confidence_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output + ) +{ + vsi_nn_internal_tensor_t * tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + + tensor = _create_output_internal_tensor(self, output); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = input; + curr->outputs[0] = tensor->t; + vsi_nn_internal_setup_node( self, curr ); + +final: + return tensor; +} /* _create_confidence_op() */ + +static vsi_nn_internal_tensor_t *_create_box_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * output, + float bias0, + float bias1 + ) +{ + vsi_nn_internal_tensor_t * tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + + tensor = _create_output_internal_tensor(self, output); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = input0; + curr->inputs[1] = input1; + curr->outputs[0] = tensor->t; + curr->node->nn_param.custom_tiny_yolov4_postprocess_box.bias_0 = bias0; + curr->node->nn_param.custom_tiny_yolov4_postprocess_box.bias_1 = bias1; + vsi_nn_internal_setup_node( self, curr ); + +final: + return tensor; +} /* _create_box_op() */ + +static vsi_nn_internal_tensor_t *_create_reshape_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + vsi_size_t width + ) +{ + vsi_nn_internal_tensor_t * tensor = NULL; + vsi_nn_internal_node_t* curr = NULL; + vsi_size_t shape_1[] = { 1, (vsi_size_t)-1, 1 }; + + shape_1[0] = width; + + tensor = _create_output_internal_tensor(self, output); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = input; + curr->outputs[0] = tensor->t; + curr->node->nn_param.reshape2.size = shape_1; + curr->node->nn_param.reshape2.dim_num = 3; + vsi_nn_internal_setup_node( self, curr ); + +final: + return tensor; +} /* _create_reshape_op() */ + +static vsi_bool _create_concat_op + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * input2, + vsi_nn_tensor_t * input3, + vsi_nn_tensor_t * input4, + vsi_nn_tensor_t * input5, + vsi_nn_tensor_t * output + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 6, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = input0; + curr->inputs[1] = input1; + curr->inputs[2] = input2; + curr->inputs[3] = input3; + curr->inputs[4] = input4; + curr->inputs[5] = input5; + curr->outputs[0] = output; + 
curr->node->nn_param.concat.axis = 1; + ret = vsi_nn_internal_setup_node( self, curr ); + +final: + return ret; +} /* _create_concat_op() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(CUSTOM_TINY_YOLOV4_POSTPROCESS, 4, 2) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + END_IO_TYPE_DECL(CUSTOM_TINY_YOLOV4_POSTPROCESS) + if (!VALIDATE_OP_IO_TYPES(CUSTOM_TINY_YOLOV4_POSTPROCESS, self, inputs, + self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return vsi_nn_internal_optimize_node( self, direction ); +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + vsi_nn_internal_tensor_t * tensor0[12] = {NULL}; + vsi_nn_internal_tensor_t * tensor1[12] = {NULL}; + int32_t index_0 = 1; + int32_t index_1 = 0; + int32_t index_2 = 3; + int32_t index_3 = 2; + + vsi_nn_internal_init_node_wksp( self ); + + /**confidence**/ + /**input 0 chunk 0**/ + /* + sub0:26x26x255 --> 26x26x81, begin: [0, 0, 4, 0] end: [0, 0, 85, 0] stride: [1, 1, 1, 1] + sub1[26, 26, 80] = sigmoid(sub0)[26, 26, 0:0] * sigmoid(sub0)[26, 26, 1:81] + sub2[80, 26, 26] = transpose(sub1) + sub3[80, 676] = reshape(sub2) + */ + tensor0[0] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 0); + CHECK_PTR_FAIL_GOTO( tensor0[0], "Create internal tensor fail.", final ); + tensor0[1] = _create_sigmoid_op(self, tensor0[0]->t); + CHECK_PTR_FAIL_GOTO( tensor0[1], "Create internal tensor fail.", final ); + tensor0[2] = _create_confidence_op(self, tensor0[1]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor0[2], "Create internal tensor fail.", final ); + tensor0[3] = _create_reshape_op(self, tensor0[2]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor0[3], "Create internal tensor fail.", final ); + /**chunk 1**/ + /* + 26x26x255 --> 26x26x81, begin: [0, 0, 89, 0] end: [0, 0, 170, 0] stride: [1, 1, 1, 1] + */ + tensor0[4] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 1); + CHECK_PTR_FAIL_GOTO( tensor0[4], "Create internal tensor fail.", final ); + tensor0[5] = _create_sigmoid_op(self, tensor0[4]->t); + CHECK_PTR_FAIL_GOTO( tensor0[5], "Create internal tensor fail.", final ); + tensor0[6] = _create_confidence_op(self, tensor0[5]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor0[6], "Create internal tensor fail.", final ); + tensor0[7] = _create_reshape_op(self, tensor0[6]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor0[7], "Create internal tensor fail.", final ); + /**chunk 2**/ + /* + 26x26x255 --> 26x26x81, begin: [0, 0, 174, 0] end: [0, 0, 255, 0] stride: [1, 1, 1, 1] + */ + tensor0[8] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 2); + CHECK_PTR_FAIL_GOTO( tensor0[8], "Create internal 
tensor fail.", final ); + tensor0[9] = _create_sigmoid_op(self, tensor0[8]->t); + CHECK_PTR_FAIL_GOTO( tensor0[9], "Create internal tensor fail.", final ); + tensor0[10] = _create_confidence_op(self, tensor0[9]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor0[10], "Create internal tensor fail.", final ); + tensor0[11] = _create_reshape_op(self, tensor0[10]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor0[11], "Create internal tensor fail.", final ); + + /**input 1 chunk 0**/ + /* + sub0:13x13x255 --> 13x13x81, begin: [0, 0, 4, 0] end: [0, 0, 85, 0] stride: [1, 1, 1, 1] + sub1[13, 13, 80] = sigmoid(sub0)[13, 13, 0:0] * sigmoid(sub0)[13, 13, 1:81] + sub2[80, 13, 13] = transpose(sub1) + sub3[80, 169] = reshape(sub2) + */ + tensor1[0] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 0); + CHECK_PTR_FAIL_GOTO( tensor1[0], "Create internal tensor fail.", final ); + tensor1[1] = _create_sigmoid_op(self, tensor1[0]->t); + CHECK_PTR_FAIL_GOTO( tensor1[1], "Create internal tensor fail.", final ); + tensor1[2] = _create_confidence_op(self, tensor1[1]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor1[2], "Create internal tensor fail.", final ); + tensor1[3] = _create_reshape_op(self, tensor1[2]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor1[3], "Create internal tensor fail.", final ); + /**chunk 1**/ + /* + 13x13x255 --> 13x13x81, begin: [0, 0, 89, 0] end: [0, 0, 170, 0] stride: [1, 1, 1, 1] + */ + tensor1[4] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 1); + CHECK_PTR_FAIL_GOTO( tensor1[4], "Create internal tensor fail.", final ); + tensor1[5] = _create_sigmoid_op(self, tensor1[4]->t); + CHECK_PTR_FAIL_GOTO( tensor1[5], "Create internal tensor fail.", final ); + tensor1[6] = _create_confidence_op(self, tensor1[5]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor1[6], "Create internal tensor fail.", final ); + tensor1[7] = _create_reshape_op(self, tensor1[6]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor1[7], "Create internal tensor fail.", final ); + /**chunk 2**/ + /* + 13x13x255 --> 13x13x81, begin: [0, 0, 174, 0] end: [0, 0, 255, 0] stride: [1, 1, 1, 1] + */ + tensor1[8] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 2); + CHECK_PTR_FAIL_GOTO( tensor1[8], "Create internal tensor fail.", final ); + tensor1[9] = _create_sigmoid_op(self, tensor1[8]->t); + CHECK_PTR_FAIL_GOTO( tensor1[9], "Create internal tensor fail.", final ); + tensor1[10] = _create_confidence_op(self, tensor1[9]->t, outputs[0]); + CHECK_PTR_FAIL_GOTO( tensor1[10], "Create internal tensor fail.", final ); + tensor1[11] = _create_reshape_op(self, tensor1[10]->t, outputs[0], 80); + CHECK_PTR_FAIL_GOTO( tensor1[11], "Create internal tensor fail.", final ); + + ret = _create_concat_op(self, tensor0[3]->t, tensor0[7]->t, tensor0[11]->t, + tensor1[3]->t, tensor1[7]->t, tensor1[11]->t, outputs[0]); + if (ret == FALSE) + { + VSILOGE("Create concat operation fail"); + goto final; + } + + ret = FALSE; + /**box**/ + /* + 26x26x255 --> 26x26x4, begin: [0, 0, 0, 0] end: [0, 0, 4, 0] stride: [1, 1, 1, 1] + */ + tensor0[0] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 3); + CHECK_PTR_FAIL_GOTO( tensor0[0], "Create internal tensor fail.", final ); + tensor0[1] = _create_box_op(self, tensor0[0]->t, inputs[index_2], outputs[1], 23, 27); + CHECK_PTR_FAIL_GOTO( tensor0[1], "Create internal tensor fail.", final ); + tensor0[2] = _create_reshape_op(self, tensor0[1]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor0[2], "Create internal tensor fail.", final ); + /* + 26x26x255 --> 26x26x4, begin: [0, 0,
85, 0] end: [0, 0, 89, 0] stride: [1, 1, 1, 1] + */ + tensor0[3] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 4); + CHECK_PTR_FAIL_GOTO( tensor0[3], "Create internal tensor fail.", final ); + tensor0[4] = _create_box_op(self, tensor0[3]->t, inputs[index_2], outputs[1], 37, 58); + CHECK_PTR_FAIL_GOTO( tensor0[4], "Create internal tensor fail.", final ); + tensor0[5] = _create_reshape_op(self, tensor0[4]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor0[5], "Create internal tensor fail.", final ); + /* + 26x26x255 --> 26x26x4, begin: [0, 0, 170, 0] end: [0, 0, 174, 0] stride: [1, 1, 1, 1] + */ + tensor0[6] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 5); + CHECK_PTR_FAIL_GOTO( tensor0[6], "Create internal tensor fail.", final ); + tensor0[7] = _create_box_op(self, tensor0[6]->t, inputs[index_2], outputs[1], 81, 82); + CHECK_PTR_FAIL_GOTO( tensor0[7], "Create internal tensor fail.", final ); + tensor0[8] = _create_reshape_op(self, tensor0[7]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor0[8], "Create internal tensor fail.", final ); + + /* + 13x13x255 --> 13x13x4, begin: [0, 0, 0, 0] end: [0, 0, 4, 0] stride: [1, 1, 1, 1] + */ + tensor1[0] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 3); + CHECK_PTR_FAIL_GOTO( tensor1[0], "Create internal tensor fail.", final ); + tensor1[1] = _create_box_op(self, tensor1[0]->t, inputs[index_3], outputs[1], 81, 82); + CHECK_PTR_FAIL_GOTO( tensor1[1], "Create internal tensor fail.", final ); + tensor1[2] = _create_reshape_op(self, tensor1[1]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor1[2], "Create internal tensor fail.", final ); + /* + 13x13x255 --> 13x13x4, begin: [0, 0, 85, 0] end: [0, 0, 89, 0] stride: [1, 1, 1, 1] + */ + tensor1[3] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 4); + CHECK_PTR_FAIL_GOTO( tensor1[3], "Create internal tensor fail.", final ); + tensor1[4] = _create_box_op(self, tensor1[3]->t, inputs[index_3], outputs[1], 135, 169); + CHECK_PTR_FAIL_GOTO( tensor1[4], "Create internal tensor fail.", final ); + tensor1[5] = _create_reshape_op(self, tensor1[4]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor1[5], "Create internal tensor fail.", final ); + /* + 13x13x255 --> 13x13x4, begin: [0, 0, 170, 0] end: [0, 0, 174, 0] stride: [1, 1, 1, 1] + */ + tensor1[6] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 5); + CHECK_PTR_FAIL_GOTO( tensor1[6], "Create internal tensor fail.", final ); + tensor1[7] = _create_box_op(self, tensor1[6]->t, inputs[index_3], outputs[1], 344, 319); + CHECK_PTR_FAIL_GOTO( tensor1[7], "Create internal tensor fail.", final ); + tensor1[8] = _create_reshape_op(self, tensor1[7]->t, outputs[1], 4); + CHECK_PTR_FAIL_GOTO( tensor1[8], "Create internal tensor fail.", final ); + + ret = _create_concat_op(self, tensor0[2]->t, tensor0[5]->t, tensor0[8]->t, + tensor1[2]->t, tensor1[5]->t, tensor1[8]->t, outputs[1]); + if (ret == FALSE) + { + VSILOGE("Create concat operation fail"); + goto final; + } + +final: + return ret; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + int32_t i = 0; + vsi_nn_custom_tiny_yolov4_postprocess_param *p = &self->nn_param.custom_tiny_yolov4_postprocess; + p->local = \ + (custom_tiny_yolov4_postprocess_local_data_t*)malloc(sizeof(custom_tiny_yolov4_postprocess_local_data_t)); + CHECK_PTR_FAIL_GOTO(p->local, "create buffer fail", final); + memset(p->local, 0, sizeof(custom_tiny_yolov4_postprocess_local_data_t)); + for ( i = 0; i < VSI_NN_MAX_DIM_NUM; i++ ) + { + p->local->stride_dims[i] = 1; + } +
p->local->begin_dims[0][2] = 4; + p->local->end_dims[0][2] = 85; + + p->local->begin_dims[1][2] = 89; + p->local->end_dims[1][2] = 170; + + p->local->begin_dims[2][2] = 174; + p->local->end_dims[2][2] = 255; + + p->local->begin_dims[3][2] = 0; + p->local->end_dims[3][2] = 4; + + p->local->begin_dims[4][2] = 85; + p->local->end_dims[4][2] = 89; + + p->local->begin_dims[5][2] = 170; + p->local->end_dims[5][2] = 174; +final: + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + vsi_nn_safe_free(self->nn_param.custom_tiny_yolov4_postprocess.local); + vsi_nn_internal_deinit_node_wksp( self ); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_TINY_YOLOV4_POSTPROCESS, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_box.c similarity index 59% rename from src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c rename to src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_box.c index c664a3c16..a05ca3f42 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c +++ b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_box.c @@ -35,9 +35,9 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -typedef struct _bilinear_grid_sample_local_data_t { +typedef struct _custom_tiny_yolov4_postprocess_box_local_data_t { int32_t placeholder; -} bilinear_grid_sample_local_data_t; +} custom_tiny_yolov4_postprocess_box_local_data_t; /* Declare number of input and output. @@ -53,27 +53,25 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - - vsi_nn_kernel_param_t* param = NULL; - int32_t align_corners = self->nn_param.bilinear_grid_sample.align_corners; - vsi_nn_kernel_node_t n; + vsi_nn_kernel_param_t * param = NULL; + float bias_0 = self->nn_param.custom_tiny_yolov4_postprocess_box.bias_0; + float bias_1 = self->nn_param.custom_tiny_yolov4_postprocess_box.bias_1; param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners); - n = vsi_nn_kernel_selector( - self->graph, "bilinear_grid_sample", inputs, 2, outputs, 1, param); - if (n == NULL) { - vsi_nn_kernel_param_release(¶m); - status = VSI_FAILURE; - return status; - } - self->n = (vx_node)n; - vsi_nn_kernel_param_release(¶m); - if (self->n) { + vsi_nn_kernel_param_add_float32( param, "bias_0", bias_0 ); + vsi_nn_kernel_param_add_float32( param, "bias_1", bias_1 ); + + self->n = vsi_nn_kernel_selector( self->graph, "tiny_yolov4_postprocess_box", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + + if ( self->n ) + { status = VSI_SUCCESS; } + vsi_nn_kernel_param_release( ¶m ); + return status; } /* op_compute() */ @@ -85,6 +83,9 @@ static vsi_bool op_check ) { /*TODO: Check tensor shapes. 
*/ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -95,61 +96,36 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - if (NULL == self) { - return FALSE; - } - - if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - outputs[0]->attr.size[0] = inputs[1]->attr.size[1]; - outputs[0]->attr.size[1] = inputs[1]->attr.size[2]; - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - if (4 == inputs[0]->attr.dim_num) { - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + uint32_t rank = inputs[0]->attr.dim_num; + vsi_bool ret = TRUE; + + VSI_UNREFERENCED(self); + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = rank; + outputs[0]->attr.size[0] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[1]; + if (rank > 3) + { + memcpy( &outputs[0]->attr.size[3], &inputs[0]->attr.size[3], (rank - 3) * sizeof(vsi_size_t) ); } } - return TRUE; + return ret; } /* op_setup() */ -static vsi_status op_init - ( - vsi_nn_node_t* self - ) -{ - /* TODO - //self->nn_param.bilinear_grid_sample.local = \ - // (bilinear_grid_sample_local_data_t*)malloc(sizeof(bilinear_grid_sample_local_data_t)); - */ - - return VSI_SUCCESS; -} /* op_init() */ - -static vsi_status op_deinit - ( - vsi_nn_node_t* self - ) -{ - vsi_status status = VSI_SUCCESS; - - status = vsi_nn_op_common_deinit(self); - - /* TODO - //vsi_nn_safe_free(self->nn_param.bilinear_grid_sample.local); - */ - - return status; -} /* op_deinit() */ __BEGIN_DECLS /* Registrar */ DEF_OP_REG ( - /* op_name */ BILINEAR_GRID_SAMPLE, - /* init */ op_init, + /* op_name */ CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX, + /* init */ NULL, /* compute */ op_compute, - /* deinit */ op_deinit, + /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, /* setup */ op_setup, /* optimize */ NULL, diff --git a/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_confidence.c b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_confidence.c new file mode 100644 index 000000000..a9cf8b4a6 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/custom_tiny_yolov4_postprocess_confidence.c @@ -0,0 +1,127 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _tiny_yolov4_postprocess_confidence_local_data_t { + int32_t placeholder; +} tiny_yolov4_postprocess_confidence_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + self->n = vsi_nn_kernel_selector( self->graph, "tiny_yolov4_postprocess_confidence", + inputs, 1, outputs, 1, NULL ); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t rank = inputs[0]->attr.dim_num; + vsi_bool ret = TRUE; + + VSI_UNREFERENCED(self); + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = rank; + outputs[0]->attr.size[0] = inputs[0]->attr.size[2] - 1; + outputs[0]->attr.size[1] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[1]; + if (rank > 3) + { + memcpy( &outputs[0]->attr.size[3], &inputs[0]->attr.size[3], (rank - 3) * sizeof(vsi_size_t) ); + } + } + + return ret; +} /* op_setup() */ + + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c index a1e50a481..8fc6d6ce0 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c @@ -54,20 +54,26 @@ DEF_KERNEL_EXECUTOR(_softmax_compute) size_t param_size ) { - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; float *buffer[_CPU_IO_NUM] = {NULL}; vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *attr[_CPU_IO_NUM] = {NULL}; uint32_t i = 0, out_elements = 0; int32_t axis; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // input0 tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // input1 tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // output attr[0] = vsi_nn_kernel_tensor_attr_create(tensors[0]); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create(tensors[1]); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); attr[2] = vsi_nn_kernel_tensor_attr_create(tensors[2]); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); status = 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); CHECK_STATUS_FAIL_GOTO(status, final ); @@ -133,6 +139,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); return VSI_SUCCESS; } @@ -153,6 +161,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); status = _query_kernel(inputs, outputs, kernel); if(status != VSI_SUCCESS) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c index ed1e14932..3fb62eb74 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c @@ -54,7 +54,7 @@ DEF_KERNEL_EXECUTOR(_softmax_exec) size_t param_size ) { - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; float* buffer[_CPU_IO_NUM] = { NULL }; vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL }; @@ -64,11 +64,16 @@ DEF_KERNEL_EXECUTOR(_softmax_exec) float fMax = 0.0; float fProbSum = 0.0f; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &sf_axis); CHECK_STATUS_FAIL_GOTO(status, final ); @@ -141,6 +146,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); return VSI_SUCCESS; } @@ -161,6 +168,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); status = _query_kernel( inputs, outputs, kernel ); diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c index f2cb0315c..b9e77c299 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_affine_cpu.c @@ -62,6 +62,7 @@ static vx_param_description_t _custom_warp_affine_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def ) @@ -97,7 +98,7 @@ static vsi_bool _read_pixel if (out_of_bounds) { - *pixel = 205.0f; + *pixel = 0.0f; return TRUE; } @@ -125,6 +126,7 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL }; int32_t type = 0; + int32_t rgb_type = 0; float 
matrix[6] = {0}; vsi_size_t i = 0; vsi_size_t b = 0; @@ -135,11 +137,16 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_size_t height = 0; vsi_size_t outer_size = 1; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); @@ -153,6 +160,7 @@ DEF_KERNEL_EXECUTOR(_compute) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE], &type); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rgb_type); CHECK_STATUS_FAIL_GOTO(status, final ); for (i = 0; i < 6; i++) { @@ -172,34 +180,95 @@ DEF_KERNEL_EXECUTOR(_compute) { float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1]; float *dst_base = buffer[1] + b * width * height; - for (y = 0; y < height; y++) + + if ( rgb_type == VSI_NN_WARP_AFFINE_TYPE_RGB ) { - for (x = 0; x < width; x++) + width = width / 3; + for (y = 0; y < height; y++) { - float xf = 0; - float yf = 0; - float dst = 0; - - _transform_affine(x, y, matrix, &xf, &yf); - if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR) + for (x = 0; x < width; x++) { - _read_pixel(src_base, attr[0], xf, yf, &dst); - dst_base[y * width + x] = dst; + float xf = 0; + float yf = 0; + float dst = 0; + + _transform_affine(x, y, matrix, &xf, &yf); + + if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR) + { + _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf), &dst); + dst_base[y * 3 * width + 3 * x] = dst; + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf), &dst); + dst_base[y * 3 * width + 3 * x + 1] = dst; + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf), &dst); + dst_base[y * 3 * width + 3 * x + 2] = dst; + } + else + { + float tl = 0, tr = 0, bl = 0, br = 0; + float ar = xf - floorf(xf); + float ab = yf - floorf(yf); + float al = 1.0f - ar; + float at = 1.0f - ab; + + _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf), &tl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1), floorf(yf), &tr); + _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1), floorf(yf) + 1, &br); + + dst_base[y * 3 * width + 3 * x] = + tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf), &tl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 1, floorf(yf), &tr); + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 1, floorf(yf) + 1, &br); + + dst_base[y * 3 * width + 3 * x + 1] = + tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf), &tl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 2, floorf(yf), &tr); + _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 2, floorf(yf) + 1, &br); + + dst_base[y * 3 * width + 3 * x + 2] = + tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + } } - else + } + } + else + { + for (y = 0; y < height; y++) + { + for (x = 0; x < width; x++) { - float tl = 0, tr = 0, 
bl = 0, br = 0; - float ar = xf - floorf(xf); - float ab = yf - floorf(yf); - float al = 1.0f - ar; - float at = 1.0f - ab; - - _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl); - _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr); - _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl); - _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br); - - dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + float xf = 0; + float yf = 0; + float dst = 0; + + _transform_affine(x, y, matrix, &xf, &yf); + if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR) + { + _read_pixel(src_base, attr[0], xf, yf, &dst); + dst_base[y * width + x] = dst; + } + else + { + float tl = 0, tr = 0, bl = 0, br = 0; + float ar = xf - floorf(xf); + float ab = yf - floorf(yf); + float al = 1.0f - ar; + float at = 1.0f - ab; + + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr); + _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl); + _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br); + + dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; + } } } } @@ -233,6 +302,8 @@ static vsi_status _query_kernel ) { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); kernel->info.function = _compute; kernel->info.parameters = _custom_warp_affine_kernel_param_def; @@ -260,6 +331,7 @@ static vsi_nn_kernel_node_t _setup size_t i = 0; size_t buffer_size = 0; int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + int32_t rgb_type = vsi_nn_kernel_param_get_int32( params, "rgb_type"); float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); @@ -278,6 +350,8 @@ static vsi_nn_kernel_node_t _setup node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create( graph, F32, &buffer[i] ); } + node_params[9] = vsi_nn_kernel_scalar_create( + graph, I32, &rgb_type ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM ); @@ -286,6 +360,7 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); } + vsi_nn_kernel_scalar_release( &node_params[9] ); } } return node; diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c index 397f02291..98ae55858 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_warp_perspective_cpu.c @@ -95,7 +95,7 @@ static vsi_bool _read_pixel ) { vsi_size_t width = attr->shape->data[0]; - vsi_size_t height = attr->shape->data[1]; + vsi_size_t height = attr->shape->size > 1 ? 
attr->shape->data[1] : 1; vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height); vsi_size_t bx = 0, by = 0; @@ -139,11 +139,16 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_size_t height = 0; vsi_size_t outer_size = 1; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); @@ -237,6 +242,8 @@ static vsi_status _query_kernel ) { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); kernel->info.function = _compute; kernel->info.parameters = _custom_warp_perspective_kernel_param_def; diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c index 0ec7145e4..6dc60cea4 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c @@ -73,6 +73,8 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer) {0, 0, 0}, // local_size: local group size in thread {0, 0, 0}}; // global_size: image size in thread + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); if (!attr) { @@ -144,6 +146,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, @@ -170,6 +174,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); status = _query_kernel( inputs, outputs, kernel ); diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_box_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_box_evis.c new file mode 100644 index 000000000..c56c80937 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_box_evis.c @@ -0,0 +1,357 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_TINY_YOLOV4_POSTPROCESS_BOX, +} _internal_kernel_e; + +#define _SOURCE "tiny_yolov4_postprocess_box" +#define _KERNEL_NAME CVIVANTE_NAMESPACE("evis.tiny_yolov4_postprocess_box_U8_U8toU8") + +// Add kernel hashtable here +#define TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + (( IN0_DTYPE ) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE << 16 )) +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), \ + _KERNEL_NAME, _SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _tiny_yolov4_postprocess_box_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8, U8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _tiny_yolov4_postprocess_box_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kernel parameters here +}; +#define _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_param_def ) +#define SCALAR_BIAS_0_VALUE (3) +#define SCALAR_BIAS_1_VALUE (4) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_tiny_yolov4_postprocess_box_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + float CONST2 = 16.0f; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + // Add initializer + gpu_param.dim = 2; + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (attr[0]->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 8); + gpu_param.global_size[1] = 1; + + if (attr[0]->shape->data[0] == 13 * 13) + { + CONST2 = 32.0f; + } + + if (attr[0]->dtype == U8 && attr[1]->dtype == U8 && attr[2]->dtype == U8) + { + float input0_scale = attr[0]->scale; + float input0_tail = 0 -
(float)attr[0]->zero_point * input0_scale; + float input1_scale = attr[1]->scale; + float input1_tail = 0 - (float)attr[1]->zero_point * input1_scale; + float output_scale = 1.0f / attr[2]->scale; + float output_zp = (float)attr[2]->zero_point; + gpu_dp_inst_t uniExtract8Data_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDatatoFloat32_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDatatoFloat32_1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataTranspose_0_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0c080400, 0x0d090501, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataTranspose_1_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x0e0a0602, 0x0f0b0703, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniDatatoFloat32_0_4x4", &uniDatatoFloat32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniDatatoFloat32_1_4x4", &uniDatatoFloat32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniDataTranspose_0_2x8", &uniDataTranspose_0_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniDataTranspose_1_2x8", &uniDataTranspose_1_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_scale", &input0_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_tail", &input0_tail); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_scale", &input1_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_tail", &input1_tail); + status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param( node, "CONST2", &CONST2); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + } + + return status; +} /* _tiny_yolov4_postprocess_box_initializer() */ + + +/* + * Query kernel + */ +static 
vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _tiny_yolov4_postprocess_box_kernel_map; + size_t kernel_map_size = _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_map ); + vx_param_description_t * param_def = _tiny_yolov4_postprocess_box_kernel_param_def; + vx_kernel_initialize_f initializer = _tiny_yolov4_postprocess_box_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + float bias_0 = vsi_nn_kernel_param_get_float32( params, "bias_0" ); + float bias_1 = vsi_nn_kernel_param_get_float32( params, "bias_1" ); + + VSI_UNREFERENCED(params); + + memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + shape[0][0] = shape[0][0] * shape[0][1]; + shape[0][1] = shape[0][2]; + shape[0][2] = 1; + + memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + shape[1][0] = shape[1][0] * shape[1][1]; + shape[1][1] = shape[1][2]; + shape[1][2] = 1; + + memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + shape[2][0] = shape[2][0]; + shape[2][1] = shape[2][2] * shape[2][1]; + shape[2][2] = 1; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shape[0], inputs[0]->attr.dim_num ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shape[1], inputs[1]->attr.dim_num ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shape[2], outputs[0]->attr.dim_num ); + + if ( !vsi_nn_kernel_gpu_check_shape( + reshape_tensors[0]->attr.size, reshape_tensors[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel
); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[2], output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_BIAS_0_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &bias_0 ); + node_params[SCALAR_BIAS_1_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &bias_1 ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_BIAS_0_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_BIAS_1_VALUE] ); + } + } + + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[2] ); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( tiny_yolov4_postprocess_box, _setup ) + diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c new file mode 100644 index 000000000..b36ec6b14 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_tiny_yolov4_postprocess_confidence_evis.c @@ -0,0 +1,320 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta.
+ */ +typedef enum +{ + INTERNAL_KERNEL_TINY_YOLOV4_POSTPROCESS_CONFIDENCE, +} _internal_kernel_e; + +#define _SOURCE "tiny_yolov4_postprocess_confidence" +#define _KERNEL_NAME CVIVANTE_NAMESPACE("evis.tiny_yolov4_postprocess_conf_U8toU8") + +// Add kernel hashtable here +#define _CONFIDENCE_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { _CONFIDENCE_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + _KERNEL_NAME, _SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _tiny_yolov4_postprocess_confidence_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _tiny_yolov4_postprocess_confidence_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM \ + _cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_tiny_yolov4_postprocess_confidence_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + gpu_param.dim = 2; + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 4; + gpu_param.global_size[0] = gpu_align_p2( + (attr[0]->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (attr[1]->shape->data[0] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + + if (attr[0]->dtype == U8 && attr[1]->dtype == U8) + { + float output_scale = attr[0]->scale * attr[0]->scale / attr[1]->scale; + int output_zp = attr[1]->zero_point; + uint16_t M0 = 0; + int32_t postShift = 0; + int32_t i = 0; + + gpu_dp_inst_t uniU8TimesU8_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x01010101, // BSelt + 0x00010000, 0x00030002, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU16TimesMultiplier_PostShift_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8PlusU8_trans_0_2x8 = {{ + 0xffffffff, // TCfg + 0x44444444, // ASelt + 0x0c080400, 0x0d090501, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 
0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8PlusU8_trans_1_2x8 = {{ + 0xffffffff, // TCfg + 0x44444444, // ASelt + 0x0e0a0602, 0x0f0b0703, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00007400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit((double)output_scale, &M0, &postShift); + + uniU16TimesMultiplier_PostShift_2x8.data[7] |= (postShift & 0x1F); + for ( i = 8; i < 16; i++ ) + { + uniU16TimesMultiplier_PostShift_2x8.data[i] = M0; + } + + status = vsi_nn_kernel_gpu_add_param( node, "uniU8TimesU8_0_4x4", &uniU8TimesU8_0_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU16TimesMultiplier_PostShift_2x8", + &uniU16TimesMultiplier_PostShift_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8PlusU8_trans_0_2x8", &uniU8PlusU8_trans_0_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8PlusU8_trans_1_2x8", &uniU8PlusU8_trans_1_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + + return status; +} /* _tiny_yolov4_postprocess_confidence_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _tiny_yolov4_postprocess_confidence_kernel_map; + size_t kernel_map_size = _cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_map ); + vx_param_description_t * param_def = _tiny_yolov4_postprocess_confidence_kernel_param_def; + vx_kernel_initialize_f initializer = _tiny_yolov4_postprocess_confidence_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = _CONFIDENCE_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t 
shape[2][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + + VSI_UNREFERENCED(params); + + memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + shape[0][0] = shape[0][0] * shape[0][1]; + shape[0][1] = shape[0][2]; + shape[0][2] = 1; + + memcpy(shape[1], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + shape[1][0] = shape[1][0]; + shape[1][1] = shape[1][2] * shape[1][1]; + shape[1][2] = 1; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shape[0], inputs[0]->attr.dim_num ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shape[1], outputs[0]->attr.dim_num ); + + if ( !vsi_nn_kernel_gpu_check_shape( + reshape_tensors[0]->attr.size, reshape_tensors[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[1], output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, + _TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM ); + } + } + + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( tiny_yolov4_postprocess_confidence, _setup ) + diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c index 169825158..3272fd634 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_affine_evis.c @@ -50,18 +50,27 @@ typedef enum _custom_warp_affine_type_e }custom_warp_affine_type_e; #define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine" +#define _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE "custom_warp_affine_rgb" // Add kernel hashtable here -#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \ - (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20)) +#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D, RGB_TYPE ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20) | (RGB_TYPE << 24)) #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ - { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0 ), \ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \ _CUSTOM_WARP_AFFINE_KERNEL_SOURCE } #define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ - { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0 ), \ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \ _CUSTOM_WARP_AFFINE_KERNEL_SOURCE } +#define PACK_RGB_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1 ), \ + CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb"), \ + _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE } +#define PACK_RGB_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ + { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1 ), \ + 
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_2D"), \ + _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE } typedef struct { @@ -78,6 +87,12 @@ static const _kernel_map_type _custom_warp_affine_kernel_map[] = PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), PACK_2D_KERNEL_MAP( U8, U8, bilinear ), + + PACK_RGB_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_RGB_KERNEL_MAP( U8, U8, bilinear ), + + PACK_RGB_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), + PACK_RGB_2D_KERNEL_MAP( U8, U8, bilinear ), }; /* @@ -124,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer) float matrix4[4] = {0}; int32_t i = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -178,7 +195,81 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer) return status; } /* _custom_warp_affine_initializer() */ +DEF_KERNEL_INITIALIZER(_custom_warp_affine_rgb_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_size_array_t * out_shape = NULL; + float m[6] = {0}; + float matrix0[4] = {0}; + float matrix1[4] = {0}; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + for (i = 0; i < 6; i++) + { + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i], + &m[i]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[2]; matrix0[3] = m[3]; + matrix1[0] = m[4]; matrix1[1] = m[5]; + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 2; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = ( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / (3 * gpu_param.global_scale[0])); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_add_param( node, + "matrix0", &matrix0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "matrix1", &matrix1 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _custom_warp_affine_rgb_initializer() */ /* * Query kernel @@ -188,7 +279,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t type + int32_t type, + int32_t rgb_type ) { vsi_status status = VSI_FAILURE; @@ -205,8 +297,11 @@ static vsi_status _query_kernel in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img ); - + key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img, rgb_type ); + if (rgb_type == 1) + { + initializer = _custom_warp_affine_rgb_initializer; + } for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { if ( kernel_map[i].key == key ) @@ -251,6 +346,7 @@ static vsi_nn_kernel_node_t _setup size_t i = 0; size_t buffer_size = 0; int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); + int32_t rgb_type = vsi_nn_kernel_param_get_int32( params, "rgb_type"); float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) @@ -258,7 +354,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _query_kernel( kernel, inputs, outputs, type ); + status = _query_kernel( kernel, inputs, outputs, type, rgb_type ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -282,7 +378,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); } // Set default border mode. 
- border.constant_value.U32 = 0xcdcdcdcd; + border.constant_value.U32 = 0x00000000; status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); } diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c index 69367599b..ab6d8437e 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_warp_perspective_evis.c @@ -127,6 +127,8 @@ DEF_KERNEL_INITIALIZER(_custom_warp_perspective_initializer) float matrix4[4] = {0}; int32_t i = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c b/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c index 2e7415e62..606b7c80f 100644 --- a/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c +++ b/src/tim/vx/internal/src/custom/ops/op_custom_ainr_denoise_postprocess.c @@ -48,6 +48,9 @@ static vsi_status op_compute { vsi_status status = VSI_SUCCESS; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + #if defined(VX_DENOISE_POSTPROCESS_SUPPORT) && VX_DENOISE_POSTPROCESS_SUPPORT self->n = vxDenoisePostProcesslayer( self->graph->g, @@ -83,6 +86,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -93,6 +99,9 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_setup() */ @@ -101,6 +110,7 @@ static vsi_status op_init vsi_nn_node_t* self ) { + VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_sample.c b/src/tim/vx/internal/src/custom/ops/op_custom_sample.c index 145953922..ef28a2e64 100644 --- a/src/tim/vx/internal/src/custom/ops/op_custom_sample.c +++ b/src/tim/vx/internal/src/custom/ops/op_custom_sample.c @@ -63,6 +63,9 @@ static vsi_bool op_check ) { /*TODO: Check params. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -73,6 +76,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c index 3a37247a9..6da5e6136 100644 --- a/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c +++ b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c @@ -62,6 +62,9 @@ static vsi_bool op_check ) { /*TODO: Check params. 
*/ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -72,6 +75,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c index e076b7c7c..5ee37c58e 100644 --- a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_affine.c @@ -59,6 +59,7 @@ static vsi_status op_compute param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 ); vsi_nn_kernel_param_add_int32( param, "type", p->type); + vsi_nn_kernel_param_add_int32( param, "rgb_type", p->rgb_type); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "custom_warp_affine", @@ -78,6 +79,9 @@ static vsi_bool op_check ) { /*TODO: Check tensor shapes. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c index 7afbd8352..91f788c94 100644 --- a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c +++ b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_warp_perspective.c @@ -78,6 +78,9 @@ static vsi_bool op_check ) { /*TODO: Check tensor shapes. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c index 6a84a5e0b..b9a840ff3 100644 --- a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c @@ -100,7 +100,7 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) size_t param_size ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. 
gpu_param_t gpu_param = { 2, @@ -113,6 +113,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) vsi_nn_kernel_tensor_attr_t *input0_attr = NULL; vsi_size_array_t *input_shape = NULL; + VSI_UNREFERENCED(param_size); + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); input_shape = input0_attr->shape; diff --git a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c index 5741690d3..bc7d36efc 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c @@ -143,6 +143,8 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -183,7 +185,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int32_t i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -240,6 +242,9 @@ static vsi_nn_kernel_node_t _setup int32_t axis = 0; vsi_size_t axis_size = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c index b710fa11e..6fb6cd872 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c @@ -143,6 +143,8 @@ DEF_KERNEL_INITIALIZER(_argmin_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -183,7 +185,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int32_t i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -240,6 +242,9 @@ static vsi_nn_kernel_node_t _setup int32_t axis = 0; size_t axis_size = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c b/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c index c0ed53eee..24b266439 100644 --- a/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c @@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_avg_pool3d_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + vxReadScalarValue(depth_out, &depth_out_value); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, 
"vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c index c62f0b4c0..689603021 100644 --- a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c @@ -135,6 +135,8 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -170,7 +172,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -238,6 +240,9 @@ static vsi_nn_kernel_node_t _setup float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float eps = vsi_nn_kernel_param_get_float32(params, "eps"); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const) || ( inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 && inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 ) diff --git a/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c index bda96ffcb..84811fd82 100644 --- a/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c @@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) vsi_nn_kernel_tensor_attr_t* output_attr = NULL; vsi_size_array_t* out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final); @@ -140,9 +142,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) gpu_param.dim = 2; gpu_param.global_size[0] = - gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) / - gpu_param.global_scale[0], - 4); + (out_shape->data[0] + gpu_param.global_scale[0] - 1) / + gpu_param.global_scale[0]; gpu_param.global_size[1] = ((out_shape->data[1] + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]); diff --git a/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c b/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c index e20cb1be4..d3c4968a8 100644 --- a/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c @@ -134,6 +134,8 @@ DEF_KERNEL_INITIALIZER(_bucketize_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/cast_cl.c b/src/tim/vx/internal/src/kernel/cl/cast_cl.c index 33291a799..e379000ea 100644 --- a/src/tim/vx/internal/src/kernel/cl/cast_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cast_cl.c @@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_cast_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -251,6 +253,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(params); + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c index 4b518b2be..ec74f361b 100644 --- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c @@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c index 8fec39b3c..4b1369f96 100644 --- a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c @@ -229,6 +229,8 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -285,7 +287,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -347,6 +349,9 @@ static vsi_nn_kernel_node_t _setup float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + ret = vsi_nn_kernel_optimize_eltwise_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, inputs[1]->attr.size, inputs[1]->attr.dim_num, @@ -363,11 +368,11 @@ static vsi_nn_kernel_node_t _setup outputs[0], shapes[2], new_rank ); #define _swap_tensor(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } if (shapes[1][3] > shapes[0][3] && new_rank == 4) { diff --git a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c index 0aac099e6..8dca93180 100644 --- a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c @@ -135,6 +135,8 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) int32_t c = 1; uint32_t dim = 1; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -203,7 +205,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -288,11 +290,28 @@ static vsi_nn_kernel_node_t _setup int32_t width = 0; int32_t height = 0; int32_t 
channel = 1; - int32_t i = 0; + uint32_t i = 0; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); - vsi_nn_kernel_optimize_softmax_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, - shapes[0], &rs_dim, &axis_new); + if (axis < 0) + { + axis_new = 0; + shapes[0][0] = 1; + shapes[0][1] = 1; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + shapes[0][0] *= inputs[0]->attr.size[i]; + } + rs_dim = 2; + } + else + { + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rs_dim, &axis_new); + } if (rs_dim > 3) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c index e1bb5f9c4..94e79fe56 100644 --- a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c @@ -103,6 +103,8 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) int32_t output_height = 0; int32_t output_chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -145,7 +147,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -195,6 +197,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c index f34393ecf..596aab56e 100644 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c @@ -126,6 +126,9 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) vsi_nn_kernel_tensor_attr_t * input_attr = NULL; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + VSI_UNREFERENCED(node); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); in_shape = input_attr->shape; diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c index d54182d11..c278d0603 100644 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c @@ -181,6 +181,14 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index 5d29c6796..c44010a9c 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -211,6 +211,9 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { 
NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -253,7 +256,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -327,6 +330,9 @@ static vsi_nn_kernel_node_t _setup float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (unary_type == UNARY_SELU) { alpha = alpha * beta; diff --git a/src/tim/vx/internal/src/kernel/cl/erf_cl.c b/src/tim/vx/internal/src/kernel/cl/erf_cl.c index d6ef8d85b..e7aa1d3d2 100644 --- a/src/tim/vx/internal/src/kernel/cl/erf_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/erf_cl.c @@ -135,6 +135,9 @@ DEF_KERNEL_INITIALIZER(_erf_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -243,6 +246,10 @@ static vsi_nn_kernel_node_t _setup float outputScale = vsi_nn_get_tensor_scale(outputs[0]); float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c index af31ed15d..7341f3282 100644 --- a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c @@ -122,11 +122,14 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) {0, 0, 0}, {0, 0, 0} }; - vx_status status = VX_FAILURE; - vx_tensor output = (vx_tensor)param[2]; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[2]; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + VSI_UNREFERENCED(node); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -258,6 +261,8 @@ static vsi_nn_kernel_node_t _setup float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + VSI_UNREFERENCED(params); + outputScale = 1.0f / outputScale; input0Tail = -(input0Tail * input0Scale); input1Tail = -(input1Tail * input1Scale); diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index bafe86c15..a3fa2d61d 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -205,6 +205,9 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) size_t input_dims1 = 0; size_t i = 0; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + 
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -264,7 +267,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -334,6 +337,9 @@ static vsi_nn_kernel_node_t _setup int32_t is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0; int32_t i = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array); status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array); status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array); diff --git a/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c index a8d56a2bc..82838648c 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_elements_cl.c @@ -51,18 +51,30 @@ typedef enum #define STR(a) #a // Add kernel hashtable here -#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D ) \ - (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 )) +#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D, BEYOND_MAXWIDTH ) \ + (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ) | \ + (BEYOND_MAXWIDTH << 28)) #define PACK_KERNEL_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ - { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 0), \ CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ _GATHER_ELEMENTS_KERNEL_SOURCE} #define PACK_KERNEL_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ - { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 0), \ CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ _GATHER_ELEMENTS_KERNEL_SOURCE} +#define PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 1), \ + CVIVANTE_NAMESPACE("cl.gather_elements_beyond_maxwidth_axis"STR(AXIS)"_"STR(IN0_DTYPE)\ + "_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + _GATHER_ELEMENTS_KERNEL_SOURCE} + +#define PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 1), \ + CVIVANTE_NAMESPACE("cl.gather_elements_beyond_maxwidth_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)\ + "to"STR(OUT_DTYPE)"_2D"), _GATHER_ELEMENTS_KERNEL_SOURCE} + typedef struct { uint32_t key; @@ -89,6 +101,44 @@ static const _kernel_map_type _gather_elements_kernel_map[] = PACK_KERNEL_2D_MAP( 1, F32, I32, F32 ), PACK_KERNEL_2D_MAP( 1, I32, I32, I32 ), PACK_KERNEL_2D_MAP( 1, U32, I32, U32 ), + + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, F32, I32, F32), + 
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, F32, I32, F32), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, F32, I32, F32), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, U8, I32, U8 ), + + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, F32, I32, F32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, F32, I32, F32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, F32, I32, F32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I32, I32, I32 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, U8, I32, U8 ), }; @@ -126,12 +176,38 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer) {0, 0, 0}, {0, 0, 0} }; + vsi_nn_kernel_tensor_attr_t * input_attr0 = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr1 = NULL; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + uint32_t width0 = 0; + uint32_t height0 = 0; + uint32_t width1 = 0; + uint32_t height1 = 0; + uint32_t width_out = 0; + uint32_t height_out = 0; + uint32_t depth0 = 0; + uint32_t depth1 = 0; + + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + input_attr0 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr0, "Create tensor attr buffer fail.", final ); + input_attr1 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( input_attr1, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + width0 = (uint32_t)input_attr0->shape->data[0]; + height0 = (uint32_t)input_attr0->shape->data[1]; + depth0 = input_attr0->shape->size > 2 ? (uint32_t)input_attr0->shape->data[2] : 1; + width1 = (uint32_t)input_attr1->shape->data[0]; + height1 = (uint32_t)input_attr1->shape->data[1]; + depth1 = input_attr1->shape->size > 2 ? 
(uint32_t)input_attr1->shape->data[2] : 1; + width_out = (uint32_t)output_attr->shape->data[0]; + height_out = (uint32_t)output_attr->shape->data[1]; + out_shape = output_attr->shape; gpu_param.global_scale[0] = 1; @@ -146,7 +222,25 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer) (out_shape->data[1] + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]); gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + if (width0 >= GPU_TENSOR_MAX_WIDTH || + width1 >= GPU_TENSOR_MAX_WIDTH || + height0 >= GPU_TENSOR_MAX_WIDTH || + height1 >= GPU_TENSOR_MAX_WIDTH || + depth0 >= GPU_TENSOR_MAX_WIDTH || + depth1 >= GPU_TENSOR_MAX_WIDTH) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_size[0] = out_shape->data[0]; + } + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + status |= vsi_nn_kernel_gpu_add_param( node, "width0", &width0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "height0", &height0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "width1", &width1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "height1", &height1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "width_out", &width_out ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_out", &height_out ); final: #define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } @@ -178,32 +272,52 @@ static vsi_status _query_kernel int32_t img_2d = (outputs[0]->attr.dim_num < 3 || outputs[0]->attr.size[2] == 1) ? 1 : 0; uint32_t key = 0; uint32_t i; + int32_t beyond_maxwidth = 0; + vsi_size_t depth0 = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + vsi_size_t depth1 = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH || + inputs[0]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH || + inputs[1]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH || + inputs[1]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH || + depth0 >= GPU_TENSOR_MAX_WIDTH || + depth1 >= GPU_TENSOR_MAX_WIDTH) + { + beyond_maxwidth = 1; + } + #define _PACK_SELECT_KEY( in0_type, out_type ) \ ( ( in0_type ) | ( out_type << 8 )) - switch (_PACK_SELECT_KEY(in0_dtype, out_dtype)) + if (beyond_maxwidth == 0) + { + switch (_PACK_SELECT_KEY(in0_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + key = GATHER_ELEMENTS_HASH_KEY( axis, F32, in1_dtype, F32, img_2d, 0 ); + break; + case _PACK_SELECT_KEY(U32, U32): + case _PACK_SELECT_KEY(U16, U16): + case _PACK_SELECT_KEY(U8, U8): + key = GATHER_ELEMENTS_HASH_KEY( axis, U32, in1_dtype, U32, img_2d, 0 ); + break; + case _PACK_SELECT_KEY(I32, I32): + case _PACK_SELECT_KEY(I16, I16): + case _PACK_SELECT_KEY(I8, I8): + key = GATHER_ELEMENTS_HASH_KEY( axis, I32, in1_dtype, I32, img_2d, 0 ); + break; + default: + break; + } + } + else { - case _PACK_SELECT_KEY(F32, F32): - case _PACK_SELECT_KEY(F16, F16): - key = GATHER_ELEMENTS_HASH_KEY( axis, F32, in1_dtype, F32, img_2d ); - break; - case _PACK_SELECT_KEY(U32, U32): - case _PACK_SELECT_KEY(U16, U16): - case _PACK_SELECT_KEY(U8, U8): - key = GATHER_ELEMENTS_HASH_KEY( axis, U32, in1_dtype, U32, img_2d ); - break; - case _PACK_SELECT_KEY(I32, I32): - case _PACK_SELECT_KEY(I16, I16): - case _PACK_SELECT_KEY(I8, I8): - key = GATHER_ELEMENTS_HASH_KEY( axis, I32, in1_dtype, I32, img_2d ); - break; - 
default: - break; + key = GATHER_ELEMENTS_HASH_KEY( axis, in0_dtype, in1_dtype, out_dtype, img_2d, 1 ); } #undef _PACK_SELECT_KEY @@ -221,7 +335,8 @@ static vsi_status _query_kernel kernel->info.numParams = _cnt_of_array( _gather_elements_kernel_param_def ); kernel->info.initialize = initializer; // Register code source - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", kernel_map[i].source_name ); // Register binary source vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, diff --git a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c index a41e7ace3..bfcb0df06 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c @@ -119,7 +119,7 @@ static vsi_status cal_gather_nd_tensor_reshape_size uint32_t block_size, uint32_t coordDim, int32_t* newDim, - int32_t batch_dims + uint32_t batch_dims ) { vsi_status status = VSI_FAILURE; @@ -146,17 +146,23 @@ static vsi_status cal_gather_nd_tensor_reshape_size if (batch_dims) { + int32_t rank = 1; for (i = 0; i < offset; i++) { sizes[0] *= input_size[i]; } - for (i = 0; i < coordDim; i++) + for (i = 0; i < coordDim - 1; i++) { - sizes[i + 1] = input_size[i + offset]; + sizes[rank++] = input_size[i + offset]; } - newDim[0] = coordDim == 1 ? 2 : 3; + for (i = 0; i < batch_dims; i++) + { + sizes[rank] *= input_size[dims_num - i - 1]; + } + + newDim[0] = rank + 1; } else { @@ -186,13 +192,27 @@ static vsi_status cal_gather_nd_tensor_reshape_size } else // indices&output reshape { - if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH && batch_dims == 0) { sizes[0] = block_size; sizes[1] = elementCnt / block_size; status = VSI_SUCCESS; newDim[0] = 2; } + else if (batch_dims > 0) + { + vsi_size_t batch_cnt = 1; + for (i = 0; i < batch_dims; ++i) + { + batch_cnt *= input_size[dims_num - i - 1]; + } + + sizes[0] = block_size; + sizes[1] = (elementCnt / block_size) / batch_cnt; + sizes[2] = batch_cnt; + status = VSI_SUCCESS; + newDim[0] = 3; + } } #undef VSI_NN_MAX_IMAGE_WIDTH @@ -220,7 +240,11 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; int32_t block_size = 0; - vsi_ssize_t indices_num = 1; + vsi_size_t indices_num = 1; + vsi_size_t batch_num = 1; + + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -229,6 +253,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); indices_num = attr[0]->shape->data[1]; + batch_num = (attr[0]->shape->size > 2 ? 
attr[0]->shape->data[2] : 1); gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; @@ -237,7 +262,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = indices_num; - gpu_param.global_size[2] = 1; + gpu_param.global_size[2] = batch_num; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, final); @@ -265,7 +290,8 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_nn_kernel_coord_type_e coord_type = _error; uint32_t key = 0; - int i = 0; + int32_t batch_flg = batch_dims > 0 ? 1 : 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -301,7 +327,7 @@ static vsi_status _query_kernel coord_type = _3D; } - key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type, batch_dims ); + key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type, batch_flg ); for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) { @@ -348,6 +374,9 @@ static vsi_nn_kernel_node_t _setup int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + status = cal_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims); status |= cal_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims); status |= cal_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims); diff --git a/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c b/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c index 1e51bd7b7..07eb2651f 100644 --- a/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c @@ -108,6 +108,9 @@ DEF_KERNEL_INITIALIZER(_globallppool_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); output_shape = output_attr->shape; diff --git a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c index 95a4bff5a..5e727fadb 100644 --- a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c @@ -220,6 +220,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) vsi_ssize_t width = 0; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -275,6 +278,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -325,6 +331,9 @@ 
DEF_KERNEL_INITIALIZER(_groupnorm_initializer) vsi_ssize_t chn = 0; int32_t is2D = 0; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); @@ -489,6 +498,9 @@ static vsi_nn_kernel_node_t _setup float rSpaceOrg = 1.0f / (width * height); float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c index 410fe5638..b6e0bf733 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_cl.c @@ -91,6 +91,9 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) ) { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param); + VSI_UNREFERENCED(param_size); // vsi_nn_kernel_tensor_attr * attr[2] = { NULL }; // attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); // attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -172,6 +175,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(params); + /* // Check if gpu can support the size if( !vsi_nn_kernel_gpu_check_shape( diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c index 1a849fe60..828a88a22 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_sma_cl.c @@ -91,6 +91,10 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer) ) { vsi_status status = VSI_FAILURE; + + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(param); + VSI_UNREFERENCED(param_size); // vsi_nn_kernel_tensor_attr * attr[2] = { NULL }; // attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); // attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -172,6 +176,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_SMA_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(params); + /* // Check if gpu can support the size if( !vsi_nn_kernel_gpu_check_shape( diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c index e2b6964a8..193f388d3 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c @@ -118,6 +118,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) vsi_nn_kernel_tensor_t input = NULL; vsi_nn_kernel_tensor_attr_t* input_attr = NULL; + VSI_UNREFERENCED(param_size); + input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_HSTATE]; input_attr = vsi_nn_kernel_tensor_attr_create( input ); diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c index 3912b95cb..0896c6a1c 100644 --- 
a/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c @@ -110,6 +110,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[3]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c index a18b1121e..a99f8b908 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c @@ -120,6 +120,8 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer) vsi_nn_kernel_tensor_t input = NULL; vsi_nn_kernel_tensor_attr_t* input_attr = NULL; + VSI_UNREFERENCED(param_size); + input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_H_STATE]; input_attr = vsi_nn_kernel_tensor_attr_create( input ); diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c index 892377b53..942585037 100644 --- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -188,6 +188,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) vsi_ssize_t height = 0; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -255,6 +257,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) vsi_ssize_t height = 0; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); @@ -405,6 +409,9 @@ static vsi_nn_kernel_node_t _setup float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float inv_multiplier = (float)1.0 / (float)(width * height); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c b/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c index 2626bfeaa..44186d138 100644 --- a/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c @@ -164,6 +164,8 @@ DEF_KERNEL_INITIALIZER(_l1norm_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c index 7b2f50aa5..83e598bb0 100644 --- a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c @@ -115,6 +115,8 @@ 
DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c index 20f3ab01c..a13ec2e19 100644 --- a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c @@ -123,6 +123,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) vsi_ssize_t height = 0; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); @@ -175,7 +177,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(reshape2D); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -243,6 +247,9 @@ static vsi_nn_kernel_node_t _setup float zp2ScaleE2 = 0.0f; float sumZpScaleE2 = 0.0f; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + scale_inOut = input_scale * output_scale; e2InScale = input_scale * input_scale; sumZpScale = width * input_zp * input_scale; diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c index 311de9729..3fc716cad 100644 --- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c @@ -148,6 +148,8 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) vsi_size_array_t * out_shape = NULL; int32_t axis = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -194,7 +196,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -244,6 +246,9 @@ static vsi_nn_kernel_node_t _setup float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float scaleValue = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); beta = vsi_nn_kernel_param_get_float32(params, "beta"); diff --git a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c index bcf4d7a7f..27b97ebb6 100644 --- a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c @@ -106,11 +106,13 @@ DEF_KERNEL_INITIALIZER(_logical_not_initializer) {0, 0, 0}, {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor output = (vx_tensor)param[1]; 
vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -218,6 +220,8 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret = FALSE; + VSI_UNREFERENCED(params); + ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); diff --git a/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c index 7121aa93b..4d0c23ab7 100644 --- a/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c @@ -111,11 +111,13 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer) {0, 0, 0}, {0, 0, 0} }; - vx_status status = VX_FAILURE; - vx_tensor output = (vx_tensor)param[2]; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[2]; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -250,11 +252,11 @@ static vsi_nn_kernel_node_t _setup outputs[0], shapes[2], new_rank ); #define _swap_tensor(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } if (shapes[1][3] > shapes[0][3] && new_rank == 4) { diff --git a/src/tim/vx/internal/src/kernel/cl/lppool_cl.c b/src/tim/vx/internal/src/kernel/cl/lppool_cl.c index 514bec0c7..a46c728d7 100644 --- a/src/tim/vx/internal/src/kernel/cl/lppool_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/lppool_cl.c @@ -121,6 +121,8 @@ DEF_KERNEL_INITIALIZER(_lppool_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c index a7bdb2c89..dec27e3f9 100644 --- a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c @@ -68,7 +68,8 @@ typedef enum _LSTMUNIT_nn_activation_e #define LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \ _input_type, _output_type, _cell_type, _rec_act) \ ((_is_ln << 31) | (_is_cifg << 30) | (_is_proj << 29) | (_is_hybrid << 28) | (_is_peephole << 27) \ -| (_input_type << 23) | (_output_type << 19) | (_cell_type << 15) | (_rec_act << 10)) +| (((uint32_t)_input_type) << 23) | (((uint32_t)_output_type) << 19) | (((uint32_t)_cell_type) << 15) \ +| (_rec_act << 10)) #define LSTMUNIT_ACTIVATION_SOURCE_NAME(_ln_cifg_proj_hybrid_, _input_type) \ "lstmunit_activation_"#_ln_cifg_proj_hybrid_"_"#_input_type @@ -941,6 +942,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_CL_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[CL_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); @@ -983,6 +986,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_CB_initializer) vsi_nn_kernel_tensor_t output = NULL; 
vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[CB_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -1027,6 +1032,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_CS_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[CS_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); @@ -1073,6 +1080,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_L_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[L_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); @@ -1118,6 +1127,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_B_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[B_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); @@ -1164,6 +1175,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_S_initializer) vsi_nn_kernel_tensor_t output = NULL; vsi_nn_kernel_tensor_attr_t* output_attr; + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[S_OUTPUT]; output_attr = vsi_nn_kernel_tensor_attr_create( output ); diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index 5ff2a9308..de336c9ba 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -43,6 +43,7 @@ __BEGIN_DECLS */ #define KERNEL_SOURCE_1 "matrixmul" #define KERNEL_SOURCE_2 "matrixmul_transA" +#define KERNEL_SOURCE_3 "matrixmul_cross" typedef enum { @@ -50,8 +51,8 @@ __BEGIN_DECLS _3D } vsi_nn_kernel_image_dim_type_e; -#define HASH_MATRIXMUL_KEY(_input0_type, _input1_type, _output_type, _image_dim, _trans_a) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_dim << 4) | (_trans_a)) +#define HASH_MATRIXMUL_KEY(_type0, _type1, _type2, _image_dim, _trans_a, _cross) \ + ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_image_dim << 4) | (_trans_a << 2) | (_cross)) #define HASH_MATRIXMUL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) @@ -62,21 +63,29 @@ __BEGIN_DECLS #define HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ CVIVANTE_NAMESPACE("cl.gemm_transb_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) +#define HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_merge") + #define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ - { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0), \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0), \ HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, #define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ - { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1), \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1, 0), \ HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, #define 
TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ - { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2), \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2, 0), \ HASH_MATRIXMUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, +#define TENSOR_MATRIXMUL_MERGE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ + { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 2), \ + HASH_MATRIXMUL_SH_KERNEL_MERGE_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -109,6 +118,9 @@ static const struct { TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_2) TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1) TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_MERGE_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_3) + TENSOR_MATRIXMUL_MERGE_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_3) + TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3) }; /* @@ -132,7 +144,27 @@ static vx_param_description_t _matrixmul_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; +static vx_param_description_t _matrixmul_merge_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + #define _MATRIXMUL_PARAM_NUM _cnt_of_array(_matrixmul_kernel_param_def) +#define _MATRIXMUL_MERGE_PARAM_NUM _cnt_of_array(_matrixmul_merge_kernel_param_def) /* * Kernel initializer @@ -153,17 +185,40 @@ DEF_KERNEL_INITIALIZER(_matrixmul_initializer) {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_ssize_t width = 0; - vsi_ssize_t height = 0; - vsi_ssize_t chn = 0; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_size_t width = 0; + vsi_size_t height = 0; + vsi_size_t chn = 0; - attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - width = attr[0]->shape->data[0]; - height = attr[0]->shape->data[1]; - chn = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + width = attr[2]->shape->data[0]; + height = attr[2]->shape->data[1]; + chn = attr[2]->shape->size > 2 ? 
attr[2]->shape->data[2] : 1; + + if (((attr[0]->shape->size == 4 && attr[1]->shape->size == 3) || + (attr[0]->shape->size == 3 && attr[1]->shape->size == 4)) + && attr[0]->shape->data[2] > 1 && attr[1]->shape->data[2] > 1 + && chn == attr[0]->shape->data[2] * attr[1]->shape->data[2]) + { + if (attr[0]->shape->size == 4) + { + chn = attr[1]->shape->data[2]; + } + else + { + chn = attr[0]->shape->data[2]; + } + } gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; @@ -184,6 +239,16 @@ DEF_KERNEL_INITIALIZER(_matrixmul_initializer) vsi_nn_kernel_tensor_attr_release( &attr[0] ); attr[0] = NULL; } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } return status; } /* _matrixmul_initializer() */ @@ -193,7 +258,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, vsi_size_t depth, - int32_t transa + int32_t transa, + int32_t cross ) { vsi_status status = VSI_FAILURE; @@ -202,7 +268,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_nn_kernel_image_dim_type_e dim_type = _2D; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -251,7 +317,7 @@ static vsi_status _query_kernel output_dtype = U8; } - key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa ); + key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa, cross ); for( i = 0; i < _cnt_of_array(matrixmul_map); i ++ ) { @@ -264,8 +330,16 @@ static vsi_status _query_kernel if ( i < _cnt_of_array(matrixmul_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrixmul_map[i].function_name ); - kernel->info.parameters = _matrixmul_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _matrixmul_kernel_param_def ); + if (cross == 0) + { + kernel->info.parameters = _matrixmul_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _matrixmul_kernel_param_def ); + } + else if (cross == 2) + { + kernel->info.parameters = _matrixmul_merge_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def ); + } kernel->info.initialize = _matrixmul_initializer; // Register code source vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, @@ -290,14 +364,17 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_MATRIXMUL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_param_t node_params[_MATRIXMUL_MERGE_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); + int32_t cross_flg = vsi_nn_kernel_param_get_int32( params, "cross_flg" ); int32_t transFlg = 0; vsi_size_t M = inputs[0]->attr.size[1]; vsi_size_t K = inputs[0]->attr.size[0]; vsi_size_t N = inputs[1]->attr.size[0]; + vsi_size_t a_depth = 0; + vsi_size_t b_depth = 0; vsi_size_t depth = outputs[0]->attr.dim_num > 2 ? 
outputs[0]->attr.size[2] : 1; uint32_t ac2zero = 0; uint32_t bc2zero = 0; @@ -307,6 +384,10 @@ static vsi_nn_kernel_node_t _setup float zp_b = (float)vsi_nn_get_tensor_zero_point(inputs[1]); float scale_out = vsi_nn_get_tensor_scale(outputs[0]); float zp_out = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + int32_t outer = 0; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); scale_out = 1 / scale_out; @@ -329,28 +410,43 @@ static vsi_nn_kernel_node_t _setup transFlg = 1; } - if ((inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) || - (inputs[0]->attr.size[2] > inputs[1]->attr.size[2] - && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2)) + a_depth = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + b_depth = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; + + if (b_depth == 1) { bc2zero = 1; } - else if ((inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) || - (inputs[1]->attr.size[2] > inputs[0]->attr.size[2] - && inputs[0]->attr.dim_num > 2 && inputs[1]->attr.dim_num > 2)) + if (a_depth == 1) + { + ac2zero = 1; + } + + if (inputs[0]->attr.dim_num == 4 && inputs[1]->attr.dim_num == 3 + && a_depth > 1 && b_depth > 1 && cross_flg == 2) { ac2zero = 1; + bc2zero = 0; + outer = (int32_t)a_depth; + } + else if (inputs[1]->attr.dim_num == 4 && inputs[0]->attr.dim_num == 3 + && a_depth > 1 && b_depth > 1 && cross_flg == 2) + { + ac2zero = 0; + bc2zero = 1; + outer = (int32_t)b_depth; } - status = _query_kernel( kernel, inputs, outputs, depth, transFlg ); + status = _query_kernel( kernel, inputs, outputs, depth, transFlg, cross_flg ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { uint32_t index = 3; + size_t param_num = cross_flg == 2 ? _MATRIXMUL_MERGE_PARAM_NUM : _MATRIXMUL_PARAM_NUM; /* Pass parameters to node. */ - vsi_nn_kernel_node_pack_io( node_params, _MATRIXMUL_PARAM_NUM, + vsi_nn_kernel_node_pack_io( node_params, param_num, inputs, 2, outputs, 1 ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &M ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &K ); @@ -363,8 +459,12 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_b ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_out ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &zp_out ); + if (cross_flg == 2) + { + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &outer ); + } /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _MATRIXMUL_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( node, node_params, param_num ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &node_params[3] ); vsi_nn_kernel_scalar_release( &node_params[4] ); @@ -377,6 +477,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[11] ); vsi_nn_kernel_scalar_release( &node_params[12] ); vsi_nn_kernel_scalar_release( &node_params[13] ); + if (cross_flg == 2) + { + vsi_nn_kernel_scalar_release( &node_params[14] ); + } } } return node; diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c index c81289ed6..3446fef8b 100644 --- a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c @@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -190,7 +192,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -252,6 +254,10 @@ static vsi_nn_kernel_node_t _setup float outputScale = vsi_nn_get_tensor_scale(outputs[0]); float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c index 2311810e9..b8ecf2ae9 100644 --- a/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c @@ -115,11 +115,13 @@ DEF_KERNEL_INITIALIZER(_maxpoolwithargmax_initializer) {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor output = (vx_tensor)param[1]; vsi_nn_kernel_tensor_attr_t * attr_out = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -159,7 +161,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output0_dtype = U8; vsi_nn_kernel_dtype_e output1_dtype = I32; uint32_t key = 0; - int32_t i = 0; + size_t i = 0; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output0_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); diff --git a/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c b/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c index 408164bfb..f4086a8e1 100644 --- a/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c @@ -120,6 +120,8 @@ DEF_KERNEL_INITIALIZER(_maxunpool_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c index 92a19a3e5..5d85656cb 100644 --- a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c @@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -190,7 +192,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -252,6 +254,11 @@ static vsi_nn_kernel_node_t _setup float outputScale = vsi_nn_get_tensor_scale(outputs[0]); float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + + outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/cl/mod_cl.c b/src/tim/vx/internal/src/kernel/cl/mod_cl.c index 1398823d9..b6c50164a 100644 --- a/src/tim/vx/internal/src/kernel/cl/mod_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/mod_cl.c @@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_mod_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c index e5bae713e..4afda3666 100644 --- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -224,6 +224,8 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) int32_t axis = 0; int32_t axis_num = 1; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -306,7 +308,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -381,6 +385,9 @@ static vsi_nn_kernel_node_t _setup float input_scale = vsi_nn_get_tensor_scale(inputs[0]); float dim_ratio = (float)1.0 / (float)(width * height); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis_num = (int32_t)axis_num_temp; if (axis_num == 1 && axis[0] == 0) diff --git a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c new file mode 100644 index 000000000..cc6d53800 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c @@ -0,0 +1,401 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_NEAREST_GRID_SAMPLE, +} _internal_kernel_e; + +#define _NEAREST_GRID_SAMPLE_KERNEL_SOURCE() "nearest_grid_sample" + +#define STR(a) #a + +// Add kernel hashtable here +#define NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE)) + +#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { \ + NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ + CVIVANTE_NAMESPACE("cl.nearest_grid_sample_" STR( \ + IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _NEAREST_GRID_SAMPLE_KERNEL_SOURCE() \ + } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _nearest_grid_sample_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP(F32, F32, F32), + PACK_KERNEL_MAP(U8, U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _nearest_grid_sample_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _NEAREST_GRID_SAMPLE_PARAM_NUM 8 +#define _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM \ + _cnt_of_array(_nearest_grid_sample_kernel_param_def) + +#define SCALAR_HALF_INPUT0_W (3) +#define SCALAR_HALF_INPUT0_H (4) +#define SCALAR_ADD_VALUE_W (5) +#define SCALAR_ADD_VALUE_H (6) +#define SCALAR_DEPTH (7) +#define SCALAR_INPUT0_SCALE (8) +#define SCALAR_INPUT0_TAIL (9) +#define SCALAR_INPUT1_SCALE (10) +#define SCALAR_INPUT1_TAIL (11) +#define SCALAR_OUTPUT_SCALE (12) +#define SCALAR_OUTPUT_TAIL (13) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_nearest_grid_sample_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}; + vsi_nn_kernel_tensor_attr_t* output_attr = NULL; + vsi_size_array_t* out_shape = NULL; + + VSI_UNREFERENCED(param_size); + + output_attr = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); + CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 2; + gpu_param.global_size[0] = +
(out_shape->data[0] + gpu_param.global_scale[0] - 1) / + gpu_param.global_scale[0]; + gpu_param.global_size[1] = + ((out_shape->data[1] + gpu_param.global_scale[1] - 1) / + gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + status = vsi_nn_kernel_gpu_config(node, &gpu_param); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) \ + if (_PTR) { \ + vsi_nn_kernel_tensor_attr_release(&_PTR); \ + _PTR = NULL; \ + } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _nearest_grid_sample_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool* is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype, in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _nearest_grid_sample_kernel_map; + size_t kernel_map_size = _cnt_of_array( _nearest_grid_sample_kernel_map ); + vx_param_description_t * param_def = _nearest_grid_sample_kernel_param_def; + size_t param_def_size = + _cnt_of_array(_nearest_grid_sample_kernel_param_def); + vx_kernel_initialize_f initializer = _nearest_grid_sample_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in0_dtype) { + in0_dtype = F32; + } + if (F16 == in1_dtype) { + in1_dtype = F32; + } + if (F16 == out_dtype) { + out_dtype = F32; + } + if ((U8 == in0_dtype) || (U8 == out_dtype)) { + param_def_size = _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM; + *is_use_u8_kernel = TRUE; + } else { + param_def_size = _NEAREST_GRID_SAMPLE_PARAM_NUM; + *is_use_u8_kernel = FALSE; + } + + key = NEAREST_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + uint32_t final_in1_rank = 0; + vsi_nn_tensor_t* rs_tensors = NULL; + vsi_nn_tensor_t* final_tensors[3] = {NULL}; + vsi_size_t in0_width = inputs[0]->attr.size[0]; + vsi_size_t in0_height = inputs[0]->attr.size[1]; + float input0_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input0_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0_tail = -(input0_zp * input0_scale); + float input1_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]); 
+ float input1_scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1_tail = -(input1_zp * input1_scale); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + vsi_bool is_use_u8_kernel = FALSE; + int32_t align_corners = + vsi_nn_kernel_param_get_int32(params, "align_corners"); + uint32_t pad_val = 0; + int32_t depth = 0; + vsi_nn_kernel_dtype_e in0_dtype; + float half_input0_w, half_input0_h, add_float_value_w, add_float_value_h; + + // Check if gpu can support the size + if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, + inputs[0]->attr.dim_num)) { + return NULL; + } + + if (!vsi_nn_kernel_gpu_check_shape(inputs[1]->attr.size, + inputs[1]->attr.dim_num)) { + return NULL; + } + + final_tensors[0] = inputs[0]; + if (inputs[1]->attr.dim_num >= 3) { + final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0]; + final_shape[1] = inputs[1]->attr.size[2]; + final_shape[2] = 1; + final_shape[3] = + inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1; + final_in1_rank = + inputs[1]->attr.dim_num == 3 ? 2 : inputs[1]->attr.dim_num; + if (!vsi_nn_kernel_gpu_check_shape(final_shape, final_in1_rank)) { + return NULL; + } + + rs_tensors = vsi_nn_reshape_tensor( + graph, inputs[1], final_shape, final_in1_rank); + final_tensors[1] = rs_tensors; + } else { + final_tensors[1] = inputs[1]; + } + final_tensors[2] = outputs[0]; + + if (align_corners) { + half_input0_w = ((float)in0_width - 1.0f) * 0.5f; + half_input0_h = ((float)in0_height - 1.0f) * 0.5f; + add_float_value_w = half_input0_w; + add_float_value_h = half_input0_h; + } else { + half_input0_w = (float)in0_width * 0.5f; + half_input0_h = (float)in0_height * 0.5f; + add_float_value_w = half_input0_w - 0.5f; + add_float_value_h = half_input0_h - 0.5f; + } + + add_float_value_w = add_float_value_w + 0.5f; + add_float_value_h = add_float_value_h + 0.5f; + + depth = (int32_t)inputs[0]->attr.size[2]; + in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + if (U8 == in0_dtype) { + pad_val = inputs[0]->attr.dtype.zero_point; + } + + status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node(graph, kernel); + if (node) { + size_t node_params_num = _NEAREST_GRID_SAMPLE_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io(node_params, + _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM, + final_tensors, + input_num, + &final_tensors[2], + output_num); + node_params[SCALAR_HALF_INPUT0_W] = + vsi_nn_kernel_scalar_create(graph, F32, &half_input0_w); + node_params[SCALAR_HALF_INPUT0_H] = + vsi_nn_kernel_scalar_create(graph, F32, &half_input0_h); + node_params[SCALAR_ADD_VALUE_W] = + vsi_nn_kernel_scalar_create(graph, F32, &add_float_value_w); + node_params[SCALAR_ADD_VALUE_H] = + vsi_nn_kernel_scalar_create(graph, F32, &add_float_value_h); + node_params[SCALAR_DEPTH] = + vsi_nn_kernel_scalar_create(graph, I32, &depth); + if (is_use_u8_kernel) { + node_params[SCALAR_INPUT0_SCALE] = + vsi_nn_kernel_scalar_create(graph, F32, &input0_scale); + node_params[SCALAR_INPUT0_TAIL] = + vsi_nn_kernel_scalar_create(graph, F32, &input0_tail); + node_params[SCALAR_INPUT1_SCALE] = + vsi_nn_kernel_scalar_create(graph, F32, &input1_scale); + node_params[SCALAR_INPUT1_TAIL] = + vsi_nn_kernel_scalar_create(graph, F32, &input1_tail); + node_params[SCALAR_OUTPUT_SCALE] = + vsi_nn_kernel_scalar_create(graph, F32, &output_scale); + node_params[SCALAR_OUTPUT_TAIL] = + 
vsi_nn_kernel_scalar_create(graph, F32, &output_zp); + node_params_num = _NEAREST_GRID_SAMPLE_PARAM_QUANT_NUM; + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( + node, node_params, node_params_num); + VSI_ASSERT(status == VSI_SUCCESS); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_HALF_INPUT0_W]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_HALF_INPUT0_H]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_W]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_H]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_DEPTH]); + if (is_use_u8_kernel) { + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_SCALE]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_TAIL]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_SCALE]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_TAIL]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_SCALE]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_TAIL]); + } + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + status = vxSetNodeAttribute( + (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); + CHECK_STATUS(status); + } + } + } + + vsi_safe_release_tensor(rs_tensors); + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( nearest_grid_sample, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c index 4369beaf6..a66b89b3e 100644 --- a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c @@ -121,6 +121,8 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -234,6 +236,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* rs_tensors[2] = { NULL }; vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; int32_t i = 0; + size_t j = 0; vsi_size_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); vsi_size_t prefix_dim_size = 1; vsi_size_t suffix_dim_size = 0; @@ -320,11 +323,11 @@ static vsi_nn_kernel_node_t _setup vsi_nn_ReleaseTensor( &rs_tensors[1] ); } - for (i = SCALAR_INPUT_DEPTH; i < _ONE_HOT_PARAM_NUM; i++) + for (j = SCALAR_INPUT_DEPTH; j < _ONE_HOT_PARAM_NUM; j++) { - if (node_params[i]) + if (node_params[j]) { - vsi_nn_kernel_scalar_release( &node_params[i] ); + vsi_nn_kernel_scalar_release( &node_params[j] ); } } diff --git a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c index 558a1e0d1..18468ae5c 100644 --- a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c @@ -111,12 +111,14 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor output = (vx_tensor)param[1]; vsi_nn_kernel_tensor_attr_t * attr_out = NULL; vsi_size_array_t * out_shape = NULL; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(param_size); + attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", 
final ); diff --git a/src/tim/vx/internal/src/kernel/cl/pow_cl.c b/src/tim/vx/internal/src/kernel/cl/pow_cl.c index 1d1020d7a..6a38b4e85 100644 --- a/src/tim/vx/internal/src/kernel/cl/pow_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/pow_cl.c @@ -126,6 +126,8 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -180,7 +182,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -248,6 +250,10 @@ static vsi_nn_kernel_node_t _setup float inputScale = vsi_nn_get_tensor_scale(inputs[0]); float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + outputScale = 1.0f / outputScale; inputTail = -(inputTail * inputScale); diff --git a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c index 609c90e18..87c8593a3 100644 --- a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c @@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -190,7 +192,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -248,6 +250,9 @@ static vsi_nn_kernel_node_t _setup float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); int32_t is_per_channel_alpha = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); if (is_per_channel_alpha) diff --git a/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c index 696303b21..7e4504008 100644 --- a/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c @@ -35,7 +35,6 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -153,6 +152,8 @@ DEF_KERNEL_INITIALIZER(_multinomial_initializer) vsi_nn_kernel_tensor_attr_t * attr = NULL; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -196,6 +197,8 @@ DEF_KERNEL_INITIALIZER(_cdf_initializer) vsi_size_array_t * in_shape = NULL; vsi_size_t batch = 0; + VSI_UNREFERENCED(param_size); + attr = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -235,6 +238,9 @@ DEF_KERNEL_INITIALIZER(_seed_initializer) {0, 0, 0} }; + VSI_UNREFERENCED(param); + VSI_UNREFERENCED(param_size); + gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_size[0] = 1; @@ -351,6 +357,10 @@ static vsi_nn_kernel_node_t _setup float rand_max = (float)(pow(2.0,32)); float re_rand_max = 1 / rand_max; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + // Check if gpu can support the size if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -370,17 +380,20 @@ static vsi_nn_kernel_node_t _setup attr.is_const = FALSE; attr.vtl = TRUE; tensors[SEED_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(tensors[SEED_INDEX], "Create tensor failed", final); attr.size[0] = inputs[0]->attr.size[0]; attr.size[1] = inputs[0]->attr.size[1]; attr.dim_num = 2; tensors[CDF_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(tensors[CDF_INDEX], "Create tensor failed", final); memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) ); attr.size[1] = 1; attr.dim_num = 2; tensors[SEEDS_INDEX] = vsi_nn_reshape_tensor( graph, inputs[1], attr.size, attr.dim_num ); + CHECK_PTR_FAIL_GOTO(tensors[SEEDS_INDEX], "Create tensor failed", final); in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); diff --git a/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c index 9b92246fd..aa2a45c89 100644 --- a/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c @@ -105,6 +105,8 @@ DEF_KERNEL_INITIALIZER(_reduceall_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c index b347758c1..b5ff4e262 100644 --- a/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c @@ -105,6 +105,8 @@ DEF_KERNEL_INITIALIZER(_reduceany_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c index 05a867406..5ee818064 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c @@ -120,6 +120,8 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff 
--git a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c index 50a502565..ba31ed9fe 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c @@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c index 8d1b7c0dd..b04a246a5 100644 --- a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c @@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c index 8cfd331fa..1ea137bdc 100644 --- a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c @@ -126,6 +126,8 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/repeat_cl.c b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c index c2f28dda7..d40ae1f26 100644 --- a/src/tim/vx/internal/src/kernel/cl/repeat_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c @@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_repeat_initializer) int32_t is1d = 0; int32_t axis = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &axis); @@ -190,7 +192,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; int32_t is1d = inputs[0]->attr.dim_num == 1 ? 1 : 0; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -308,6 +310,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t height = inputs[0]->attr.dim_num > 1 ? inputs[0]->attr.size[1] : 1; vsi_size_t channel = inputs[0]->attr.dim_num > 2 ? 
inputs[0]->attr.size[2] : 1; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c index fda7acdc9..d9b18e718 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c @@ -116,6 +116,8 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c index eef5bec37..8868565f9 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c @@ -117,6 +117,8 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/resize_3d_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_3d_bilinear_cl.c new file mode 100644 index 000000000..77afbc1ca --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/resize_3d_bilinear_cl.c @@ -0,0 +1,329 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +#define _RESIZE_3D_BILINEAR_KERNEL_SOURCE() "resize_3d_bilinear" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_3D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) ) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ { RESIZE_3D_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ CVIVANTE_NAMESPACE("cl.resize_3d_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ _RESIZE_3D_BILINEAR_KERNEL_SOURCE() } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_3d_bilinear_kernel_map[] = +{ + PACK_KERNEL_MAP( F32, F32), + PACK_KERNEL_MAP( F32, U8), + PACK_KERNEL_MAP( U8, F32), + PACK_KERNEL_MAP( U8, U8), + PACK_KERNEL_MAP( I8, I8), + PACK_KERNEL_MAP( BF16,BF16), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_3d_bilinear_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +#define RESIZE_3D_BILINEAR_NUM _cnt_of_array( _resize_3d_bilinear_kernel_param_def ) + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_3d_bilinear_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + VSI_UNREFERENCED(param_size); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 3; + gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->data[2]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _resize_3d_bilinear_initializer() */ + + + +/* + * 
Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_3d_bilinear_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_3d_bilinear_kernel_map ); + vx_param_description_t * param_def = _resize_3d_bilinear_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_3d_bilinear_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_3d_bilinear_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if (I16 == in_dtype) + { + in_dtype = I8; + } + if (I16 == out_dtype) + { + out_dtype = I8; + } + + key = RESIZE_3D_BILINEAR_HASH_KEY( in_dtype, out_dtype ); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[RESIZE_3D_BILINEAR_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t in_height = inputs[0]->attr.size[1]; + vsi_size_t in_depth = inputs[0]->attr.size[2]; + vsi_size_t out_width = outputs[0]->attr.size[0]; + vsi_size_t out_height = outputs[0]->attr.size[1]; + vsi_size_t out_depth = outputs[0]->attr.size[2]; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_tail = -(input_zp * input_scale); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float half_pixel_value = 0.0f; + float scale_factor_x = 0.0f; + float scale_factor_y = 0.0f; + float scale_factor_z = 0.0f; + + if (align_corners && out_width > 1) + { + scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners && out_height > 1) + { + scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + } + else + { + scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + } + + if 
(align_corners && out_depth > 1) + { + scale_factor_z = ((vx_float32)(in_depth - 1) * 1.0f) / (vx_float32)(out_depth - 1); + } + else + { + scale_factor_z = ((vx_float32)in_depth * 1.0f) / (vx_float32)out_depth; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = RESIZE_3D_BILINEAR_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, RESIZE_3D_BILINEAR_NUM, + inputs, input_num, outputs, output_num ); + node_params[2] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_y ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_z ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, U32, &in_width ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, U32, &in_height ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, U32, &in_depth ); + node_params[9] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[10] = vsi_nn_kernel_scalar_create( graph, F32, &input_tail ); + node_params[11] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[12] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( resize_3d_bilinear, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/resize_3d_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_3d_nearest_cl.c new file mode 100644 index 000000000..b0e6138c7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/resize_3d_nearest_cl.c @@ -0,0 +1,332 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_RESIZE_3D_NEAREST, +} _internal_kernel_e; + +#define _RESIZE_3D_NEAREST_KERNEL_SOURCE "resize_3d_nearest" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_3D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ { RESIZE_3D_NEAREST_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ CVIVANTE_NAMESPACE("cl.resize_3d_nearest_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ _RESIZE_3D_NEAREST_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_3d_nearest_kernel_map[] = +{ + PACK_KERNEL_MAP( F32, F32), + PACK_KERNEL_MAP( F32, U8), + PACK_KERNEL_MAP( U8, F32), + PACK_KERNEL_MAP( U8, U8), + PACK_KERNEL_MAP( I8, I8), + PACK_KERNEL_MAP( BF16,BF16), +}; + + + + +/* + * Kernel params + */ +static vx_param_description_t _resize_3d_nearest_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _RESIZE_3D_NEAREST_PARAM_NUM _cnt_of_array( _resize_3d_nearest_kernel_param_def ) + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_3d_nearest_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + VSI_UNREFERENCED(param_size); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->data[2]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { 
vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _resize_3d_nearest_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_3d_nearest_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_3d_nearest_kernel_map ); + vx_param_description_t * param_def = _resize_3d_nearest_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_3d_nearest_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_3d_nearest_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if (I16 == in_dtype) + { + in_dtype = I8; + } + if (I16 == out_dtype) + { + out_dtype = I8; + } + + key = RESIZE_3D_NEAREST_HASH_KEY( in_dtype, out_dtype ); + + for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if( kernel_map[i].key == key ) + { + break; + } + } + if( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_3D_NEAREST_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t in_height = inputs[0]->attr.size[1]; + vsi_size_t in_depth = inputs[0]->attr.size[2]; + vsi_size_t out_width = outputs[0]->attr.size[0]; + vsi_size_t out_height = outputs[0]->attr.size[1]; + vsi_size_t out_depth = outputs[0]->attr.size[2]; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_scale = input_scale / vsi_nn_get_tensor_scale(outputs[0]); + float output_tail = (float)vsi_nn_get_tensor_zero_point(outputs[0]) - input_zp * output_scale; + float half_pixel_value = 0.0f; + float round_value = 0.0f; + float scale_factor_x = 0.0f; + float scale_factor_y = 0.0f; + float scale_factor_z = 0.0f; + + if (align_corners && out_width > 1) + { + scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners && out_height > 1) + 
{ + scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + } + else + { + scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + } + + if (align_corners && out_depth > 1) + { + scale_factor_z = ((vx_float32)(in_depth - 1) * 1.0f) / (vx_float32)(out_depth - 1); + } + else + { + scale_factor_z = ((vx_float32)in_depth * 1.0f) / (vx_float32)out_depth; + } + + if (align_corners) + { + round_value = 0.5f; + } + else + { + round_value = 0.0f; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + size_t node_params_num = _RESIZE_3D_NEAREST_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_3D_NEAREST_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[2] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_y ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_z ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &round_value ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[8] = vsi_nn_kernel_scalar_create(graph, F32, &output_tail ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( resize_3d_nearest, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c index a9c0285fb..60fbda3eb 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c @@ -115,6 +115,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c index d61abcf30..1ca6ba9f1 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c @@ -121,6 +121,8 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c b/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c index cb9cdcd19..10b3855d2 100644 --- 
a/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c @@ -124,6 +124,8 @@ DEF_KERNEL_INITIALIZER(_reversesequence_initializer) vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_size_array_t *input_shape = NULL; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input ); CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -161,16 +163,16 @@ static vsi_status _query_kernel int32_t batch_axis ) { - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_dtype_e in_dtype; - vsi_nn_kernel_dtype_e out_dtype; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype = 0; + vsi_nn_kernel_dtype_e out_dtype = 0; const _kernel_map_type * kernel_map = _reversesequence_kernel_map; size_t kernel_map_size = _cnt_of_array( _reversesequence_kernel_map ); vx_param_description_t * param_def = _reversesequence_kernel_param_def; vx_kernel_initialize_f initializer = _reversesequence_initializer; vsi_nn_kernel_batch_axis_type_e axis_type = _axis1; - uint32_t key; - uint32_t i; + uint32_t key = 0; + size_t i = 0; in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -223,7 +225,7 @@ static vsi_status _query_kernel break; } - for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + for ( i = 0; i < kernel_map_size; i ++ ) { if ( kernel_map[i].key == key ) { @@ -272,6 +274,13 @@ static vsi_nn_kernel_node_t _setup float inoutScale = inputScale / outputScale; float inoutTail = outputTail - inputTail * inoutScale; + vsi_nn_kernel_tensor_t reshape_tensor = NULL; + vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t new_rank = 2; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, @@ -280,6 +289,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } + shapes[0] = inputs[1]->attr.size[0]; + shapes[1] = 1; + + reshape_tensor = vsi_nn_kernel_tensor_reshape(inputs[1]->t, shapes, new_rank); + status = _query_kernel( kernel, inputs, outputs, batch_axis ); if ( VSI_SUCCESS == status) { @@ -287,9 +301,10 @@ static vsi_nn_kernel_node_t _setup if ( node ) { /* Set inputs and outputs */ - uint32_t index = 3; - vsi_nn_kernel_node_pack_io( node_params, _REVERSESEQUENCE_PARAM_NUM, - inputs, input_num, outputs, output_num ); + uint32_t index = 0; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + node_params[index++] = reshape_tensor; + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inoutScale ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inoutTail ); /* Pass parameters to node. 
*/ @@ -298,6 +313,11 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[4] ); } } + + if (reshape_tensor) + { + vsi_nn_kernel_tensor_release( &reshape_tensor ); + } return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c index e897d0f78..9cf2818a6 100644 --- a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c @@ -134,6 +134,8 @@ DEF_KERNEL_INITIALIZER(_roi_align_initializer) vsi_size_array_t * rois_shape = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + rois_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( rois_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c index 2be6a78da..fec2f3b69 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c @@ -155,6 +155,8 @@ DEF_KERNEL_INITIALIZER(_scatter_elements_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c index d409c4c45..e56d37dde 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c @@ -183,6 +183,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) vsi_ssize_t block_size = 0; vsi_ssize_t height = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -222,7 +224,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_nn_kernel_coord_type_e coord_type = _1D; uint32_t key = 0; - int i = 0; + size_t i = 0; input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -290,6 +292,9 @@ static vsi_nn_kernel_node_t _setup int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; vsi_size_t width = 0, area = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (coord_dim > 3) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c index d5f2867bd..94c4fa330 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c @@ -188,6 +188,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) vsi_ssize_t block_size = 0; vsi_ssize_t height = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -227,7 +229,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input2_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(coord_dim); input0_dtype = vsi_nn_kernel_map_dtype( 
inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); @@ -284,6 +288,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t *input_size = inputs[2]->attr.size; uint32_t dims_num = inputs[2]->attr.dim_num; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (coord_dim > 4 && input_size[dims_num - 1] > 1) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/select_cl.c b/src/tim/vx/internal/src/kernel/cl/select_cl.c index 53b1fcdd9..ab449010a 100644 --- a/src/tim/vx/internal/src/kernel/cl/select_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/select_cl.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_eltwise.h" __BEGIN_DECLS @@ -62,6 +63,10 @@ typedef enum _internal_img_dim_e CVIVANTE_NAMESPACE("cl.select_"STR(COND_DTYPE)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ _SELECT_KERNEL_SOURCE} +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + typedef struct { uint32_t key; @@ -111,7 +116,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer) size_t param_size ) { - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -125,6 +130,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -247,19 +254,73 @@ static vsi_nn_kernel_node_t _setup float input1Scale = vsi_nn_get_tensor_scale(inputs[2]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[2]); + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t* shapes_ptr[_IO_NUM]; + vsi_size_t* shapes_in[_INPUT_NUM]; + vsi_size_t rank_in[_INPUT_NUM]; + uint32_t new_rank = 0; + uint32_t i = 0; + vsi_bool ret = FALSE; + + VSI_UNREFERENCED(params); + input0Scale = input0Scale / outputScale; input1Scale = input1Scale / outputScale; input0Tail = outputZP - input0Tail * input0Scale; input1Tail = outputZP - input1Tail * input1Scale; - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + + for (i = 0; i < _IO_NUM; i++) + { + shapes_ptr[i] = shapes[i]; + } + + for (i = 0; i < _INPUT_NUM; i++) + { + shapes_in[i] = inputs[i]->attr.size; + rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num; + } + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const vsi_size_t**)shapes_in, rank_in, _INPUT_NUM, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes_ptr, shapes[_INPUT_NUM], &new_rank); + + if ( ret ) + { + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], shapes[i], new_rank ); + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[i], shapes[i + _INPUT_NUM], new_rank ); + } + } + else + { + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = inputs[i]; + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + reshape_tensors[i + _INPUT_NUM] = outputs[i]; + } + } + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[3]->attr.size, + reshape_tensors[3]->attr.dim_num ) ) { return NULL; } - image_2d = (outputs[0]->attr.dim_num == 2 || 
outputs[0]->attr.size[2] == 1); - status = _query_kernel( kernel, inputs, outputs, image_2d); + image_2d = (reshape_tensors[3]->attr.dim_num == 2); + status = _query_kernel( kernel, inputs, &reshape_tensors[3], image_2d); if( VSI_SUCCESS == status) { @@ -268,7 +329,7 @@ static vsi_nn_kernel_node_t _setup { /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM, - inputs, input_num, outputs, output_num ); + &reshape_tensors[0], input_num, &reshape_tensors[3], output_num ); node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail ); node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); @@ -283,6 +344,15 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); } } + + if (ret) + { + for (i = 0; i < _IO_NUM; i++) + { + vsi_safe_release_tensor( reshape_tensors[i] ); + } + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c index d65200d33..4c620f4ce 100644 --- a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c @@ -116,6 +116,8 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -155,7 +157,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -204,6 +206,8 @@ static int32_t _optimize_mask_shape vsi_size_t new_rank = 0; uint32_t i = 0; + VSI_UNREFERENCED(outputs); + for(i = 0; i < inputs[0]->attr.dim_num; i++) { in_shape[i] = inputs[0]->attr.size[i]; @@ -253,6 +257,9 @@ static vsi_nn_kernel_node_t _setup float input_zpScale = 0; float outputVal1 = 1.0f; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c b/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c index 7aee0e0af..7a2bef62f 100644 --- a/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c @@ -95,6 +95,8 @@ DEF_KERNEL_INITIALIZER(_signal_frame_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -199,6 +201,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* rs_tensors[2] = { NULL }; vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + for (i = 0; i < axis; i++) { inner *= inputs[0]->attr.size[i]; diff --git a/src/tim/vx/internal/src/kernel/cl/slice_cl.c b/src/tim/vx/internal/src/kernel/cl/slice_cl.c index 
4900bb129..d3379bbfe 100644 --- a/src/tim/vx/internal/src/kernel/cl/slice_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/slice_cl.c @@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -245,6 +247,8 @@ static vsi_nn_kernel_node_t _setup float outputScale = vsi_nn_get_tensor_scale(outputs[0]); float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; + VSI_UNREFERENCED(params); + outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, diff --git a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c index 7c7a59a2f..3bca54f63 100644 --- a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c @@ -114,6 +114,8 @@ DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer) vsi_ssize_t height = 0; vsi_ssize_t chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -155,7 +157,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -215,6 +217,9 @@ static vsi_nn_kernel_node_t _setup float scaleInOut = 1.0f; float zpInOut = 0.0f; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + scaleInOut = inputScale / outputScale; zpInOut = outputZp - inputZp * scaleInOut; diff --git a/src/tim/vx/internal/src/kernel/cl/swish_cl.c b/src/tim/vx/internal/src/kernel/cl/swish_cl.c index b616a84ac..97d0db96b 100644 --- a/src/tim/vx/internal/src/kernel/cl/swish_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/swish_cl.c @@ -167,11 +167,13 @@ DEF_KERNEL_INITIALIZER(_swish_initializer) {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor output = (vx_tensor)param[1]; vsi_nn_kernel_tensor_attr_t * attr_out = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -293,6 +295,9 @@ static vsi_nn_kernel_node_t _setup vx_float32 logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); vsi_bool ret = FALSE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + #if (VX_ACTIVATION_EXT_SUPPORT) if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) { diff --git a/src/tim/vx/internal/src/kernel/cl/tile_cl.c b/src/tim/vx/internal/src/kernel/cl/tile_cl.c index 63816947e..266b8ed6a 100644 --- a/src/tim/vx/internal/src/kernel/cl/tile_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/tile_cl.c @@ -106,6 +106,7 @@ static const struct { TENSOR_TILE_AXIS0_UINT32(U32, U32) TENSOR_TILE_AXIS0_FLOAT(F16, F16) TENSOR_TILE_AXIS0_FLOAT(F32, F32) + TENSOR_TILE_AXIS0_KERNELS(F32, U32) TENSOR_TILE_AXIS0_INT32_2D(I8, I8) TENSOR_TILE_AXIS0_INT32_2D(I16, I16) @@ -114,6 
+115,7 @@ static const struct { TENSOR_TILE_AXIS0_UINT32_2D(U32, U32) TENSOR_TILE_AXIS0_FLOAT_2D(F16, F16) TENSOR_TILE_AXIS0_FLOAT_2D(F32, F32) + TENSOR_TILE_AXIS0_KERNELS_2D(F32, U32) }; /* @@ -130,6 +132,8 @@ static vx_param_description_t kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) @@ -140,6 +144,8 @@ static vx_param_description_t kernel_param_def[] = #define SCALAR_INPUT_MULTIPLES_1 (6) #define SCALAR_INPUT_MULTIPLES_2 (7) #define SCALAR_INPUT_MULTIPLES_3 (8) +#define IN_OUT_SCALE (9) +#define IN_OUT_TAIL (10) /* * Kernel initializer @@ -163,6 +169,8 @@ DEF_KERNEL_INITIALIZER(_tile_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -205,10 +213,29 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input_dtype == F16) + { + input_dtype = F32; + } + else if (input_dtype == U8) + { + input_dtype = U32; + } + + if (output_dtype == F16) + { + output_dtype = F32; + } + else if (output_dtype == U8) + { + output_dtype = U32; + } + + key = HASH_TILE_AXIS0_KEY( input_dtype, output_dtype, image_2d ); for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) @@ -280,6 +307,16 @@ static vsi_nn_kernel_node_t _setup vsi_bool ret = FALSE; uint32_t dim = inputs[0]->attr.dim_num; vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float inoutScale = inputScale / outputScale; + float inoutTail = outputTail - inputTail * inoutScale; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); for ( i = 0; i < dim; i++) { @@ -299,10 +336,34 @@ static vsi_nn_kernel_node_t _setup return NULL; } - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], shapes[0], new_rank ); - reshape_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], shapes[2], new_rank ); + if ( new_rank == 4) + { + vsi_size_t newshapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + newshapes[0][0] = shapes[0][0]; + newshapes[2][0] = shapes[2][0]; + newshapes[0][1] = shapes[0][1]; + newshapes[2][1] = shapes[2][1]; + newshapes[0][2] = shapes[0][2] * shapes[0][3]; + newshapes[2][2] = shapes[2][2] * shapes[2][3]; + + if (newshapes[0][2] >= GPU_TENSOR_MAX_WIDTH || + newshapes[2][2] >= GPU_TENSOR_MAX_WIDTH) + { + return NULL; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], newshapes[0], 3 ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], newshapes[2], 3 ); + } + else + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank 
); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + } } else { @@ -315,7 +376,7 @@ static vsi_nn_kernel_node_t _setup goto final; } - image_2d = ((reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1)); + image_2d = reshape_tensors[0]->attr.dim_num == 2; status = _query_kernel( &reshape_tensors[0], &reshape_tensors[1], image_2d, kernel ); if( VSI_SUCCESS == status) { @@ -323,13 +384,16 @@ static vsi_nn_kernel_node_t _setup if( node ) { - uint32_t depthIn = (uint32_t)(new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1); - uint32_t depthOut = (uint32_t)(new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1); - uint32_t batchIn = (uint32_t)(new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1); + uint32_t depthIn = (uint32_t)(new_rank > 2 ? shapes[0][2] : 1); + uint32_t depthOut = (uint32_t)(new_rank > 2 ? shapes[2][2] : 1); + uint32_t batchIn = (uint32_t)(new_rank > 3 ? shapes[0][3] : 1); vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, &reshape_tensors[0], 1, &reshape_tensors[1], 1 ); + shapes[1][2] = shapes[1][2] == 0 ? 1 : shapes[1][2]; + shapes[1][3] = shapes[1][3] == 0 ? 1 : shapes[1][3]; + /* Pass parameters to node. */ node_params[SCALAR_INPUT_BATCH_IN] = vsi_nn_kernel_scalar_create( graph, I32, &batchIn ); @@ -338,14 +402,17 @@ static vsi_nn_kernel_node_t _setup node_params[SCALAR_INPUT_DEPTH_OUT] = vsi_nn_kernel_scalar_create( graph, I32, &depthOut ); node_params[SCALAR_INPUT_MULTIPLES_0] = vsi_nn_kernel_scalar_create( - graph, I32, &multiples[0] ); + graph, I32, &shapes[1][0] ); node_params[SCALAR_INPUT_MULTIPLES_1] = vsi_nn_kernel_scalar_create( - graph, I32, &multiples[1] ); + graph, I32, &shapes[1][1] ); node_params[SCALAR_INPUT_MULTIPLES_2] = vsi_nn_kernel_scalar_create( - graph, I32, &multiples[2] ); + graph, I32, &shapes[1][2] ); node_params[SCALAR_INPUT_MULTIPLES_3] = vsi_nn_kernel_scalar_create( - graph, I32, &multiples[3] ); - + graph, I32, &shapes[1][3] ); + node_params[IN_OUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &inoutScale ); + node_params[IN_OUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &inoutTail ); status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); VSI_ASSERT( status == VSI_SUCCESS ); @@ -356,6 +423,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_1] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_2] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MULTIPLES_3] ); + vsi_nn_kernel_scalar_release( &node_params[IN_OUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[IN_OUT_TAIL] ); } } diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index 0354a1e3f..3d6884065 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -181,6 +181,8 @@ DEF_KERNEL_INITIALIZER(_topk_initializer) vsi_size_array_t * in_shape = NULL; int32_t num_stages = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -222,6 +224,8 @@ DEF_KERNEL_INITIALIZER(_topk_odd_even_sort_initializer) vsi_nn_kernel_tensor_attr_t * input_attr = NULL; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr 
buffer fail.", final ); @@ -424,7 +428,7 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_TOPK_ODD_EVEN_SORT_PARAM_NUM]; + vsi_nn_kernel_node_param_t node_params[_TOPK_ODD_EVEN_SORT_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_size_t block_size = inputs[0]->attr.size[0]; vsi_size_t block_num = 1; @@ -473,8 +477,10 @@ static vsi_nn_kernel_node_t _setup rs_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shape[1], 2 ); + CHECK_PTR_FAIL_GOTO(rs_tensors[1], "Create tensor failed", final); rs_tensors[2] = vsi_nn_reshape_tensor( graph, outputs[1], shape[1], 2 ); + CHECK_PTR_FAIL_GOTO(rs_tensors[2], "Create tensor failed", final); } else { @@ -484,14 +490,17 @@ static vsi_nn_kernel_node_t _setup memcpy( &attr, &(rs_tensors[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); rs_tensors[1] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(rs_tensors[1], "Create tensor failed", final); attr.dtype.vx_type = VSI_NN_TYPE_INT32; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; rs_tensors[2] = vsi_nn_CreateTensor( graph, &attr ); - + CHECK_PTR_FAIL_GOTO(rs_tensors[2], "Create tensor failed", final); rs_tensors[3] = vsi_nn_reshape_tensor( graph, outputs[0], shape[1], 2 ); + CHECK_PTR_FAIL_GOTO(rs_tensors[3], "Create tensor failed", final); rs_tensors[4] = vsi_nn_reshape_tensor( graph, outputs[1], shape[1], 2 ); + CHECK_PTR_FAIL_GOTO(rs_tensors[4], "Create tensor failed", final); input_num = 3; } @@ -505,10 +514,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( node_params, param_num, rs_tensors, input_num, &rs_tensors[input_num], output_num ); /* Pass parameters to node. */ - node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputScale ); - node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &inputTail ); node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &outputScale ); - node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &outputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create(graph, I32, &outputTail ); if (is_odd_even_sort) { node_params[SCALAR_INPUT_SIZE] = vsi_nn_kernel_scalar_create( diff --git a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c index 6f469883a..d2c33870a 100644 --- a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c @@ -123,12 +123,14 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer) {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor input = (vx_tensor)param[0]; vsi_nn_kernel_tensor_attr_t * attr_in = NULL; vsi_size_array_t * in_shape = NULL; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(param_size); + attr_in = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input ); CHECK_PTR_FAIL_GOTO( attr_in, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c index c241e1e16..e0b4517a2 100644 --- a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c @@ -109,7 +109,7 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer) { #define _PACK_A_TIMES_B_PLUS_C_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ (( IN2_TYPE << 24) | ( 
IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE)) - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer) vsi_size_array_t *output_shape = NULL; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( attr[0], "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -331,6 +333,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool ret = FALSE; vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + VSI_UNREFERENCED(params); + for (i = 0; i < _IO_NUM; i++) { shapes_ptr[i] = shapes[i]; diff --git a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c index 679a07d9a..e1861a262 100644 --- a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c @@ -90,7 +90,7 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) size_t param_size ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 2, @@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) float dimRatio = 0.0f; int32_t width = 0; + VSI_UNREFERENCED(param_size); + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1); diff --git a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c index 3fe4185ba..f5010111c 100644 --- a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c @@ -173,6 +173,8 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) vsi_size_array_t * output_shape = NULL; uint32_t packedArgIdx[4] = {0}; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -413,7 +415,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -469,6 +471,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/evis/argmin_evis.c b/src/tim/vx/internal/src/kernel/evis/argmin_evis.c index bce04ac52..90713e08b 100644 --- a/src/tim/vx/internal/src/kernel/evis/argmin_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/argmin_evis.c @@ -166,6 +166,8 @@ DEF_KERNEL_INITIALIZER(_argmin_initializer) vsi_size_array_t * output_shape = NULL; uint32_t packedArgIdx[4] = {0}; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)param[1] ); @@ -351,7 +353,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -397,6 +399,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, diff --git a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c index a794ee542..80a1b21ea 100644 --- a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c @@ -188,7 +188,7 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer) #define _PACK_BATCH_NORM_KEY( IN_TYPE, OUT_TYPE ) \ ( ( IN_TYPE << 16) | ( OUT_TYPE ) ) - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -208,6 +208,8 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer) float output_zp = 0; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); diff --git a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c index 01ea2ab4d..553f8b739 100644 --- a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c @@ -58,8 +58,8 @@ typedef enum #define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ { \ BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ - CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ - _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ } typedef struct @@ -139,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) float output_scale = 1.0; int32_t outputZP = 0; + VSI_UNREFERENCED(param_size); + input_attr[0] = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]); CHECK_PTR_FAIL_GOTO( @@ -418,14 +420,17 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP", &input0ZP); status |= vsi_nn_kernel_gpu_add_param(node, "uint8Scale", &uint8Scale); status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &uint8ZP_out); - status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_left_4x4", + &uniU8SubZPtoFp32_left_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); if (U8 == input1_dtype) { status |= vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP); status |= vsi_nn_kernel_gpu_add_param(node, "input1Scale", 
&input1_scale); - status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part0_4x4", &uniU8SubZPtoFp32_part0_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part1_4x4", &uniU8SubZPtoFp32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part0_4x4", + &uniU8SubZPtoFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part1_4x4", + &uniU8SubZPtoFp32_part1_4x4); } else if (F16 == input1_dtype) { status |= vsi_nn_kernel_gpu_add_param( @@ -552,9 +557,9 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) gpu_param.global_scale[2] = 1; gpu_param.dim = 2; - gpu_param.global_size[0] = gpu_align_p2( + gpu_param.global_size[0] = (out_width + gpu_param.global_scale[0] - 1) / - gpu_param.global_scale[0], 4); + gpu_param.global_scale[0]; gpu_param.global_size[1] = ((out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]); diff --git a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c index d7074c3db..75623dda3 100644 --- a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c @@ -109,6 +109,8 @@ DEF_KERNEL_INITIALIZER(_bucketize_initializer) vsi_size_array_t * input0_shape = NULL; vsi_size_array_t * input1_shape = NULL; + VSI_UNREFERENCED(param_size); + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input0_attr, "Create tensor attr buffer fail.", final ); input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/cast_evis.c b/src/tim/vx/internal/src/kernel/evis/cast_evis.c index f36e100b1..7908dd581 100644 --- a/src/tim/vx/internal/src/kernel/evis/cast_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cast_evis.c @@ -150,6 +150,8 @@ DEF_KERNEL_INITIALIZER(_cast_initializer) vsi_nn_kernel_tensor_attr_t * input_attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -289,6 +291,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(params); + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/clip_evis.c b/src/tim/vx/internal/src/kernel/evis/clip_evis.c index 87784bf31..add96c2c0 100644 --- a/src/tim/vx/internal/src/kernel/evis/clip_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/clip_evis.c @@ -142,6 +142,8 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) int32_t srcFixPointPos = 0; int32_t dstFixPointPos = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c index 2fb8330de..4547dfb11 100644 --- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -308,6 +308,8 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) float input1Scale = 1.0f; float input1Tail = 0; + VSI_UNREFERENCED(param_size); + attr[0] = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -365,7 +367,6 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) / gpu_param.global_scale[1]); gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; - if (1) { gpu_dp_inst_t uniExtractInteger_2x8 = {{ 0x33333333, // TCfg @@ -475,7 +476,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -527,6 +528,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret = FALSE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + ret = vsi_nn_kernel_optimize_eltwise_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, inputs[1]->attr.size, inputs[1]->attr.dim_num, @@ -543,11 +547,11 @@ static vsi_nn_kernel_node_t _setup outputs[0], shapes[2], new_rank ); #define _swap_tensor(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } if (shapes[1][3] > shapes[0][3] && new_rank == 4) { diff --git a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c index 8e5d05e6c..e5669b0fd 100644 --- a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c @@ -134,6 +134,8 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer) int32_t input_width = 0; int32_t output_width = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c index cad8476a6..dbdd513ab 100644 --- a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c @@ -36,6 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" __BEGIN_DECLS @@ -47,21 +48,29 @@ __BEGIN_DECLS #define KERNEL_SOURCE_2 "cumsum_2d" #define KERNEL_SOURCE_3 "cumsum_bf16" #define KERNEL_SOURCE_4 "cumsum_f16_u8" +#define KERNEL_SOURCE_5 "cumsum_ex_rev_axis0" +#define KERNEL_SOURCE_6 "cumsum_ex_rev_axis1" +#define KERNEL_SOURCE_7 "cumsum_ex_rev_axis2" // Add kernel hashtable here -#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ - ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) +#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d) \ + ((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) #define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \ CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ SOURCE }, #define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \ CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ SOURCE }, +#define 
HASH_CUMSUM_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \ + CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -108,6 +117,24 @@ static const struct { HASH_CUMSUM_KERNELS_2D(1, F16, U8, KERNEL_SOURCE_4) HASH_CUMSUM_KERNELS_2D(1, F16, I8, KERNEL_SOURCE_4) HASH_CUMSUM_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(0, U8, U8, KERNEL_SOURCE_5) + HASH_CUMSUM_EX_REV_KERNELS(0, I8, I8, KERNEL_SOURCE_5) + HASH_CUMSUM_EX_REV_KERNELS(0, I16, I16, KERNEL_SOURCE_5) + HASH_CUMSUM_EX_REV_KERNELS(0, F16, F16, KERNEL_SOURCE_5) + HASH_CUMSUM_EX_REV_KERNELS(1, U8, U8, KERNEL_SOURCE_6) + HASH_CUMSUM_EX_REV_KERNELS(1, I8, I8, KERNEL_SOURCE_6) + HASH_CUMSUM_EX_REV_KERNELS(1, I16, I16, KERNEL_SOURCE_6) + HASH_CUMSUM_EX_REV_KERNELS(1, F16, F16, KERNEL_SOURCE_6) + HASH_CUMSUM_EX_REV_KERNELS(2, U8, U8, KERNEL_SOURCE_7) + HASH_CUMSUM_EX_REV_KERNELS(2, I8, I8, KERNEL_SOURCE_7) + HASH_CUMSUM_EX_REV_KERNELS(2, I16, I16, KERNEL_SOURCE_7) + HASH_CUMSUM_EX_REV_KERNELS(2, F16, F16, KERNEL_SOURCE_7) + HASH_CUMSUM_EX_REV_KERNELS(1, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(1, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(1, F16, I16, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(2, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(2, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_EX_REV_KERNELS(2, F16, I16, KERNEL_SOURCE_4) }; /* @@ -143,6 +170,8 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) {0, 0, 0}}; // globalWorkSize: image size in thread int32_t axis = 0; + int32_t exclusive = 0; + int32_t reverse = 0; int32_t width = 0; int32_t height = 0; int32_t channel = 0; @@ -161,6 +190,8 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -168,6 +199,10 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &exclusive); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse); + CHECK_STATUS_FAIL_GOTO(status, OnError ); if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { @@ -204,7 +239,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) } in_out_scale = input_scale * output_scale; - in_out_zp_scale = (float)in_out_scale * input_zp; + in_out_zp_scale = (float)in_out_scale * input_zp * (-1); input_shape = attr[0]->shape; dim = (uint32_t)input_shape->size; @@ -460,14 +495,121 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumHorzRevF16toF16A_4x4 = {{ + 0x01051555, // TCfg + 0x00000000, // ASelt + 0x05674567, 0x00070067, // ABin + 0x020a2aaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00003c00, + 0x3c003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumHorzRevF16toF16B_4x4 = {{ + 0x01051555, // TCfg + 0x00000000, // ASelt + 0x01230123, 
0x00030023, // ABin + 0x020a2aaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00003c00, + 0x3c003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumHorzRevF16toF16C_2x8 = {{ + 0x11115555, // TCfg + 0x00000000, // ASelt + 0x43424140, 0x07060504, // ABin + 0x2222aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniAccSumHorzRevF16toF16_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumHorzRevU8toI16A_4x4 = {{ + 0x01051555, // TCfg + 0x00000000, // ASelt + 0x05674567, 0x00070067, // ABin + 0x020a2aaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00000001, + 0x00010001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSumHorzRevU8toI16B_8x4 = {{ + 0x15555555, 0x01550555, // TCfg + 0x443214c7, 0x3214c700, 0x14c70044, 0xc7000432, 0x00003214, // BinSelect + 0x00000700, // AccumType, ConstantType, and PostShift + 0x01010101, 0x01010101, 0x01010101, 0x00010101, + 0x01010101, 0x00000101, 0x01010101, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniSubZpRevI16toI16_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00080001, 0x00070001, 0x00060001, 0x00050001, + 0x00040001, 0x00030001, 0x00020001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniAccSumHorzRevI16toI32A_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniAccSumHorzRevI16toI32B_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_quantize_multiplier_16bit( (double)input_scale * output_scale, &M0, &postShift); multAndoutZP0[0] = (uint32_t)(M0); multAndoutZP0[1] = (uint32_t)((attr[1]->asymm.zero_point << postShift) - input_zp * M0); gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); - status = vsi_nn_kernel_gpu_add_param(node, "width", &width); - status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); - CHECK_STATUS_FAIL_GOTO(status, OnError ); + if ((exclusive || reverse) && axis == 0) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzRevF16toF16A_4x4", &uniSumHorzRevF16toF16A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzRevF16toF16B_4x4", 
&uniSumHorzRevF16toF16B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzRevF16toF16C_2x8", &uniSumHorzRevF16toF16C_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzRevF16toF16_2x8", &uniAccSumHorzRevF16toF16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzRevU8toI16A_4x4", &uniSumHorzRevU8toI16A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzRevU8toI16B_8x4", &uniSumHorzRevU8toI16B_8x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSubZpRevI16toI16_2x8", &uniSubZpRevI16toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzRevI16toI32A_4x4", &uniAccSumHorzRevI16toI32A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzRevI16toI32B_4x4", &uniAccSumHorzRevI16toI32B_4x4 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } switch( pack_key ) { @@ -477,7 +619,6 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) case _PACK_SELECT_KEY( F16, F16, 2, 3): { status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); - status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale); status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale); @@ -493,47 +634,21 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) "uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, U8, 0, 2): - case _PACK_SELECT_KEY( U8, U8, 1, 2): - case _PACK_SELECT_KEY( U8, U8, 0, 3): case _PACK_SELECT_KEY( U8, U8, 1, 3): - case _PACK_SELECT_KEY( I8, I8, 0, 2): - case _PACK_SELECT_KEY( I8, I8, 1, 2): - case _PACK_SELECT_KEY( I8, I8, 0, 3): case _PACK_SELECT_KEY( I8, I8, 1, 3): - case _PACK_SELECT_KEY( I16, I16, 0, 2): - case _PACK_SELECT_KEY( I16, I16, 1, 2): - case _PACK_SELECT_KEY( I16, I16, 0, 3): case _PACK_SELECT_KEY( I16, I16, 1, 3): - case _PACK_SELECT_KEY( F16, F16, 0, 2): - case _PACK_SELECT_KEY( F16, F16, 1, 2): - case _PACK_SELECT_KEY( F16, F16, 0, 3): case _PACK_SELECT_KEY( F16, F16, 1, 3): + case _PACK_SELECT_KEY( U8, U8, 1, 2): + case _PACK_SELECT_KEY( I8, I8, 1, 2): + case _PACK_SELECT_KEY( I16, I16, 1, 2): + case _PACK_SELECT_KEY( F16, F16, 1, 2): { - status = vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status = 
vsi_nn_kernel_gpu_add_param(node, "height", &height); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale); status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale); @@ -547,6 +662,26 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) "uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, U8, 0, 2): + case _PACK_SELECT_KEY( U8, U8, 0, 3): + case _PACK_SELECT_KEY( I8, I8, 0, 2): + case _PACK_SELECT_KEY( I8, I8, 0, 3): + case _PACK_SELECT_KEY( I16, I16, 0, 2): + case _PACK_SELECT_KEY( I16, I16, 0, 3): + case _PACK_SELECT_KEY( F16, F16, 0, 2): + case _PACK_SELECT_KEY( F16, F16, 0, 3): + { + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, @@ -578,7 +713,9 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) case _PACK_SELECT_KEY( BF16, BF16, 1, 3): case _PACK_SELECT_KEY( BF16, BF16, 2, 3): { - status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &channel); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); status |= vsi_nn_kernel_gpu_add_param( @@ -604,7 +741,9 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) case _PACK_SELECT_KEY( F16, I16, 1, 3): case _PACK_SELECT_KEY( F16, I16, 2, 3): { - status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "channel", &channel); status |= vsi_nn_kernel_gpu_add_param( node, "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8); status |= vsi_nn_kernel_gpu_add_param( @@ -655,21 +794,24 @@ static vsi_status _query_kernel vsi_nn_kernel_t* kernel, const vsi_nn_kernel_param_t * params, int32_t axis, - int32_t is_2d + int32_t is_2d, + int32_t is_ex_rev ) { vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d); + key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d); - for( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) + for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) { if ( cumsum_map[i].key == key ) { @@ -716,17 +858,35 @@ static vsi_nn_kernel_node_t _setup int32_t axis_new = 0; int32_t 
is_2d = 0; uint32_t rs_dim = 2; - int32_t i = 0; + uint32_t i = 0; + int32_t is_ex_or_rev = exclusive || reverse; - vsi_nn_kernel_optimize_softmax_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, - shapes[0], &rs_dim, &axis_new); - if (exclusive || reverse || rs_dim > 3) + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + if (axis < 0) + { + axis_new = 0; + shapes[0][0] = 1; + shapes[0][1] = 1; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + shapes[0][0] *= inputs[0]->attr.size[i]; + } + rs_dim = 2; + } + else + { + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rs_dim, &axis_new); + } + if (rs_dim > 3) { return NULL; } - if (rs_dim == 2) + if (rs_dim == 2 && is_ex_or_rev == 0) { is_2d = 1; } @@ -736,7 +896,7 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shapes[0], (vsi_size_t)rs_dim ); - status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d); + status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -754,6 +914,14 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[2] ); vsi_nn_kernel_scalar_release( &tmp_params[3] ); vsi_nn_kernel_scalar_release( &tmp_params[4] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &outputs[0]->attr.dtype); + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } } } diff --git a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c index de5aa8326..9d464623f 100644 --- a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c @@ -152,6 +152,8 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -363,7 +365,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -422,6 +426,9 @@ static vsi_nn_kernel_node_t _setup int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t blk_flg = block_size == 2 ? 
1 : 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index 45c4073fd..a2f10ce82 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -197,6 +197,8 @@ DEF_KERNEL_INITIALIZER(_depthwise_conv1d_initializer) vx_context ctx = vxGetContext((vx_reference)node); uint64_t pack_key = 0; + VSI_UNREFERENCED(param_size); + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); CHECK_STATUS_FAIL_GOTO(status, final); @@ -729,7 +731,9 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[0] = inputs[0]; - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { shape[0] = inputs[1]->attr.size[0]; shape[1] = 1; @@ -811,7 +815,9 @@ static vsi_nn_kernel_node_t _setup } final: - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { vsi_nn_ReleaseTensor( &reshape_tensors[1] ); } diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c index ee5faf1c3..aa781c8d8 100644 --- a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c @@ -122,6 +122,8 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) int32_t input1_ZP = 0; int32_t input0_ZP = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c index bc849b4da..5359233ba 100644 --- a/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/detect_post_nms_evis.c @@ -145,7 +145,13 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { - + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); return NULL; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index 23b1433a7..5d383a15e 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -223,6 +223,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) float beta = 0; uint32_t pack_key; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -467,7 +469,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e 
output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -518,6 +520,9 @@ static vsi_nn_kernel_node_t _setup float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); diff --git a/src/tim/vx/internal/src/kernel/evis/erf_evis.c b/src/tim/vx/internal/src/kernel/evis/erf_evis.c index a4203164a..ebc8ad8f2 100644 --- a/src/tim/vx/internal/src/kernel/evis/erf_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/erf_evis.c @@ -136,6 +136,8 @@ DEF_KERNEL_INITIALIZER(_erf_initializer) float outputZP = 0; uint32_t pack_key; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -376,6 +378,10 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_bool ret = FALSE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); diff --git a/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c b/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c index 627e48b58..eec0f08e0 100644 --- a/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c @@ -97,7 +97,10 @@ DEF_KERNEL_INITIALIZER(_extra_ending_initializer) vsi_nn_kernel_tensor_attr_t * attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); out_shape = attr->shape; gpu_param.global_scale[0] = 8; @@ -136,6 +139,8 @@ static vsi_status _query_kernel uint32_t key = 0; uint32_t i = 0; + VSI_UNREFERENCED(inputs); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); key = EXTRA_ENDING_HASH_KEY( out_dtype ); @@ -186,6 +191,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; int32_t i = 0; + VSI_UNREFERENCED(params); + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, shapes[0], &rank[0]); vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, diff --git a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c index be1bd1714..86d4d585b 100644 --- a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c @@ -120,7 +120,7 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) {0, 0, 0}, {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor input0 = (vx_tensor)param[0]; vx_tensor input1 = (vx_tensor)param[1]; vx_tensor output = (vx_tensor)param[2]; @@ -139,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) float in1Tail = 0; float outZp = 0; + VSI_UNREFERENCED(param_size); + input0_attr = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)input0 ); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -402,6 +404,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(params); + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c index 0554d1124..07f159311 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_elements_evis.c @@ -51,18 +51,31 @@ typedef enum #define STR(a) #a // Add kernel hashtable here -#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D ) \ - (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 )) +#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D , BEYOND_MAXWIDTH) \ + (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ) |\ + (BEYOND_MAXWIDTH << 28)) #define PACK_KERNEL_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ - { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 0), \ CVIVANTE_NAMESPACE("evis.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ _GATHER_ELEMENTS_KERNEL_SOURCE} #define PACK_KERNEL_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ - { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 0), \ CVIVANTE_NAMESPACE("evis.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ _GATHER_ELEMENTS_KERNEL_SOURCE} +#define PACK_KERNEL_BEYOND_MAXWIDTH_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 1), \ + CVIVANTE_NAMESPACE("evis.gather_elements_beyond_maxwidth_axis"STR(AXIS)\ + "_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + _GATHER_ELEMENTS_KERNEL_SOURCE} + +#define PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 1), \ + CVIVANTE_NAMESPACE("evis.gather_elements_beyond_maxwidth_axis"STR(AXIS)\ + "_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + _GATHER_ELEMENTS_KERNEL_SOURCE} + typedef struct { uint32_t key; @@ -94,6 +107,32 @@ static const _kernel_map_type _gather_elements_kernel_map[] = PACK_KERNEL_2D_MAP( 1, I16, I32, I16 ), PACK_KERNEL_2D_MAP( 1, I8, I32, I8 ), PACK_KERNEL_2D_MAP( 1, U8, I32, U8 ), + + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 0, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 1, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_MAP( 2, U8, I32, U8 ), + + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I16, I32, I16 
), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, U8, I32, U8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, F16, I32, F16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I16, I32, I16 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I8, I32, I8 ), + PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, U8, I32, U8 ), }; @@ -128,26 +167,48 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer) {0, 0, 0}, {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr0 = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr1 = NULL; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_size_array_t * out_shape = NULL; int32_t axis = 0; int32_t axis_size = 0; - - input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); - CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + uint32_t width0 = 0; + uint32_t height0 = 0; + uint32_t width1 = 0; + uint32_t height1 = 0; + uint32_t width_out = 0; + uint32_t height_out = 0; + uint32_t depth0 = 0; + uint32_t depth1 = 0; + + VSI_UNREFERENCED(param_size); + + input_attr0 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr0, "Create tensor attr buffer fail.", final ); + input_attr1 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( input_attr1, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); out_shape = output_attr->shape; - axis_size = (int32_t)input_attr->shape->data[axis]; + axis_size = (int32_t)input_attr0->shape->data[axis]; if (axis == 0) { gpu_param.global_scale[0] = 4; } + width0 = (uint32_t)input_attr0->shape->data[0]; + height0 = (uint32_t)input_attr0->shape->data[1]; + depth0 = input_attr0->shape->size > 2 ? (uint32_t)input_attr0->shape->data[2] : 1; + width1 = (uint32_t)input_attr1->shape->data[0]; + height1 = (uint32_t)input_attr1->shape->data[1]; + depth1 = input_attr1->shape->size > 2 ? (uint32_t)input_attr1->shape->data[2] : 1; + width_out = (uint32_t)output_attr->shape->data[0]; + height_out = (uint32_t)output_attr->shape->data[1]; + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; gpu_param.global_size[0] = gpu_align_p2( (out_shape->data[0] + gpu_param.global_scale[0] - 1) @@ -157,13 +218,31 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer) / gpu_param.global_scale[1]); gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + if (width0 >= GPU_TENSOR_MAX_WIDTH || + width1 >= GPU_TENSOR_MAX_WIDTH || + height0 >= GPU_TENSOR_MAX_WIDTH || + height1 >= GPU_TENSOR_MAX_WIDTH || + depth0 >= GPU_TENSOR_MAX_WIDTH || + depth1 >= GPU_TENSOR_MAX_WIDTH) + { + gpu_param.global_scale[0] = 1; + gpu_param.global_size[0] = out_shape->data[0]; + } + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); status |= vsi_nn_kernel_gpu_add_param( node, "axis_size", &axis_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "width0", &width0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "height0", &height0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "width1", &width1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "height1", &height1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "width_out", &width_out ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_out", &height_out ); CHECK_STATUS_FAIL_GOTO(status, final ); final: #define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } - SAFE_FREE_TENSOR_ATTR(input_attr); + SAFE_FREE_TENSOR_ATTR(input_attr0); + SAFE_FREE_TENSOR_ATTR(input_attr1); SAFE_FREE_TENSOR_ATTR(output_attr); return status; } /* _gather_elements_initializer() */ @@ -190,6 +269,9 @@ static vsi_status _query_kernel vx_param_description_t * param_def = _gather_elements_kernel_param_def; vx_kernel_initialize_f initializer = _gather_elements_initializer; int32_t img_2d = (outputs[0]->attr.dim_num < 3 || outputs[0]->attr.size[2] == 1) ? 1 : 0; + int32_t beyond_maxwidth = 0; + vsi_size_t depth0 = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + vsi_size_t depth1 = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; uint32_t key; uint32_t i; @@ -207,7 +289,17 @@ static vsi_status _query_kernel out_dtype = F16; } - key = GATHER_ELEMENTS_HASH_KEY( axis, in0_dtype, in1_dtype, out_dtype, img_2d ); + if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH || + inputs[0]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH || + inputs[1]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH || + inputs[1]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH || + depth0 >= GPU_TENSOR_MAX_WIDTH || + depth1 >= GPU_TENSOR_MAX_WIDTH) + { + beyond_maxwidth = 1; + } + + key = GATHER_ELEMENTS_HASH_KEY( axis, in0_dtype, in1_dtype, out_dtype, img_2d, beyond_maxwidth ); for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index 499bc5a28..ba7ad75f4 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -294,6 +294,8 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -491,6 +493,8 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -692,7 +696,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + 
VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -768,6 +774,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3; int32_t i = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (axis == 0) { status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, batch_dims, 0, &is_array); diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index 355e90857..91c8f1744 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -148,7 +148,7 @@ static vsi_status get_gather_nd_tensor_reshape_size vsi_size_t block_size, uint32_t coordDim, int32_t* newDim, - int32_t batch_dims + uint32_t batch_dims ) { vsi_status status = VSI_FAILURE; @@ -175,17 +175,23 @@ static vsi_status get_gather_nd_tensor_reshape_size if (batch_dims) { + int32_t rank = 1; for (i = 0; i < offset; i++) { sizes[0] *= input_size[i]; } - for (i = 0; i < coordDim; i++) + for (i = 0; i < coordDim - 1; i++) { - sizes[i + 1] = input_size[i + offset]; + sizes[rank++] = input_size[i + offset]; } - newDim[0] = coordDim == 1 ? 2 : 3; + for (i = 0; i < batch_dims; i++) + { + sizes[rank] *= input_size[dims_num - i - 1]; + } + + newDim[0] = rank + 1; } else { @@ -215,13 +221,27 @@ static vsi_status get_gather_nd_tensor_reshape_size } else // indices&output reshape { - if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH && batch_dims == 0) { sizes[0] = block_size; sizes[1] = elementCnt / block_size; status = VSI_SUCCESS; newDim[0] = 2; } + else if (batch_dims > 0) + { + vsi_size_t batch_cnt = 1; + for (i = 0; i < batch_dims; ++i) + { + batch_cnt *= input_size[dims_num - i - 1]; + } + + sizes[0] = block_size; + sizes[1] = (elementCnt / block_size) / batch_cnt; + sizes[2] = batch_cnt; + status = VSI_SUCCESS; + newDim[0] = 3; + } } #undef VSI_NN_MAX_IMAGE_WIDTH @@ -248,15 +268,18 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - int32_t block_size = 0; - int32_t indices_num = 1; - int32_t src0ZP = 0; - float src0Scale = 1; - int32_t dstZP = 0; - float dstScale = 1; + int32_t block_size = 0; + int32_t indices_num = 1; + int32_t batch_num = 1; + int32_t src0ZP = 0; + float src0Scale = 1; + int32_t dstZP = 0; + float dstScale = 1; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -302,6 +325,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) } indices_num = (int32_t)(attr[1]->shape->data[1]); + batch_num = (int32_t)(attr[1]->shape->size > 2 ? 
attr[1]->shape->data[2] : 1); gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; @@ -310,7 +334,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = indices_num; - gpu_param.global_size[2] = 1; + gpu_param.global_size[2] = batch_num; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); @@ -422,7 +446,8 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_nn_kernel_coord_type_e coord_type = _error; uint32_t key = 0; - int i = 0; + int32_t batch_flg = batch_dims > 0 ? 1 : 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -448,7 +473,7 @@ static vsi_status _query_kernel coord_type = _3D; } - key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_dims ); + key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg ); for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) { @@ -495,6 +520,9 @@ static vsi_nn_kernel_node_t _setup int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims); status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims); status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims); diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c index 8a9971fc6..ce13b84f7 100644 --- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -246,6 +246,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) float sum_x2_tail1 = 1; float work_item_pixels = 1; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -381,6 +383,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_means_initializer) int32_t chn = 0; int32_t group_stride = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -450,6 +454,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) int32_t height = 0, width = 0, chn = 0; int32_t is2D = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); @@ -776,6 +782,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t group_size = inputs[0]->attr.size[2] / group_num; float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + // Check if gpu can support the size if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -898,11 
+907,11 @@ static vsi_nn_kernel_node_t _setup if (node) { uint32_t index = 0; - int32_t pStride = 0; + float pStride = 0; if (!is2D_flg) { - pStride = (int32_t)(inputs[1]->attr.size[0] / new_shape[1]); - rSpaceOrg = 1.0f / (new_shape[0] / pStride); + pStride = (float)inputs[1]->attr.size[0] / (float)new_shape[1]; + rSpaceOrg = pStride < 1.0f ? 0.0f : 1.0f / (new_shape[0] / pStride); } node_params[index++] = rs_input; node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; @@ -912,7 +921,7 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rSpaceOrg ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pStride ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &pStride ); status = vsi_nn_kernel_node_pass_param( node, node_params, _GROUPNORM_PARAM_NUM ); diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c index 9b5a2c1fb..1bfdb49fd 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c @@ -227,6 +227,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL }; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -635,7 +637,7 @@ static vsi_status _query_kernel int32_t input_category, int32_t input_layout, int32_t use_cudnn, - int32_t* param_count, + vsi_size_t* param_count, int32_t* input_count, int32_t* output_count /* Add extra params */ @@ -756,7 +758,7 @@ static vsi_nn_kernel_node_t _setup int32_t k = 0; vsi_size_t input_size = inputs[0]->attr.size[0]; vsi_size_t batch = inputs[0]->attr.size[1]; - int32_t param_count = 0; + vsi_size_t param_count = 0; int32_t input_count = 0; int32_t output_count = 0; int32_t gate_activation = 0; @@ -765,6 +767,8 @@ static vsi_nn_kernel_node_t _setup int32_t use_cudnn = vsi_nn_kernel_param_get_int32( params, "use_cudnn_implementation" ); int32_t input_layout = vsi_nn_kernel_param_get_int32( params, "input_layout" ); + VSI_UNREFERENCED(input_num); + gate_activation = vsi_nn_kernel_param_get_int32( params, "gate_activation" ); candidate_activation = vsi_nn_kernel_param_get_int32( params, "candidate_activation" ); @@ -783,7 +787,9 @@ static vsi_nn_kernel_node_t _setup if( VSI_SUCCESS == status) { _inputs = (vsi_nn_tensor_t**)malloc(input_count * sizeof(vsi_nn_tensor_t**)); + CHECK_PTR_FAIL_GOTO( _inputs, "Create buffer fail.", final ); node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count); + CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final ); if (use_cudnn) { @@ -896,6 +902,7 @@ static vsi_nn_kernel_node_t _setup } } +final: vsi_nn_safe_free(_inputs); vsi_nn_safe_free(node_params); diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c index 75b6136e1..9ad5852c3 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c +++ 
b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c @@ -110,7 +110,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer) { #define _PACK_A_GRUCELL_ACTIVATION_SMA_KEY( IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE ) \ (( IN1_TYPE << 24) | ( IN1_TYPE << 16) | ( IN0_TYPE << 8) | ( OUT_TYPE)) - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer) vsi_size_array_t *output_shape = NULL; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( attr[0], "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -302,6 +304,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool ret = FALSE; vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + VSI_UNREFERENCED(params); + for (i = 0; i < _IO_NUM; i++) { shapes_ptr[i] = shapes[i]; diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c index 40e22e981..7adf6bfb7 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -124,6 +124,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) #define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ (hstate_type | (fc_type << 8) | (output_type << 16)) + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_IN_CNT + GRUCELL_ACT_Z_H_OUT_OUTPUT]; hstate_out = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_IN_CNT + GRUCELL_ACT_Z_H_OUT_HSTATE]; diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c index 85220002f..afd872352 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c @@ -117,6 +117,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer) #define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ (hstate_type | (fc_type << 8) | (output_type << 16)) + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[3]; for (i = 0; i < 2; i++) diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c index 0c35aeaf9..60d932b80 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c @@ -46,17 +46,19 @@ typedef enum _grucell_nn_activation_type_e { SIGMOID = VSI_NN_ACT_SIGMOID, HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, + TANH = VSI_NN_ACT_TANH, }grucell_nn_activation_type_e; #define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation" // Add kernel hashtable here -#define GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ - (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) -#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ - { GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ -CVIVANTE_NAMESPACE("evis.grucell_reset_after_activation_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ 
-_GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE } +#define GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT, ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 ) | ( ACT << 24 )) +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT, ACT ) \ + { GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT, ACT ), \ + CVIVANTE_NAMESPACE("evis.grucell_reset_after_activation_"\ + #HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#ACT"_"#REC_ACT), \ + _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE } typedef struct { @@ -68,10 +70,14 @@ typedef struct static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] = { // Register kernel here - PACK_KERNEL_MAP( U8, F16, U8, SIGMOID ), - PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ), - PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ), - PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, TANH ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, TANH ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, TANH ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, TANH ), + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ), }; @@ -123,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer) #define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ (hstate_type | (fc_type << 8) | (output_type << 16)) + VSI_UNREFERENCED(param_size); + output = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_IN_CNT + GRUCELL_ACT_OUT_OUTPUT]; hstate_out = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_IN_CNT + GRUCELL_ACT_OUT_H_STATE]; @@ -297,7 +305,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t recurrent_activation + int32_t recurrent_activation, + int32_t activation ) { vsi_status status = VSI_FAILURE; @@ -309,14 +318,15 @@ static vsi_status _query_kernel vx_param_description_t * param_def = _grucell_reset_after_activation_kernel_param_def; vx_kernel_initialize_f initializer = _grucell_reset_after_activation_initializer; - uint32_t key; - uint32_t i; + uint32_t key = 0; + uint32_t i = 0; hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_H_STATE]->attr.dtype.vx_type ); fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_I_FC_Z]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dtype.vx_type ); - key = GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + key = GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, + recurrent_activation, activation ); for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) { @@ -362,12 +372,7 @@ static vsi_nn_kernel_node_t _setup int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); - if( activation != VSI_NN_ACT_TANH ) - { - return NULL; - } - - status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + status = _query_kernel( kernel, inputs, outputs, recurrent_activation, activation ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c 
b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index 48af7f85a..7e5a84650 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -246,6 +246,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) float sum_x2_tail1 = 1; float work_item_pixels = 1; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -402,6 +404,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_means_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -452,6 +456,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) vsi_size_array_t * input_shape = NULL; vx_int32 width = 0, chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -808,6 +814,10 @@ static vsi_nn_kernel_node_t _setup vsi_size_t batch = 1; vsi_bool ret = FALSE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + memcpy(new_shape, inputs[0]->attr.size, sizeof(inputs[0]->attr.size)); if (new_shape[0] >= GPU_TENSOR_MAX_WIDTH || new_shape[1] >= GPU_TENSOR_MAX_WIDTH) diff --git a/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c b/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c index 00c31c319..ce097d624 100644 --- a/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c @@ -168,6 +168,8 @@ DEF_KERNEL_INITIALIZER(_l1norm_initializer_axis) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis); @@ -197,12 +199,12 @@ DEF_KERNEL_INITIALIZER(_l1norm_initializer_axis) } else if (axis == 1) { - gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];; + gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0]; gpu_param.global_size[1] = depth; } else if (axis == 2) { - gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];; + gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0]; gpu_param.global_size[1] = height; } diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c index be4a29953..068257c43 100644 --- a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c @@ -139,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) int32_t axis2Dflg = 0; int32_t inputWidth = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); 
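Editor's note: the VSI_UNREFERENCED(param_size) / VSI_UNREFERENCED(input_num) lines added throughout these initializers and _setup functions only silence unused-parameter warnings for arguments kept by the fixed kernel signatures. A minimal sketch of such a macro, assuming the conventional definition (the project's actual definition lives in its own headers and may differ):

    #ifndef VSI_UNREFERENCED
    #define VSI_UNREFERENCED(x)  ((void)(x))
    #endif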
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index 966a6cdd8..0a477c525 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -250,6 +250,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) float inv_multiplier = 0; int32_t height = 0, width = 0, chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); @@ -539,6 +541,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_axis01_sums_initializer) int32_t height = 0; int32_t chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -653,6 +657,8 @@ DEF_KERNEL_INITIALIZER(_layernorm_axis01_initializer) vx_uint32 group_num = 0; vx_int32 height = 0, width = 0, chn = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); @@ -787,7 +793,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int32_t i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); @@ -832,7 +838,7 @@ static vsi_status _query_kernel_axis01 vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); @@ -917,6 +923,9 @@ static vsi_nn_kernel_node_t _setup_axis01 uint32_t axis_size = 0; uint32_t rank_in = 0, rank_para = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + status = vsi_nn_kernel_optimize_tensor_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, axis_num, new_shape[0], &rank_in, new_axis, &axis_size); @@ -942,6 +951,7 @@ static vsi_nn_kernel_node_t _setup_axis01 rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[0], rank_in); kernel_sums = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + CHECK_PTR_FAIL_GOTO( kernel_sums, "Create kernel fail.", final ); // Assign unique_id kernel_sums->unique_id = kernel->unique_id; @@ -961,6 +971,7 @@ static vsi_nn_kernel_node_t _setup_axis01 attr.size[3] = new_shape[0][3]; attr.dim_num = rank_in; tensor_sums = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( tensor_sums, "Create tensor fail.", final ); status = _query_kernel_axis01(inputs, outputs, kernel_sums, kernel); if ( VSI_SUCCESS != status ) @@ -972,6 +983,7 @@ static vsi_nn_kernel_node_t _setup_axis01 ** sum(x) and sumsq(x*x) */ sums_node = vsi_nn_kernel_create_node(graph, kernel_sums); + CHECK_PTR_FAIL_GOTO( sums_node, "Create kernel 
fail.", final ); if (sums_node) { sums_node_params[0] = rs_input; @@ -992,6 +1004,7 @@ static vsi_nn_kernel_node_t _setup_axis01 } node = vsi_nn_kernel_create_node( graph, kernel ); + CHECK_PTR_FAIL_GOTO( node, "Create kernel fail.", final ); if (node) { uint32_t index = 0; @@ -1065,6 +1078,9 @@ static vsi_nn_kernel_node_t _setup_axis0 uint32_t rank_in = 0; int32_t is_img2d_input = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + status = vsi_nn_kernel_optimize_tensor_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, axis_num, new_shape[0], &rank_in, new_axis, &axis_size); diff --git a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c index 3ee30282d..4e7b8a087 100644 --- a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c @@ -166,6 +166,8 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) float rlogE = (float)(log10(2.0f) / log10(exp(1.0f))); float scaleLogE = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -482,7 +484,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -529,6 +531,9 @@ static vsi_nn_kernel_node_t _setup int32_t axis = 0; float beta = 1.0f; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis = vsi_nn_kernel_param_get_int32(params, "axis"); beta = vsi_nn_kernel_param_get_float32(params, "beta"); diff --git a/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c index 890f7bc78..d59d851ed 100644 --- a/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c @@ -98,7 +98,7 @@ DEF_KERNEL_INITIALIZER(_logical_not_initializer) size_t param_size ) { - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -112,6 +112,8 @@ DEF_KERNEL_INITIALIZER(_logical_not_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -226,6 +228,8 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret = FALSE; + VSI_UNREFERENCED(params); + ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); diff --git a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c index 7e5476b74..54713cb08 100644 --- a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c @@ -109,7 +109,7 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer) size_t param_size ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. 
gpu_param_t gpu_param = { 3, @@ -125,6 +125,8 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer) vsi_nn_kernel_tensor_attr_t *input_attr = NULL, *output_attr = NULL; vsi_size_array_t *output_shape = NULL; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -308,11 +310,11 @@ static vsi_nn_kernel_node_t _setup outputs[0], shapes[2], new_rank ); #define _swap_tensor(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } if (shapes[1][3] > shapes[0][3] && new_rank == 4) { diff --git a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c index a99acc6cd..95232b9d1 100644 --- a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c @@ -65,7 +65,8 @@ typedef enum _LSTMUNIT_nn_activation_e #define LSTMUNIT_ACTIVATION_HASH_KEY(_is_ln, _is_cifg, _is_proj, _is_hybrid, _is_peephole, \ _input_type, _output_type, _cell_type, _rec_act) \ ((_is_ln << 31) | (_is_cifg << 30) | (_is_proj << 29) | (_is_hybrid << 28) | (_is_peephole << 27) \ -| (_input_type << 23) | (_output_type << 19) | (_cell_type << 15) | (_rec_act << 10)) +| (((uint32_t)_input_type) << 23) | (((uint32_t)_output_type) << 19) | (((uint32_t)_cell_type) << 15) \ +| (_rec_act << 10)) #define LSTMUNIT_ACTIVATION_SOURCE_NAME(_ln_cifg_proj_hybrid_, _input_type) \ "lstmunit_activation_"#_ln_cifg_proj_hybrid_"_"#_input_type diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index 6e4ee41b1..f5dc60b1e 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -58,9 +58,12 @@ __BEGIN_DECLS #define KERNEL_SOURCE_14 "matrixmul_f16i16_i16" #define KERNEL_SOURCE_15 "matrixmul_bf16" #define KERNEL_SOURCE_16 "matrixmul_u8i16_i16" +#define KERNEL_SOURCE_17 "matrixmul_merge" +#define KERNEL_SOURCE_18 "matrixmul_cross" +#define KERNEL_SOURCE_19 "matrixmul_cross_i16" -#define HASH_MATRIX_MUL_KEY(_input0_type, _input1_type, _output_type, _trans_a, _trans_b) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_trans_a << 4) | (_trans_b)) +#define HASH_MATRIX_MUL_KEY(_type0, _type1, _type2, _trans_a, _trans_b, _cross) \ + ((_type0 << 24) | (_type1 << 16) | (_type2 << 8) | (_trans_a << 4) | (_trans_b << 2) | (_cross)) #define HASH_MATRIX_MUL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) @@ -71,21 +74,37 @@ __BEGIN_DECLS #define HASH_MATRIX_MUL_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.gemm_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) +#define HASH_MATRIX_MUL_SH_KERNEL_CROSS_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_cross") + +#define HASH_MATRIX_MUL_SH_KERNEL_MERGE_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.gemm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_merge") + #define TENSOR_MATRIX_MUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ - { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0), \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0, 0), \ HASH_MATRIX_MUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_MATRIX_MUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, 
SOURCE) \ - { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 1), \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 1, 0), \ HASH_MATRIX_MUL_TRANSB_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_MATRIX_MUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ - { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1, 0), \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1, 0, 0), \ HASH_MATRIX_MUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ SOURCE }, +#define TENSOR_MATRIX_MUL_CROSS_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0, 1), \ + HASH_MATRIX_MUL_SH_KERNEL_CROSS_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_MATRIX_MUL_MERGE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_MATRIX_MUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0, 0, 2), \ + HASH_MATRIX_MUL_SH_KERNEL_MERGE_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + static const struct { uint32_t key; @@ -135,6 +154,14 @@ static const struct { TENSOR_MATRIX_MUL_TRANSA_KERNELS(F16, F16, F16, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, I16, I16, KERNEL_SOURCE_7) + TENSOR_MATRIX_MUL_MERGE_KERNELS(U8, U8, U8, KERNEL_SOURCE_17) + TENSOR_MATRIX_MUL_MERGE_KERNELS(I8, I8, I8, KERNEL_SOURCE_17) + TENSOR_MATRIX_MUL_MERGE_KERNELS(I16, I16, I16, KERNEL_SOURCE_19) + TENSOR_MATRIX_MUL_MERGE_KERNELS(F16, F16, F16, KERNEL_SOURCE_17) + TENSOR_MATRIX_MUL_CROSS_KERNELS(U8, U8, U8, KERNEL_SOURCE_18) + TENSOR_MATRIX_MUL_CROSS_KERNELS(I8, I8, I8, KERNEL_SOURCE_18) + TENSOR_MATRIX_MUL_CROSS_KERNELS(I16, I16, I16, KERNEL_SOURCE_19) + TENSOR_MATRIX_MUL_CROSS_KERNELS(F16, F16, F16, KERNEL_SOURCE_18) }; /* @@ -154,7 +181,35 @@ static vx_param_description_t _matrix_mul_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; + +static vx_param_description_t _matrix_mul_kernel_cross_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; #define _MATRIX_MUL_PARAM_NUM _cnt_of_array( _matrix_mul_kernel_param_def ) +#define _MATRIX_MUL_CROSS_PARAM_NUM _cnt_of_array( _matrix_mul_kernel_cross_param_def ) /* * Kernel initializer 
@@ -180,7 +235,10 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) int32_t transB = 0; int32_t width = 0; int32_t height = 0; - int32_t chn = 0; + vsi_size_t chn = 0; + int32_t a_depth = 0; + int32_t b_depth = 0; + vsi_size_t outer = 0; int32_t src0ZP = 0; float src0Scale = 0; @@ -204,6 +262,8 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) uint32_t evis2 = 0; vx_context ctx = vxGetContext((vx_reference)node); vx_hardware_caps_params_t hw_param; + + VSI_UNREFERENCED(param_size); memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -294,22 +354,59 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP); inOutScale = src0Scale * src1Scale / dstScale; - if ((attr[0]->shape->size > attr[1]->shape->size) || - (attr[0]->shape->data[2] > attr[1]->shape->data[2] - && attr[0]->shape->size > 2 && attr[1]->shape->size > 2)) + a_depth = (int32_t)(attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1); + b_depth = (int32_t)(attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1); + + if (b_depth == 1) { bc2zero = 1; } - else if ((attr[1]->shape->size > attr[0]->shape->size) || - (attr[1]->shape->data[2] > attr[0]->shape->data[2] - && attr[0]->shape->size > 2 && attr[1]->shape->size > 2)) + if (a_depth == 1) { ac2zero = 1; } width = (int32_t)(attr[2]->shape->data[0]); height = (int32_t)(attr[2]->shape->data[1]); - chn = (int32_t)(attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1); + chn = (attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1); + + if (((attr[0]->shape->size == 4 && attr[1]->shape->size == 3) || + (attr[0]->shape->size == 3 && attr[1]->shape->size == 4)) + && attr[0]->shape->data[2] > 1 && attr[1]->shape->data[2] > 1 + && chn != attr[0]->shape->data[2] * attr[1]->shape->data[2]) + { + vsi_size_t iter = attr[0]->shape->data[2] * attr[1]->shape->data[2] / chn; + if (attr[0]->shape->size == 4) + { + ac2zero = 1; + bc2zero = 0; + chn = attr[1]->shape->data[2]; + outer = attr[0]->shape->data[2] / iter; + } + else + { + ac2zero = 0; + bc2zero = 1; + chn = attr[0]->shape->data[2]; + outer = attr[1]->shape->data[2] / iter; + } + } + else if (attr[0]->shape->size == 4 && attr[1]->shape->size == 3 + && attr[0]->shape->data[2] != 1 && attr[1]->shape->data[2] != 1) + { + ac2zero = 1; + bc2zero = 0; + chn = attr[1]->shape->data[2]; + outer = attr[0]->shape->data[2]; + } + else if (attr[1]->shape->size == 4 && attr[0]->shape->size == 3 + && attr[0]->shape->data[2] != 1 && attr[1]->shape->data[2] != 1) + { + ac2zero = 0; + bc2zero = 1; + chn = attr[0]->shape->data[2]; + outer = attr[1]->shape->data[2]; + } gpu_param.global_scale[0] = 4; gpu_param.global_scale[1] = 4; @@ -319,7 +416,7 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = gpu_align_p2((height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1], 4); - gpu_param.global_size[2] = chn; + gpu_param.global_size[2] = (size_t)chn; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); @@ -683,6 +780,12 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) uniI16MulI16SumtoI32_16x1.data[i] = multiplierZpB; } + if (outer) + { + status = vsi_nn_kernel_gpu_add_param( node, "outer", &outer ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + switch( pack_key ) { case _PACK_SELECT_KEY( U8, U8, F16, 0, 1, 0 ): @@ -790,16 +893,19 @@ 
DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniI16MulI16SumtoI32_16x1", &uniI16MulI16SumtoI32_16x1 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniI16MulI16SumtoI32B_16x1", &uniI16MulI16SumtoI32B_16x1 ); status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); - status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inScaledivOut ); - status |= vsi_nn_kernel_gpu_add_param( node, "inout_beta", &inout_beta ); + if (outer == 0) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniI16MulI16SumtoI32_16x1", &uniI16MulI16SumtoI32_16x1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniI16MulI16SumtoI32B_16x1", &uniI16MulI16SumtoI32B_16x1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inScaledivOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "inout_beta", &inout_beta ); + } } break; case _PACK_SELECT_KEY( F16, U8, F16, 0, 0, 0 ): @@ -1093,6 +1199,308 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) return status; } /* _matrix_mul_initializer() */ +DEF_KERNEL_INITIALIZER(_matrix_mul_cross_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + int32_t transA = 0; + int32_t transB = 0; + int32_t width = 0; + int32_t height = 0; + int32_t axis_size = 0; + + int32_t src0ZP = 0; + float src0Scale = 0; + int32_t src1ZP = 0; + float src1Scale = 0; + float dstZP = 0; + float dstScale = 0; + + uint32_t pack_key = 0; + + float mulKIn0In1Zp = 0; + float inOutScale = 0; + int32_t K = 0; + + uint32_t evis2 = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_t hw_param; + + VSI_UNREFERENCED(param_size); + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); + status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + if (hw_param.evis2 == TRUE) + { + evis2 = 1; + } + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &transA); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &transB); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &K); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &axis_size); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + src0ZP = 
attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + src1ZP = attr[1]->asymm.zero_point; + src1Scale = attr[1]->asymm.scale; + dstZP = (float)attr[2]->asymm.zero_point; + dstScale = attr[2]->asymm.scale; + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + src0ZP = 0; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + src0Scale = 1; + src0ZP = 0; + } + + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[1]->dfp.fl > 0) + { + src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); + } + else + { + src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); + } + src1ZP = 0; + } + else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + src1Scale = 1; + src1ZP = 0; + } + + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[2]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + dstScale = 1.0f / dstScale; + dstZP = 0.0f; + } + else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) + { + dstScale = 1; + dstZP = 0.0f; + } + + mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP); + inOutScale = src0Scale * src1Scale / dstScale; + + width = (int32_t)(attr[2]->shape->data[0]); + height = (int32_t)(attr[2]->shape->data[1]); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 4; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = gpu_align_p2((height + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1], 4); + gpu_param.global_size[2] = (size_t)axis_size; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE, TRANSA, TRANSB, EVIS2) \ + ((IN0_TYPE << 24) | (IN1_TYPE << 16) | (OUT_TYPE << 8) | (TRANSA << 4) | (TRANSB << 2) | (EVIS2)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype, transA, transB, evis2); + { + uint16_t M0 = 0; + uint16_t M1 = 0; + int32_t postShift0 = 0; + int32_t postShift1 = 0; + uint32_t multiplierA = 0; + uint32_t multiplierB = 0; + gpu_dp_inst_t uniGemmU8U8MulZptoFp32_8x4 = {{ + 0xaaaaaaaa, 0xaaaaaaaa, // TCfg + 0xf02a0600, 0x2a8620e0, 0x0640e8f2, 0x60f0f42b, 0xf8f62b86, // BinSelect + 0x00000700, // AccumType, ConstantType, and PostShift + 0x03020302, 0x03020302, 0x03020302, 0x03020302, + 0x03020302, 0x03020302, 0x03020302, 0x03020302 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmU8U8toFp32Block4_4x4 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x32103210, 0x32103210, // ABin + 0x55555555, // BSelt + 0xd951c840, 0xfb73ea62, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGemmU8F16toF32Lo_4x4b = {{ + 
0x55555555, // TCfg + 0x50505050, // ASelt + 0x51514040, 0x73736262, // ABin + 0x00000000, // BSelt + 0x32103210, 0x32103210, // BBin + 0x00000000, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUint8SubZpToFp32B_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + float reScaleOut = 1 / dstScale; + uint32_t multiplierU8ZpAB = (src0ZP << 24) | (src1ZP << 16) | (src0ZP << 8) | (src1ZP); + int32_t i = 8; + gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postShift0); + gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postShift1); + + multiplierA = (M0 << 16) | M0; + multiplierB = (M1 << 16) | M1; + + uniConvertUint8SubZpToFp32_4x4.data[7] |= (postShift0 & 0x1F); + uniConvertUint8SubZpToFp32B_4x4.data[7] |= (postShift1 & 0x1F); + for( i = 8; i < 16; i += 2) + { + uniConvertUint8SubZpToFp32_4x4.data[i] = multiplierA; + uniConvertUint8SubZpToFp32B_4x4.data[i] = multiplierB; + } + for( i = 8; i < 16; i++) + { + uniGemmU8U8MulZptoFp32_8x4.data[i] = multiplierU8ZpAB; + } + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, U8, U8, 0, 0, 1 ): + case _PACK_SELECT_KEY( I8, I8, I8, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8U8toFp32Block4_4x4", &uniGemmU8U8toFp32Block4_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8U8MulZptoFp32_8x4", &uniGemmU8U8MulZptoFp32_8x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "mulKIn0In1Zp", &mulKIn0In1Zp ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 0 ): + case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); + } + break; + case _PACK_SELECT_KEY( F16, F16, F16, 0, 0, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniGemmU8F16toF32Lo_4x4b", &uniGemmU8F16toF32Lo_4x4b ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } 
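Editor's note on the new _matrix_mul_cross_initializer above: mulKIn0In1Zp and inOutScale are the usual zero-point expansion terms of an asymmetric-quantized GEMM. The scalar reference below is a hypothetical illustration of the arithmetic the shader approximates (the function name, data layout, and the use of plain K are assumptions; the vector kernel pads K up to a multiple of 4, which is why the initializer uses (K + 3) / 4 * 4):

    /* One output element of a uint8 x uint8 GEMM, requantized with the same
     * terms the initializer programs into the shader. */
    static float ref_gemm_u8_elem(const uint8_t *a_row, const uint8_t *b_col,
                                  int32_t K, int32_t src0ZP, int32_t src1ZP,
                                  float inOutScale, float dstZP)
    {
        int32_t acc = 0, sum_a = 0, sum_b = 0;
        int32_t k = 0;
        float real = 0.0f;
        for (k = 0; k < K; k++)
        {
            acc   += (int32_t)a_row[k] * (int32_t)b_col[k];
            sum_a += a_row[k];
            sum_b += b_col[k];
        }
        /* sum((a - zp0) * (b - zp1)) expanded; K * zp0 * zp1 plays the role
         * of mulKIn0In1Zp in the initializer. */
        real = (float)acc
             - (float)src1ZP * (float)sum_a
             - (float)src0ZP * (float)sum_b
             + (float)K * (float)src0ZP * (float)src1ZP;
        return real * inOutScale + dstZP; /* inOutScale = s0 * s1 / s_out */
    }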
+#undef _PACK_SELECT_KEY + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + return status; +} /* _matrix_mul_cross_initializer() */ + /* * Query kernel */ @@ -1102,7 +1510,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, int32_t transa, - int32_t transb + int32_t transb, + int32_t cross ) { vsi_status status = VSI_FAILURE; @@ -1110,13 +1519,13 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input1_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_MATRIX_MUL_KEY( input0_dtype, input1_dtype, output_dtype, transa, transb ); + key = HASH_MATRIX_MUL_KEY( input0_dtype, input1_dtype, output_dtype, transa, transb, cross); for( i = 0; i < _cnt_of_array(matrix_mul_map); i ++ ) { @@ -1128,9 +1537,18 @@ static vsi_status _query_kernel if ( i < _cnt_of_array(matrix_mul_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", matrix_mul_map[i].function_name ); - kernel->info.parameters = _matrix_mul_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _matrix_mul_kernel_param_def ); - kernel->info.initialize = _matrix_mul_initializer; + if (cross == 1) + { + kernel->info.parameters = _matrix_mul_kernel_cross_param_def; + kernel->info.numParams = _cnt_of_array( _matrix_mul_kernel_cross_param_def ); + kernel->info.initialize = _matrix_mul_cross_initializer; + } + else + { + kernel->info.parameters = _matrix_mul_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _matrix_mul_kernel_param_def ); + kernel->info.initialize = _matrix_mul_initializer; + } vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", @@ -1155,18 +1573,28 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t tmp_params[_MATRIX_MUL_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t tmp_params[_MATRIX_MUL_CROSS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); int32_t adjointA = vsi_nn_kernel_param_get_int32( params, "adjointA" ); int32_t adjointB = vsi_nn_kernel_param_get_int32( params, "adjointB" ); + uint32_t cross_flg = vsi_nn_kernel_param_get_int32( params, "cross_flg" ); + size_t tmp_size = 0; + uint32_t* size_axis_in_out = NULL; + uint32_t* stride_axis_in_out = NULL; vsi_size_t M = inputs[0]->attr.size[1]; vsi_size_t K = inputs[0]->attr.size[0]; vsi_size_t N = inputs[1]->attr.size[0]; vsi_size_t depthA = 1, depthB = 1; + size_axis_in_out = (uint32_t *)vsi_nn_kernel_param_get_buffer( params, "size_axis_inner_outer", &tmp_size); + stride_axis_in_out = (uint32_t *)vsi_nn_kernel_param_get_buffer( params, "stride_axis_inner_outer", &tmp_size); + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ((inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && inputs[1]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && 
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) @@ -1209,13 +1637,14 @@ static vsi_nn_kernel_node_t _setup rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); } - status = _query_kernel( inputs, outputs, kernel, transposeA, transposeB ); + status = _query_kernel( inputs, outputs, kernel, transposeA, transposeB, cross_flg ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { uint32_t index = 3; + size_t param_num = cross_flg == 1 ? _MATRIX_MUL_CROSS_PARAM_NUM : _MATRIX_MUL_PARAM_NUM; /* Pass parameters to node. */ if (rs_input) { @@ -1225,7 +1654,7 @@ static vsi_nn_kernel_node_t _setup } else { - vsi_nn_kernel_node_pack_io( tmp_params, _MATRIX_MUL_PARAM_NUM, + vsi_nn_kernel_node_pack_io( tmp_params, param_num, inputs, 2, outputs, 1 ); } tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeA ); @@ -1235,7 +1664,22 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &M ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &K ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &N ); - status = vsi_nn_kernel_node_pass_param( node, tmp_params, _MATRIX_MUL_PARAM_NUM ); + if (cross_flg == 1) + { + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &size_axis_in_out[0] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &size_axis_in_out[1] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &size_axis_in_out[2] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[0] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[1] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[2] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[3] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[4] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[5] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[6] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[7] ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_axis_in_out[8] ); + } + status = vsi_nn_kernel_node_pass_param( node, tmp_params, param_num ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[3] ); vsi_nn_kernel_scalar_release( &tmp_params[4] ); @@ -1244,6 +1688,21 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[7] ); vsi_nn_kernel_scalar_release( &tmp_params[8] ); vsi_nn_kernel_scalar_release( &tmp_params[9] ); + if (cross_flg == 1) + { + vsi_nn_kernel_scalar_release( &tmp_params[10] ); + vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); + vsi_nn_kernel_scalar_release( &tmp_params[15] ); + vsi_nn_kernel_scalar_release( &tmp_params[16] ); + vsi_nn_kernel_scalar_release( &tmp_params[17] ); + vsi_nn_kernel_scalar_release( &tmp_params[18] ); + vsi_nn_kernel_scalar_release( &tmp_params[19] ); + vsi_nn_kernel_scalar_release( &tmp_params[20] ); + vsi_nn_kernel_scalar_release( &tmp_params[21] ); + } { // Set default border mode. 
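Editor's note: the block that begins here (truncated in this hunk) sets the node's border attribute; as with the other evis kernels in this patch (see the moments kernel further below), this is the standard vxSetNodeAttribute(node, VX_NODE_BORDER, &border, sizeof(border)) call, presumably with VX_BORDER_CONSTANT so that out-of-range reads from the vectorized tiles return a fixed value.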
vx_border_t border; diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c index 460ad87f7..d862eb752 100644 --- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -153,6 +153,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) vsi_size_array_t * out_shape = NULL; uint32_t pack_key; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -404,7 +406,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -453,6 +455,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c index 11478f544..cb9fc3563 100644 --- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ -153,6 +153,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) vsi_size_array_t * out_shape = NULL; uint32_t pack_key; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -404,7 +406,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -453,6 +455,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/mod_evis.c b/src/tim/vx/internal/src/kernel/evis/mod_evis.c index fe7edd7cc..70188f6e7 100644 --- a/src/tim/vx/internal/src/kernel/evis/mod_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/mod_evis.c @@ -119,7 +119,7 @@ DEF_KERNEL_INITIALIZER(_mod_initializer) {0, 0, 0}, {0, 0, 0} }; - vx_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vx_tensor input0 = (vx_tensor)param[0]; vx_tensor input1 = (vx_tensor)param[1]; vx_tensor output = (vx_tensor)param[2]; @@ -138,6 +138,8 @@ DEF_KERNEL_INITIALIZER(_mod_initializer) float in1Tail = 0; float outZp = 0; + VSI_UNREFERENCED(param_size); + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0 ); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c 
b/src/tim/vx/internal/src/kernel/evis/moments_evis.c index d79142617..9dc6eae47 100644 --- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -162,7 +162,7 @@ static vx_param_description_t _moments_kernel_param_def[] = }; #define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def ) -static int32_t set_constant_border +static int32_t _set_constant_border ( vsi_nn_kernel_node_t node, int32_t value @@ -172,9 +172,6 @@ static int32_t set_constant_border vx_border_t border; border.mode = VX_BORDER_CONSTANT; border.constant_value.S32 = value; - border.constant_value.U32 = (vx_uint32)value; - border.constant_value.S16 = (vx_int16)value; - border.constant_value.U8 = (vx_uint8)value; status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); return status; } @@ -226,6 +223,8 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -797,7 +796,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -866,6 +867,9 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_bool is_continue_axis = TRUE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + axis_num = (int32_t)axis_num_temp; for ( i = 1; i < axis_num; i++) @@ -901,7 +905,7 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[2] = vsi_nn_reshape_tensor( graph, outputs[1], shapes[1], rank_out ); - if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size, reshape_tensors[1]->attr.dim_num ) ) { return NULL; @@ -911,10 +915,10 @@ static vsi_nn_kernel_node_t _setup axis_first = new_axis[0]; status = _query_kernel( inputs, outputs, kernel, params, new_axis, axis_size, image_2d ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 3; /* Pass parameters to node. 
*/ @@ -926,17 +930,14 @@ static vsi_nn_kernel_node_t _setup CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &node_params[3] ); vsi_nn_kernel_scalar_release( &node_params[4] ); - status = set_constant_border(node, vsi_nn_get_tensor_zero_point(inputs[0])); + status = _set_constant_border(node, 0); CHECK_STATUS(status); } } - for(i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { - if(reshape_tensors[i]) - { - vsi_nn_ReleaseTensor(&reshape_tensors[i]); - } + vsi_safe_release_tensor(reshape_tensors[i]); } return node; diff --git a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c new file mode 100644 index 000000000..28ff2d1ae --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c @@ -0,0 +1,614 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + INTERNAL_KERNEL_NEAREST_GRID_SAMPLE, +} _internal_kernel_e; + +#define STR(a) #a + +#define _NEAREST_GRID_SAMPLE_KERNEL_SOURCE(_input_type, _output_type) \ + "nearest_grid_sample_" #_input_type "_to_" #_output_type + +// Add kernel hashtable here +#define NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE)) +#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { \ + NEAREST_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ + CVIVANTE_NAMESPACE("evis.nearest_grid_sample_" STR( \ + IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _NEAREST_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ + } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _nearest_grid_sample_kernel_map[] = +{ + PACK_KERNEL_MAP(F16, F32, F16), + PACK_KERNEL_MAP(F16, U8, F16), + PACK_KERNEL_MAP(F16, F16, F16), + PACK_KERNEL_MAP(F16, F32, U8), + PACK_KERNEL_MAP(F16, F16, U8), + PACK_KERNEL_MAP(F16, U8, U8), + PACK_KERNEL_MAP(U8, U8, U8), + PACK_KERNEL_MAP(U8, F16, U8), + PACK_KERNEL_MAP(U8, F32, U8), + PACK_KERNEL_MAP(I16, I16, I16), + PACK_KERNEL_MAP(I8, I8, I8), + PACK_KERNEL_MAP(BF16, BF16, BF16), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _nearest_grid_sample_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _NEAREST_GRID_SAMPLE_PARAM_NUM _cnt_of_array( _nearest_grid_sample_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (3) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_nearest_grid_sample_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define MAX_POST_SHIFT_BITS (31) +#define MAX_MULTIPLIER_NUM (65535) + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}; + vsi_nn_kernel_tensor_attr_t* output_attr = NULL; + vsi_nn_kernel_tensor_attr_t* input_attr[2] = {NULL}; + vsi_size_array_t* out_shape = NULL; + vsi_size_array_t* in0_shape = NULL; + vsi_nn_kernel_dtype_e input0_dtype = F16; + vsi_nn_kernel_dtype_e input1_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + + uint32_t depth = 0; + float half_input0_wh[2]; + float add_float_value[2]; + uint32_t in0_width; + uint32_t in0_height; + uint32_t out_width; + uint32_t out_height; + int32_t align_corners; + + float input0_scale = 1.0; + int32_t input0ZP = 0; + float input1_scale = 1.0; + int32_t input1ZP = 0; + float output_scale = 1.0; + int32_t outputZP = 0; + + VSI_UNREFERENCED(param_size); + + input_attr[0] = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]); + CHECK_PTR_FAIL_GOTO( + input_attr[0], "Create tensor attr buffer fail.", final); + + input_attr[1] = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]); + CHECK_PTR_FAIL_GOTO( + input_attr[1], "Create tensor attr buffer fail.", final); + + output_attr = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); + CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final); + + status = vsi_nn_kernel_scalar_read_int32( + (vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); + CHECK_STATUS_FAIL_GOTO(status, final); + + out_shape = output_attr->shape; 
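[Editor's note] The constants the initializer prepares just below (half_input0_wh / add_float_value) encode the usual normalized-grid-to-pixel mapping of grid_sample. The stand-alone C sketch that follows is an illustration only, not driver code: it shows the equivalent scalar computation for one coordinate, and the final floor() is an assumption about how the EVIS shader rounds to the nearest source index.

#include <math.h>

/* Illustration only: maps one normalized grid coordinate in [-1, 1] to a
 * nearest-neighbor source index, mirroring the half_input0_wh and
 * add_float_value constants set up in the initializer below. */
static int nearest_grid_sample_index(float grid_coord, int in_size, int align_corners)
{
    float half_wh, add_val;

    if (align_corners)
    {
        half_wh = ((float)in_size - 1.0f) * 0.5f;
        add_val = half_wh + 0.5f;   /* +0.5 turns floor() into round-to-nearest */
    }
    else
    {
        half_wh = (float)in_size * 0.5f;
        add_val = half_wh;          /* half-pixel convention: offset already folded in */
    }

    return (int)floorf(grid_coord * half_wh + add_val);
}
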
+ in0_shape = input_attr[0]->shape; + input0_dtype = input_attr[0]->dtype; + input1_dtype = input_attr[1]->dtype; + output_dtype = output_attr->dtype; + + input0_scale = input_attr[0]->scale; + input0ZP = input_attr[0]->zero_point; + input1_scale = input_attr[1]->scale; + input1ZP = input_attr[1]->zero_point; + output_scale = output_attr->scale; + outputZP = output_attr->zero_point; + + + in0_width = (uint32_t)(in0_shape->data[0]); + in0_height = (uint32_t)(in0_shape->data[1]); + depth = (uint32_t)(in0_shape->data[2]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); + + if (align_corners) { + half_input0_wh[0] = ((float)in0_width - 1.0f) * 0.5f; + half_input0_wh[1] = ((float)in0_height - 1.0f) * 0.5f; + add_float_value[0] = half_input0_wh[0] + 0.5f; + add_float_value[1] = half_input0_wh[1] + 0.5f; + } else { + half_input0_wh[0] = (float)in0_width * 0.5f; + half_input0_wh[1] = (float)in0_height * 0.5f; + add_float_value[0] = half_input0_wh[0]; + add_float_value[1] = half_input0_wh[1]; + } + + status = vsi_nn_kernel_gpu_add_param(node, "half_input0_wh", half_input0_wh); + status |= vsi_nn_kernel_gpu_add_param(node, "add_float_value", add_float_value); + status |= vsi_nn_kernel_gpu_add_param(node, "depth", &depth); + + { + gpu_dp_inst_t uniFp16toFp32_part0_4x4 = { + { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, + GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32_part1_4x4 = { + { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, + GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_part0_4x4 = { + { + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_part1_4x4 = { + { + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtact8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + if (F16 == input0_dtype && + (F16 == input1_dtype || F32 == input1_dtype || + U8 == input1_dtype) && + F16 == output_dtype) { + if (F16 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + } else if (U8 == input1_dtype) { + 
status |= + vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP); + status |= vsi_nn_kernel_gpu_add_param( + node, "input1Scale", &input1_scale); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part0_4x4", + &uniU8SubZPtoFp32_part0_4x4); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part1_4x4", + &uniU8SubZPtoFp32_part1_4x4); + } + } else if (F16 == input0_dtype && + (F16 == input1_dtype || F32 == input1_dtype || + U8 == input1_dtype) && + U8 == output_dtype) { + float uint8Scale = 1.0f / output_scale; + float uint8ZP_out = (float)outputZP; + status |= vsi_nn_kernel_gpu_add_param(node, "uint8Scale", &uint8Scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &uint8ZP_out); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + if (U8 == input1_dtype) { + status |= + vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP); + status |= vsi_nn_kernel_gpu_add_param( + node, "input1Scale", &input1_scale); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part0_4x4", + &uniU8SubZPtoFp32_part0_4x4); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part1_4x4", + &uniU8SubZPtoFp32_part1_4x4); + } else if (F16 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + } + } + else if (U8 == input0_dtype && + (F16 == input1_dtype || F32 == input1_dtype || + U8 == input1_dtype) && + U8 == output_dtype) { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP[2] = {0}; + gpu_dp_inst_t uniMultiplyAndPostShift_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit( + (double)input0_scale / (double)output_scale, &M0, &postShift); + + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = + (uint32_t)((outputZP << postShift) - input0ZP * M0); + + uniMultiplyAndPostShift_2x8.data[7] |= (postShift & 0x1F); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP); + status |= vsi_nn_kernel_gpu_add_param( node, "uniMultiplyAndPostShift_2x8", + &uniMultiplyAndPostShift_2x8); + if (U8 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP); + status |= vsi_nn_kernel_gpu_add_param(node, "input1Scale", &input1_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part0_4x4", + &uniU8SubZPtoFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part1_4x4", + &uniU8SubZPtoFp32_part1_4x4); + } + else if (F16 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + } + } + else if (BF16 == input0_dtype && BF16 == input1_dtype && + BF16 == output_dtype) { + gpu_dp_inst_t uniBF16toFp32_part0_2x8 = { + { + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 
+ 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBF16toFp32_part1_2x8 = { + { + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status |= vsi_nn_kernel_gpu_add_param( + node, "uniBF16toFp32_part0_2x8", &uniBF16toFp32_part0_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniBF16toFp32_part1_2x8", &uniBF16toFp32_part1_2x8); + } + else if (((I16 == input0_dtype && I16 == input1_dtype && + I16 == output_dtype)) || + ((I8 == input0_dtype && I8 == input1_dtype && + I8 == output_dtype))) { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t i = 0; + gpu_dp_inst_t uniDFPtoFp32_part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDFPtoFp32_part1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertI8toI8_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_quantize_multiplier_16bit( + (double)input0_scale / (double)output_scale, &M0, &postShift); + uniConvertI8toI8_2x8.data[7] |= (postShift & 0x1F); + for (i = 0; i < 8; i++) { + uniConvertI8toI8_2x8.data[i + 8] = M0; + } + + status |= vsi_nn_kernel_gpu_add_param(node, "input1_scale", &input1_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDFPtoFp32_part0_4x4", &uniDFPtoFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDFPtoFp32_part1_4x4", &uniDFPtoFp32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertI8toI8_2x8", &uniConvertI8toI8_2x8); + } + else { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + } + } + CHECK_STATUS_FAIL_GOTO(status, final); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 2; + gpu_param.global_size[0] = + (out_width + gpu_param.global_scale[0] - 1) / + gpu_param.global_scale[0]; + gpu_param.global_size[1] = ((out_height + gpu_param.global_scale[1] - 1) / + gpu_param.global_scale[1]); + + status = vsi_nn_kernel_gpu_config(node, &gpu_param); + +#undef MAX_MULTIPLIER_NUM +#undef MAX_POST_SHIFT_BITS + + final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) \ + if (_PTR) { \ + vsi_nn_kernel_tensor_attr_release(&_PTR); \ + _PTR = NULL; \ + } + SAFE_FREE_TENSOR_ATTR(output_attr); + SAFE_FREE_TENSOR_ATTR(input_attr[0]); + SAFE_FREE_TENSOR_ATTR(input_attr[1]); + + return status; +} /* _nearest_grid_sample_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status 
_query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype, in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _nearest_grid_sample_kernel_map; + size_t kernel_map_size = _cnt_of_array( _nearest_grid_sample_kernel_map ); + vx_param_description_t * param_def = _nearest_grid_sample_kernel_param_def; + vx_kernel_initialize_f initializer = _nearest_grid_sample_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type); + out_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type); + + key = NEAREST_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _nearest_grid_sample_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_NEAREST_GRID_SAMPLE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + uint32_t final_in1_rank = 0; + vsi_nn_tensor_t* rs_tensors = NULL; + vsi_nn_tensor_t* final_tensors[3] = {NULL}; + vsi_nn_kernel_dtype_e in0_dtype; + uint32_t pad_val = 0; + int32_t align_corners = + vsi_nn_kernel_param_get_int32(params, "align_corners"); + + // Check if gpu can support the size + if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, + inputs[0]->attr.dim_num)) { + return NULL; + } + + if (!vsi_nn_kernel_gpu_check_shape(inputs[1]->attr.size, + inputs[1]->attr.dim_num)) { + return NULL; + } + + final_tensors[0] = inputs[0]; + + if (inputs[1]->attr.dim_num >= 3) { + final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0]; + final_shape[1] = inputs[1]->attr.size[2]; + final_shape[2] = 1; + final_shape[3] = + inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1; + final_in1_rank = + inputs[1]->attr.dim_num == 3 ? 
2 : inputs[1]->attr.dim_num; + if (!vsi_nn_kernel_gpu_check_shape(final_shape, final_in1_rank)) { + return NULL; + } + + rs_tensors = vsi_nn_reshape_tensor( + graph, inputs[1], final_shape, final_in1_rank); + final_tensors[1] = rs_tensors; + } else { + final_tensors[1] = inputs[1]; + } + final_tensors[2] = outputs[0]; + + in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + if (U8 == in0_dtype) { + pad_val = inputs[0]->attr.dtype.zero_point; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _NEAREST_GRID_SAMPLE_PARAM_NUM, + final_tensors, input_num, &final_tensors[2], output_num ); + node_params[SCALAR_ALIGN_CORNERS] = + vsi_nn_kernel_scalar_create(graph, I32, &align_corners); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _NEAREST_GRID_SAMPLE_PARAM_NUM ); + VSI_ASSERT(status == VSI_SUCCESS); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_ALIGN_CORNERS]); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + status = vxSetNodeAttribute( + (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); + CHECK_STATUS(status); + } + } + } + vsi_safe_release_tensor(rs_tensors); + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( nearest_grid_sample, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c index 5dc05023c..de2d35add 100644 --- a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c @@ -148,6 +148,8 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) int32_t srcFixPointPos = 0; vsi_nn_kernel_dtype_e input_dtype = F16; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -422,6 +424,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* rs_tensors[2] = { NULL }; vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; int32_t i = 0; + size_t j = 0; vsi_bool image_2d = FALSE; vsi_size_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); vsi_size_t prefix_dim_size = 1; @@ -505,11 +508,11 @@ static vsi_nn_kernel_node_t _setup vsi_nn_ReleaseTensor( &rs_tensors[1] ); } - for (i = SCALAR_INPUT_SUFFIX_SIZE; i < _ONE_HOT_PARAM_NUM; i++) + for (j = SCALAR_INPUT_SUFFIX_SIZE; j < _ONE_HOT_PARAM_NUM; j++) { - if (node_params[i]) + if (node_params[j]) { - vsi_nn_kernel_scalar_release( &node_params[i] ); + vsi_nn_kernel_scalar_release( &node_params[j] ); } } diff --git a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c index a625d97f8..e45704fe6 100644 --- a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c @@ -146,6 +146,8 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) int32_t output_ZP = 0; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c index b4d4f218c..679526e6a 100644 --- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c @@ -149,6 +149,8 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -377,7 +379,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -423,6 +425,10 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c index 498ee4528..52588a4d4 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c @@ -84,6 +84,8 @@ static vx_param_description_t vxPreProcessBgraKernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_BGRA_PARAM_NUM _cnt_of_array(vxPreProcessBgraKernel_param_def) @@ -115,6 +117,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -391,7 +395,7 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -449,6 +453,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -469,7 +476,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float bgra_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = 
vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. */ @@ -496,9 +505,11 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &bgra_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_BGRA_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[2] ); @@ -511,6 +522,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[9] ); vsi_nn_kernel_scalar_release( &tmp_params[10] ); vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); } } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c index 797c925b2..1973eb2a3 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c @@ -124,6 +124,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -224,6 +226,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -388,6 +392,8 @@ DEF_KERNEL_INITIALIZER(_resize_gray_initializer) vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -516,7 +522,7 @@ static vsi_status _query_kernel vsi_nn_gray_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int32_t i = 0; + size_t i = 0; vsi_bool is_4_over_3 = FALSE; vsi_bool is_half_scale = FALSE; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); @@ -605,6 +611,9 @@ static vsi_nn_kernel_node_t _setup float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); vsi_bool is_no_range_change = FALSE; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c 
b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c index fe39a5cfb..a0d76f4ba 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -112,6 +112,8 @@ static vx_param_description_t vxPreProcessNv12Kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_NV12_PARAM_NUM _cnt_of_array(vxPreProcessNv12Kernel_param_def) @@ -136,13 +138,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; - float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; - float outputScaleVar = 0.0f; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -152,10 +157,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &var); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; output_scale = 1.0f / attr[0]->scale; @@ -169,10 +178,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) order1 = 0; } - outputScaleVar = output_scale * var; - bMeanScaleVarZp = output_zp - bMean * outputScaleVar; - gMeanScaleVarZp = output_zp - gMean * outputScaleVar; - rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -255,7 +266,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", 
&outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); @@ -317,14 +330,17 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) uint32_t yrIntFloat_16 = 0; int32_t xRatio = 0; int32_t yRatio = 0; - float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; - float outputScaleVar = 0.0f; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; float resize = 0.0f; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -341,10 +357,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &var); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[1]->shape; output_scale = 1.0f / attr[1]->scale; @@ -364,10 +384,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1); yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1); - outputScaleVar = output_scale * var; - bMeanScaleVarZp = output_zp - bMean * outputScaleVar; - gMeanScaleVarZp = output_zp - gMean * outputScaleVar; - rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -472,7 +494,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16); status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= 
vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); @@ -537,7 +561,7 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); vsi_size_t dstWidth = outputs[0]->attr.size[0]; float scaleVal = (float)dstWidth / ((scale_x * dstWidth) >> 15); @@ -611,6 +635,9 @@ static vsi_nn_kernel_node_t _setup int32_t trans = 0; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -630,7 +657,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); int32_t nv_type = vsi_nn_kernel_param_get_int32( params, "nv_type" ); @@ -645,10 +674,12 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &nv_type ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[3] ); @@ -662,6 +693,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[11] ); vsi_nn_kernel_scalar_release( &tmp_params[12] ); vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); + vsi_nn_kernel_scalar_release( &tmp_params[15] ); } } vsi_safe_release_tensor(reshape_tensors[0]); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c index ddfc9b5a8..256f7e5ce 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c @@ -143,8 +143,10 @@ static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, 
VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, @@ -162,8 +164,10 @@ static vx_param_description_t _pre_process_rgb888_planar_sep_kernel_param_def[] {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, @@ -195,8 +199,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer) float output_zp = 0; float output_scale = 1; + int32_t reverse = 0; + int32_t rgb_order[4] = {0}; uint32_t width = 0; - uint32_t height = 0; + int32_t height = 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; @@ -210,30 +216,28 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer) attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); } CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 4], &reverse); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &height); CHECK_STATUS_FAIL_GOTO(status, OnError ); - out_shape = attr[0]->shape; - width = (uint32_t)(out_shape->data[0]); - height = (uint32_t)(out_shape->data[1]); - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if (reverse) { - if ( attr[0]->dfp.fl > 0 ) - { - output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } + rgb_order[0] = 2 * height; + rgb_order[1] = height; + rgb_order[2] = 0; } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + else { - output_zp = (float)attr[0]->asymm.zero_point; - output_scale /= attr[0]->asymm.scale; + rgb_order[0] = 0; + rgb_order[1] = height; + rgb_order[2] = 2 * height; } + out_shape = attr[0]->shape; + width = (uint32_t)(out_shape->data[0]); + output_scale /= attr[0]->scale; + output_zp = (float)attr[0]->zero_point; + shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; @@ -322,7 +326,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); - 
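[Editor's note] The rgb_order table computed above and passed to the shader on the next line appears to hold per-plane row offsets: with the output reshaped so the three colour planes are stacked along the row axis (see the shape[1] *= shape[2] reshape in _setup further down in this file's hunk), offsets of 0, height and 2*height select the destination plane, and "reverse" swaps red and blue to produce BGR. That reading is our interpretation of the patch, not something it states; the sketch below is a minimal, self-contained restatement for clarity only.

#include <stdint.h>

/* Illustration only (not driver code): per-plane row offsets into the
 * reshaped output, assuming planes are stacked along the row axis. */
static void fill_rgb_order(int32_t reverse, int32_t plane_height, int32_t rgb_order[3])
{
    if (reverse)
    {
        rgb_order[0] = 2 * plane_height;  /* R plane written last  */
        rgb_order[1] = plane_height;      /* G plane in the middle */
        rgb_order[2] = 0;                 /* B plane written first */
    }
    else
    {
        rgb_order[0] = 0;
        rgb_order[1] = plane_height;
        rgb_order[2] = 2 * plane_height;
    }
}
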
+ status |= vsi_nn_kernel_gpu_add_param(node, "rgb_order", &rgb_order); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); @@ -363,8 +367,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) float output_zp = 0; float output_scale = 1; - uint32_t width = 0; - uint32_t height = 0; + uint32_t width = 0; + int32_t height = 0; + int32_t reverse = 0; + int32_t rgb_order[4] = {0}; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; @@ -378,12 +384,25 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); } CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 4], &reverse); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &height); CHECK_STATUS_FAIL_GOTO(status, OnError ); + if (reverse) + { + rgb_order[0] = 2 * height; + rgb_order[1] = height; + rgb_order[2] = 0; + } + else + { + rgb_order[0] = 0; + rgb_order[1] = height; + rgb_order[2] = 2 * height; + } + out_shape = attr[0]->shape; width = (uint32_t)(out_shape->data[0]); - height = (uint32_t)(out_shape->data[1]); if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { @@ -435,6 +454,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) status = vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevLo_2x8", &uniDataMeanStddevLo_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevHi_2x8", &uniDataMeanStddevHi_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "rgb_order", &rgb_order); status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -464,11 +484,13 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - uint32_t width = 0; - uint32_t height = 0; - vsi_bool is_4_over_3 = 0; + uint32_t width = 0; + int32_t height = 0; + vsi_bool is_4_over_3 = 0; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_size_array_t * out_shape = NULL; + int32_t reverse = 0; + int32_t rgb_order[4] = {0}; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -482,12 +504,28 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer) } CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 4], &reverse); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + if (reverse) + { + rgb_order[0] = 2 * height; + rgb_order[1] = height; + rgb_order[2] = 0; + } + else + { + rgb_order[0] = 0; + rgb_order[1] = height; + rgb_order[2] = 2 * height; + } + out_shape = attr[1]->shape; width = (uint32_t)(out_shape->data[0]); - height = (uint32_t)(out_shape->data[1]); is_4_over_3 = (attr[0]->shape->data[0] * 3 == width * 4) && - (attr[0]->shape->data[1] * 3 == height * 4); + (attr[0]->shape->data[1] * 3 == 
(vsi_size_t)height * 4); if (is_4_over_3) { @@ -570,7 +608,7 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l01_4x4", &uniBilinear_4over3_l01_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l11_4x4", &uniBilinear_4over3_l11_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l21_4x4", &uniBilinear_4over3_l21_4x4); - + status |= vsi_nn_kernel_gpu_add_param(node, "rgb_order", &rgb_order); CHECK_STATUS_FAIL_GOTO(status, OnError ); } @@ -609,7 +647,7 @@ static vsi_status _query_kernel _internal_scale_e scale_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int32_t i = 0; + size_t i = 0; vsi_bool is_4_over_3 = FALSE; vsi_bool is_half_scale = FALSE; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); @@ -679,8 +717,7 @@ static vsi_status _query_kernel { kernel->info.initialize = _pre_process_rgb888_planar_initializer; } - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, - "vsi_nn_kernel_header", + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, pre_process_rgb888_planar_kernel_map[i].source_name ); vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, pre_process_rgb888_planar_kernel_map[i].source_name ); @@ -705,19 +742,31 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t* node_params = NULL; vsi_nn_kernel_node_t node = NULL; - int32_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM; + vsi_nn_tensor_t* reshape_tensor = NULL; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + size_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM; int32_t width = vsi_nn_kernel_param_get_int32( params, "width" ); int32_t height = vsi_nn_kernel_param_get_int32( params, "height" ); + int32_t output_height = (int32_t)outputs[0]->attr.size[1]; float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); vsi_bool is_no_range_change = FALSE; input_num = inputs[1] == NULL ? 1 : input_num; param_count = inputs[1] == NULL ? 
_PRE_PROCESS_RGB888_PLANAR_PARAM_NUM : param_count; - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + memcpy(shape, outputs[0]->attr.size, outputs[0]->attr.dim_num * sizeof(shape[0])); + shape[1] *= shape[2]; + shape[2] = 1; + reshape_tensor = vsi_nn_reshape_tensor( graph, + outputs[0], shape, outputs[0]->attr.dim_num ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -727,7 +776,9 @@ static vsi_nn_kernel_node_t _setup outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 && outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC && (float)outputs[0]->attr.dtype.zero_point == r_mean && r_mean == g_mean && r_mean == b_mean && - vsi_nn_abs(outputs[0]->attr.dtype.scale - scale) < 1e-8 ) + vsi_nn_abs(outputs[0]->attr.dtype.scale - r_scale) < 1e-8 && + vsi_nn_abs(outputs[0]->attr.dtype.scale - g_scale) < 1e-8 && + vsi_nn_abs(outputs[0]->attr.dtype.scale - b_scale) < 1e-8) { is_no_range_change = TRUE; } @@ -736,10 +787,11 @@ static vsi_nn_kernel_node_t _setup if ( VSI_SUCCESS == status) { node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count); + CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final ); node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { - uint32_t index = inputs[1] == NULL ? 4 : 6; + uint32_t index = inputs[1] == NULL ? 2 : 4; uint32_t scalar_index = index; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); @@ -748,7 +800,7 @@ static vsi_nn_kernel_node_t _setup /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, param_count, - inputs, input_num, outputs, output_num ); + inputs, input_num, &reshape_tensor, output_num ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); @@ -757,7 +809,11 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, param_count ); index = scalar_index; @@ -769,9 +825,14 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[index++] ); vsi_nn_kernel_scalar_release( &node_params[index++] ); vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); } } +final: vsi_nn_safe_free(node_params); return node; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c new file mode 100644 index 000000000..ae559dac1 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c @@ -0,0 +1,1002 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define RGB888_SEP_SOURCE_0 "pre_process_rgb888_planar_sep_nhwc_0", +#define RGB888_SEP_SOURCE_1 "pre_process_rgb888_planar_sep_nhwc_1", +#define RGB888_SEP_SOURCE_2 "pre_process_rgb888_planar_sep_nhwc_2", +#define RGB888_SOURCE_0 "pre_process_rgb888_planar_nhwc_0", +#define RGB888_SOURCE_1 "pre_process_rgb888_planar_nhwc_1", +#define RGB888_SOURCE_2 "pre_process_rgb888_planar_nhwc_2", + +#define STR(a) #a + +typedef enum +{ + COPY = 0, + SCALE, + FOUR_OVER_THREE, + HALF +} _internal_scale_e; + +// Add kernel hashtable here +#define PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, SEP, SCALE_FLAG ) \ + (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | ( SEP << 4 ) | (SCALE_FLAG)) + +#define PACK_KERNEL_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, SCALE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SOURCE_0 } + +#define PACK_KERNEL_SEP_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, SCALE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SEP_SOURCE_0 } + +#define PACK_KERNEL_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, COPY ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SOURCE_1 } + +#define PACK_KERNEL_SEP_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, COPY ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SEP_SOURCE_1 } + +#define PACK_KERNEL_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, HALF ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SOURCE_2 } + +#define PACK_KERNEL_SEP_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, HALF ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_nhwc"), \ + RGB888_SEP_SOURCE_2 } +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _pre_process_rgb888_planar_nhwc_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_SCALE_MAP( U8, F16 ), + PACK_KERNEL_SCALE_MAP( U8, I16 ), + PACK_KERNEL_SCALE_MAP( U8, I8 ), + PACK_KERNEL_SCALE_MAP( U8, U8 ), + + PACK_KERNEL_COPY_MAP( U8, F16 ), + PACK_KERNEL_COPY_MAP( U8, I16 ), + PACK_KERNEL_COPY_MAP( U8, I8 ), + PACK_KERNEL_COPY_MAP( U8, U8 ), + + PACK_KERNEL_HALF_MAP( U8, U8 ), + + PACK_KERNEL_SEP_SCALE_MAP( U8, F16 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, I16 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, I8 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, U8 ), + + PACK_KERNEL_SEP_COPY_MAP( U8, F16 ), + PACK_KERNEL_SEP_COPY_MAP( U8, I16 ), + PACK_KERNEL_SEP_COPY_MAP( U8, I8 ), + PACK_KERNEL_SEP_COPY_MAP( U8, U8 ), + + 
PACK_KERNEL_SEP_HALF_MAP( U8, U8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ) + +static vx_param_description_t _pre_process_rgb888_planar_sep_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + int32_t reverse = 0; + uint32_t width = 0; + uint32_t height = 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )) + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + } + else + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + } + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &reverse); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + width = (uint32_t)(out_shape->data[0] / 3); + height = (uint32_t)(out_shape->data[1]); + output_scale /= 
attr[0]->scale; + output_zp = (float)attr[0]->zero_point; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniVecShift10 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000400, 0x00000000, 0x00000400, 0x00000000, + 0x00000400, 0x00000000, 0x00000400, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAddRShift = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002405, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetTempVal = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x05050505, // BSelt + 0x00110000, 0x00330022, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractBytes = {{ + 0x0f0f0f0f, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002414, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertIntergetoF32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveRGB_0_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x01000400, 0x06020105, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveRGB_1_2x8 = {{ + 0x00001111, // TCfg + 0x00001001, // ASelt + 0x03070302, 0x00000000, // ABin + 0x00002222, 
// BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveBGR_0_2x8 = {{ + 0x11111111, // TCfg + 0x01001001, // ASelt + 0x01000400, 0x06020105, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveBGR_1_2x8 = {{ + 0x00001111, // TCfg + 0x00000010, // ASelt + 0x03070302, 0x00000000, // ABin + 0x00002222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BitsDataInterleaveRGB_0_2x8= {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x01080400, 0x06020905, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BitsDataInterleaveRGB_1_2x8 = {{ + 0x00001111, // TCfg + 0x00000000, // ASelt + 0x0b07030a, 0x00000000, // ABin + 0x00002222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BitsDataInterleaveBGR_0_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x09000408, 0x060a0105, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BitsDataInterleaveBGR_1_2x8 = {{ + 0x00001111, // TCfg + 0x00000000, // ASelt + 0x03070b02, 0x00000000, // ABin + 0x00002222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniVecShift10", &uniVecShift10); + status |= vsi_nn_kernel_gpu_add_param(node, "uniAddRShift", &uniAddRShift); + status |= vsi_nn_kernel_gpu_add_param(node, "uniGetTempVal", &uniGetTempVal); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertIntergetoF32_4x4", &uniConvertIntergetoF32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + if (reverse) + { + status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8", + &uni16BitsDataInterleaveBGR_0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8", + &uni16BitsDataInterleaveBGR_1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8", + &uni8BitsDataInterleaveBGR_0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8", + 
&uni8BitsDataInterleaveBGR_1_2x8); + } + else + { + status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8", + &uni16BitsDataInterleaveRGB_0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8", + &uni16BitsDataInterleaveRGB_1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8", + &uni8BitsDataInterleaveRGB_0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8", + &uni8BitsDataInterleaveRGB_1_2x8); + } + + if (attr[0]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_rgb888_planar_initializer() */ + +DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + uint32_t width = 0; + uint32_t height = 0; + int32_t reverse = 0; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )) + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + } + else + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + } + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &reverse); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + width = (uint32_t)(out_shape->data[0] / 3); + height = (uint32_t)(out_shape->data[1]); + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if ( attr[0]->dfp.fl > 0 ) + { + output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); + } + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + output_zp = (float)attr[0]->asymm.zero_point; + output_scale /= attr[0]->asymm.scale; + } + + if (attr[0]->dtype == F16 || attr[0]->dtype == I16) + { + shaderParam.global_scale[0] = 4; + } + else + { + shaderParam.global_scale[0] = 8; + } + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = height; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniDataMeanStddevLo_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0x99999999, // BSelt + 0x06060606, 0x06060606, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000, + 
0x3c000000, 0x3c000000, 0x3c000000, 0x3c000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveRGB_0_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x01000400, 0x06020105, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveRGB_1_2x8 = {{ + 0x00001111, // TCfg + 0x00001001, // ASelt + 0x03070302, 0x00000000, // ABin + 0x00002222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveBGR_0_2x8 = {{ + 0x11111111, // TCfg + 0x01001001, // ASelt + 0x01000400, 0x06020105, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni16BitsDataInterleaveBGR_1_2x8 = {{ + 0x00001111, // TCfg + 0x00000010, // ASelt + 0x03070302, 0x00000000, // ABin + 0x00002222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveRGB_0_2x8 = {{ + 0x11111111, // TCfg + 0x00100100, // ASelt + 0x01000800, 0x0a020109, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveRGB_1_2x8 = {{ + 0x11111111, // TCfg + 0x01001001, // ASelt + 0x030b0302, 0x05040c04, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveRGB_2_2x8 = {{ + 0x11111111, // TCfg + 0x10010010, // ASelt + 0x0e06050d, 0x070f0706, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveBGR_0_2x8 = {{ + 0x11111111, // TCfg + 0x01001001, // ASelt + 0x01000800, 0x0a020109, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveBGR_1_2x8 = {{ + 0x11111111, // TCfg + 0x10010010, // ASelt + 0x030b0302, 0x05040c04, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uni8BDataInterleaveBGR_2_2x8 = {{ + 0x11111111, // TCfg 
+            0x00100100, // ASelt
+            0x0e06050d, 0x070f0706, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+
+        status = vsi_nn_kernel_gpu_add_param(node, "uniDataMeanStddevLo_2x8", &uniDataMeanStddevLo_2x8);
+        if (reverse)
+        {
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8",
+                    &uni16BitsDataInterleaveBGR_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8",
+                    &uni16BitsDataInterleaveBGR_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+                    &uni8BDataInterleaveBGR_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+                    &uni8BDataInterleaveBGR_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+                    &uni8BDataInterleaveBGR_2_2x8);
+        }
+        else
+        {
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_0_2x8",
+                    &uni16BitsDataInterleaveRGB_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni16BitsDataInterleave_1_2x8",
+                    &uni16BitsDataInterleaveRGB_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+                    &uni8BDataInterleaveRGB_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+                    &uni8BDataInterleaveRGB_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+                    &uni8BDataInterleaveRGB_2_2x8);
+        }
+        status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale);
+        status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
+        CHECK_STATUS_FAIL_GOTO(status, OnError );
+    }
+
+OnError:
+    if (attr[0])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[0] );
+        attr[0] = NULL;
+    }
+    return status;
+} /* _pre_process_rgb888_planar_copy_initializer() */
+
+DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer)
+    (
+    vsi_nn_kernel_node_t node,
+    const vsi_nn_kernel_node_param_t * param,
+    size_t param_size
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    gpu_param_t shaderParam = {
+        2,         // workdim
+        {0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
+        {0, 0, 0}, // globalWorkScale: how many pixels can be processed by a single thread
+        {0, 0, 0}, // localWorkSize: local group size in threads
+        {0, 0, 0}};// globalWorkSize: image size in threads
+
+    vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
+    int32_t reverse = 0;
+
+    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
+    if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
+    {
+        attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
+    }
+    else
+    {
+        attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
+    }
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
+
+    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[param_size - 3], &reverse);
+    CHECK_STATUS_FAIL_GOTO(status, OnError );
+
+    {
+        shaderParam.global_scale[0] = 16;
+        shaderParam.global_scale[1] = 2;
+        shaderParam.global_size[0] = gpu_align_p2((attr[0]->shape->data[0] + shaderParam.global_scale[0] - 1)
+                / shaderParam.global_scale[0], 4);
+        shaderParam.global_size[1] = (attr[0]->shape->data[1] + shaderParam.global_scale[1] - 1)
+                / shaderParam.global_scale[1];
+    }
+
+    status = vsi_nn_kernel_gpu_config( node, &shaderParam );
+    CHECK_STATUS_FAIL_GOTO(status, OnError);
+
+    {
+        gpu_dp_inst_t uni8BDataInterleaveRGB_0_2x8 = {{
+            0x11111111, // TCfg
+            0x00100100, // ASelt
+            0x01000800, 0x0a020109, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uni8BDataInterleaveRGB_1_2x8 = {{
+            0x11111111, // TCfg
+            0x01001001, // ASelt
+            0x030b0302, 0x05040c04, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uni8BDataInterleaveRGB_2_2x8 = {{
+            0x11111111, // TCfg
+            0x10010010, // ASelt
+            0x0e06050d, 0x070f0706, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uni8BDataInterleaveBGR_0_2x8 = {{
+            0x11111111, // TCfg
+            0x01001001, // ASelt
+            0x01000800, 0x0a020109, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uni8BDataInterleaveBGR_1_2x8 = {{
+            0x11111111, // TCfg
+            0x10010010, // ASelt
+            0x030b0302, 0x05040c04, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+        gpu_dp_inst_t uni8BDataInterleaveBGR_2_2x8 = {{
+            0x11111111, // TCfg
+            0x00100100, // ASelt
+            0x0e06050d, 0x070f0706, // ABin
+            0x22222222, // BSelt
+            0x00000000, 0x00000000, // BBin
+            0x00000400, // AccumType, ConstantType, and PostShift
+            0x00000001, 0x00000001, 0x00000001, 0x00000001,
+            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
+        }, GPU_DP_TYPE_16 };
+
+        if (reverse)
+        {
+            status = vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+                    &uni8BDataInterleaveBGR_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+                    &uni8BDataInterleaveBGR_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+                    &uni8BDataInterleaveBGR_2_2x8);
+        }
+        else
+        {
+            status = vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_0_2x8",
+                    &uni8BDataInterleaveRGB_0_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_1_2x8",
+                    &uni8BDataInterleaveRGB_1_2x8);
+            status |= vsi_nn_kernel_gpu_add_param(node, "uni8BitsDataInterleave_2_2x8",
+                    &uni8BDataInterleaveRGB_2_2x8);
+        }
+        CHECK_STATUS_FAIL_GOTO(status, OnError );
+    }
+
+OnError:
+    if (attr[0])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[0] );
+        attr[0] = NULL;
+    }
+    if (attr[1])
+    {
+        vsi_nn_kernel_tensor_attr_release( &attr[1] );
+        attr[1] = NULL;
+    }
+
+    return status;
+} /* _resize_rgb888_planar_initializer() */
+
+
+/*
+ * Query kernel
+ */
+static vsi_status _query_kernel
+    (
+    vsi_nn_tensor_t * const * const inputs,
+    vsi_nn_tensor_t * const *
const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + vsi_bool is_no_range_change, + int32_t width, + int32_t height + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + _internal_scale_e scale_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + size_t i = 0; + vsi_bool is_half_scale = FALSE; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + vsi_bool is_rgb888_sep = (vsi_bool)(inputs[1] != NULL); + + is_half_scale = (width == (int32_t)outputs[0]->attr.size[0] * 2) && + (height == (int32_t)outputs[0]->attr.size[1] * 2); + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (enable_copy) + { + scale_type = COPY; + } + else + { + if (is_no_range_change && is_half_scale) + { + scale_type = HALF; + } + else + { + scale_type = SCALE; + } + } + + key = PRE_PROCESS_RGB888_PLANAR_NHWC_HASH_KEY( input0_dtype, output_dtype, is_rgb888_sep, scale_type); + + for ( i = 0; i < _cnt_of_array(_pre_process_rgb888_planar_nhwc_kernel_map); i ++ ) + { + if ( _pre_process_rgb888_planar_nhwc_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_pre_process_rgb888_planar_nhwc_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", + _pre_process_rgb888_planar_nhwc_kernel_map[i].function_name ); + + if (is_rgb888_sep) + { + kernel->info.parameters = _pre_process_rgb888_planar_sep_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ); + } + else + { + kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ); + } + + if (enable_copy) + { + kernel->info.initialize = _pre_process_rgb888_planar_copy_initializer; + } + else if (scale_type == HALF) + { + kernel->info.initialize = _resize_rgb888_planar_initializer; + } + else + { + kernel->info.initialize = _pre_process_rgb888_planar_initializer; + } + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + _pre_process_rgb888_planar_nhwc_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _pre_process_rgb888_planar_nhwc_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t* node_params = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensor = NULL; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + size_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM; + int32_t width = vsi_nn_kernel_param_get_int32( params, "width" ); + int32_t height = vsi_nn_kernel_param_get_int32( params, "height" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); + int32_t 
reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + vsi_bool is_no_range_change = FALSE; + + input_num = inputs[1] == NULL ? 1 : input_num; + param_count = inputs[1] == NULL ? _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM : param_count; + + memcpy(shape, outputs[0]->attr.size, outputs[0]->attr.dim_num * sizeof(shape[0])); + shape[0] *= shape[1]; + shape[1] = shape[2]; + shape[2] = 1; + reshape_tensor = vsi_nn_reshape_tensor( graph, + outputs[0], shape, outputs[0]->attr.dim_num ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 && + outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC && + (float)outputs[0]->attr.dtype.zero_point == r_mean && r_mean == g_mean && r_mean == b_mean && + vsi_nn_abs(outputs[0]->attr.dtype.scale - r_scale) < 1e-8 && + vsi_nn_abs(outputs[0]->attr.dtype.scale - g_scale) < 1e-8 && + vsi_nn_abs(outputs[0]->attr.dtype.scale - b_scale) < 1e-8) + { + is_no_range_change = TRUE; + } + + status = _query_kernel( inputs, outputs, kernel, params, is_no_range_change, width, height ); + if ( VSI_SUCCESS == status) + { + node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count); + CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final ); + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = inputs[1] == NULL ? 2 : 4; + uint32_t scalar_index = index; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, param_count, + inputs, input_num, &reshape_tensor, output_num ); + + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); + /* Pass parameters to node. 
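+       The per-channel g_scale and b_scale scalars are appended after reverse, so reverse stays
+       at index param_size - 3 in both the packed (13-param) and separate-plane (15-param)
+       layouts, which is where the kernel initializers above read it.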
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, param_count ); + index = scalar_index; + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + } + } + +final: + vsi_nn_safe_free(node_params); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_rgb888_planar_nhwc, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c index 5fda28142..984293bcb 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -106,6 +106,8 @@ static vx_param_description_t vxPreProcessRgbKernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_RGB_PARAM_NUM _cnt_of_array(vxPreProcessRgbKernel_param_def) @@ -126,19 +128,24 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) float outputZP = 0; float outputScale = 1; - int32_t reorder = 0; - int32_t trans = 0; - int32_t xRatio = 0; - int32_t yRatio = 0; - int32_t order1 = 2; - uint32_t width = 0; - uint32_t height = 0; - int32_t enable_copy= 0; - uint32_t pack_key = 0; + int32_t reorder = 0; + int32_t trans = 0; + int32_t xRatio = 0; + int32_t yRatio = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + int32_t enable_copy = 0; + uint32_t pack_key = 0; + float rgb_mean[4] = {0}; + float rgb_scale[4] = {0}; + float param_data[4] = {0}; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -148,6 +155,18 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &rgb_mean[0]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rgb_mean[1]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &rgb_mean[2]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &rgb_scale[0]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[12], &rgb_scale[1]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = 
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &rgb_scale[2]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; width = (uint32_t)(out_shape->data[0]); @@ -417,6 +436,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) case _PACK_SELECT_KEY( 1, 0, 0): // copy case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder { + int32_t i = 0; + for (i = 0;i < 3; i ++) + { + rgb_scale[i] *= outputScale; + param_data[i] = rgb_mean[i] * rgb_scale[i] - outputZP; + } if (attr[0]->dtype == I8 || attr[0]->dtype == U8) { shaderParam.global_scale[0] = 16; @@ -454,6 +479,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part3_4x4", &uniExtractBtoF32_part3_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1); + status |= vsi_nn_kernel_gpu_add_param(node, "rgb_scale", &rgb_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "param_data", ¶m_data); CHECK_STATUS_FAIL_GOTO(status, OnError); } break; @@ -486,6 +513,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBytes", &uniExtractBytes); status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); + status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); CHECK_STATUS_FAIL_GOTO(status, OnError); } break; @@ -493,10 +522,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) break; } - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale); - status |= vsi_nn_kernel_gpu_add_param(node, "outputZP", &outputZP); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); } @@ -523,7 +548,7 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -580,6 +605,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -600,7 +628,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. 
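+       r_scale reuses the slot of the removed rgb_scale scalar (param[9]), while g_scale and
+       b_scale are appended as param[12] and param[13]; the initializer reads the three scales
+       from exactly those indices, so the existing reverse/trans positions are unchanged.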
*/ @@ -616,9 +646,11 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_RGB_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[2] ); @@ -631,6 +663,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[9] ); vsi_nn_kernel_scalar_release( &tmp_params[10] ); vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); } } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c index 8e5f77949..eb9d16056 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -99,6 +99,8 @@ static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_YUV420_PARAM_NUM _cnt_of_array(vxPreProcessYuv420Kernel_param_def) @@ -128,6 +130,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -496,6 +500,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -833,7 +839,7 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -900,6 +906,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -920,7 +929,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( 
params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. */ @@ -935,9 +946,11 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV420_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[4] ); @@ -950,6 +963,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[11] ); vsi_nn_kernel_scalar_release( &tmp_params[12] ); vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); + vsi_nn_kernel_scalar_release( &tmp_params[15] ); } } if (reshape_tensors[0]) diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c index ca397de23..61d421d27 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c @@ -99,6 +99,8 @@ static vx_param_description_t vxPreProcessyuv422Kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_YUV422_PARAM_NUM _cnt_of_array(vxPreProcessyuv422Kernel_param_def) @@ -126,13 +128,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; - float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; - float outputScaleVar = 0.0f; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -142,10 +147,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var); + status 
= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &r_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; output_scale = 1.0f / attr[0]->scale; @@ -159,10 +168,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) order1 = 0; } - outputScaleVar = output_scale * var; - bMeanScaleVarZp = output_zp - bMean * outputScaleVar; - gMeanScaleVarZp = output_zp - gMean * outputScaleVar; - rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -245,7 +256,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYUVtoShortSub_2x8", &uniExtractYUVtoShortSub_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); @@ -308,13 +321,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) uint32_t yrIntFloat_16 = 0; int32_t xRatio = 0; int32_t yRatio = 0; - float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; - float outputScaleVar = 0.0f; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -328,10 +344,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &r_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status 
= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; output_scale = 1.0f / attr[0]->scale; @@ -350,10 +370,12 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1); yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1); - outputScaleVar = output_scale * var; - bMeanScaleVarZp = output_zp - bMean * outputScaleVar; - gMeanScaleVarZp = output_zp - gMean * outputScaleVar; - rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -445,7 +467,9 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toR_4x4", &uniConvertYUV422toR_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16); status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); @@ -503,9 +527,11 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + VSI_UNREFERENCED(scale_x); + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -570,6 +596,9 @@ static vsi_nn_kernel_node_t _setup int32_t trans = 0; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -589,7 +618,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); int32_t yuv422_type = vsi_nn_kernel_param_get_int32( params, "yuv422_type" ); @@ -604,10 +635,12 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = 
vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &yuv422_type ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV422_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[2] ); @@ -621,6 +654,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[10] ); vsi_nn_kernel_scalar_release( &tmp_params[11] ); vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); } } vsi_safe_release_tensor(reshape_tensors[0]); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c index 7c7efc765..4c322a8fc 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ -95,6 +95,8 @@ static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_YUV444_PARAM_NUM _cnt_of_array(vxPreProcessYuv444Kernel_param_def) @@ -123,6 +125,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -488,6 +492,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -845,7 +851,7 @@ static vsi_status _query_kernel vsi_nn_kernel_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -910,6 +916,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -930,7 +939,9 @@ static vsi_nn_kernel_node_t _setup float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); 
float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); /* Pass parameters to node. */ @@ -944,9 +955,11 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV444_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[4] ); @@ -959,6 +972,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[11] ); vsi_nn_kernel_scalar_release( &tmp_params[12] ); vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); + vsi_nn_kernel_scalar_release( &tmp_params[15] ); } } if(reshape_tensors[0]) diff --git a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c index c007a088e..bed0b6c46 100644 --- a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c @@ -142,6 +142,8 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) vx_context ctx = vxGetContext((vx_reference)node); vx_hardware_caps_params_t hw_param; + VSI_UNREFERENCED(param_size); + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); CHECK_STATUS_FAIL_GOTO(status, final); @@ -531,7 +533,7 @@ static vsi_status _query_kernel vsi_nn_shader_type_e sh_type = image_2d ? (input_fl >= output_fl ? 
_2D_OPT : _2D) : _3D; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + size_t i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -583,6 +585,9 @@ static vsi_nn_kernel_node_t _setup vsi_bool ret; int32_t is_per_channel_alpha = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); if (is_per_channel_alpha) diff --git a/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c b/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c index daa40605e..cac4e3b13 100644 --- a/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c @@ -35,7 +35,6 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -151,6 +150,8 @@ DEF_KERNEL_INITIALIZER(_multinomial_initializer) vsi_nn_kernel_tensor_attr_t * attr = NULL; vsi_size_array_t * in_shape = NULL; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -196,6 +197,8 @@ DEF_KERNEL_INITIALIZER(_cdf_initializer) uint32_t class_size = 0; uint32_t batch = 0; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -292,6 +295,8 @@ DEF_KERNEL_INITIALIZER(_seed_initializer) float rand_max = (float)(pow(2.0,32)); float re_rand_max = 1 / rand_max; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -425,20 +430,24 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkey = 0; int32_t i; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + // Check if gpu can support the size - if( !vsi_nn_kernel_gpu_check_shape( + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); // Assign unique_id ikernels[i]->unique_id = kernel->unique_id; } - if( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) { class_max_stride = (int32_t)gpu_align_p2(inputs[0]->attr.size[0], 4); } @@ -453,17 +462,20 @@ static vsi_nn_kernel_node_t _setup attr.is_const = FALSE; attr.vtl = TRUE; tensors[SEED_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(tensors[SEED_INDEX], "Create tensor failed", final); attr.size[0] = class_max_stride * inputs[0]->attr.size[1]; attr.size[1] = inputs[0]->attr.size[1]; attr.dim_num = 2; tensors[CDF_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(tensors[CDF_INDEX], "Create tensor failed", final); memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) ); attr.size[1] = 1; attr.dim_num = 2; tensors[SEEDS_INDEX] = vsi_nn_reshape_tensor( graph, inputs[1], attr.size, attr.dim_num ); + CHECK_PTR_FAIL_GOTO(tensors[SEEDS_INDEX], "Create tensor failed", final); in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); in1_dtype = 
vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -474,17 +486,17 @@ static vsi_nn_kernel_node_t _setup hashkey = MULTINOMIAL_HASH_KEY( F32, F32, out_dtype ); status = _query_kernel( ikernels[SEED_INDEX], hashkeys[SEED_INDEX], INTERNAL_KERNEL_SEED ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } status = _query_kernel( ikernels[CDF_INDEX], hashkeys[CDF_INDEX], INTERNAL_KERNEL_CDF ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_MULTINOMIAL ); - if( VSI_SUCCESS != status ) + if ( VSI_SUCCESS != status ) { goto final; } @@ -518,13 +530,13 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. */ final: - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { - if( ikernels[i] ) + if ( ikernels[i] ) { vsi_nn_kernel_release( &ikernels[i] ); } - if( tensors[i] ) + if ( tensors[i] ) { vsi_nn_ReleaseTensor( &tensors[i] ); } diff --git a/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c index caf40b973..a133a121e 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c @@ -111,6 +111,8 @@ DEF_KERNEL_INITIALIZER(_reduceall_internal_initializer) vsi_size_array_t * output_shape = NULL; int32_t axisSize = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c index df45307c9..11aa099ec 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c @@ -111,6 +111,8 @@ DEF_KERNEL_INITIALIZER(_reduceany_internal_initializer) vsi_size_array_t * output_shape = NULL; int32_t axisSize = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c index e70b58a52..efb52f080 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c @@ -159,6 +159,8 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) float outputScale = 1.0f; float output_offset_asymmetric = 0.0f; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c index b1149fd59..d9bd40d8a 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c @@ -161,6 +161,8 @@ 
DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) float outputScale = 1.0f; float output_offset_asymmetric = 0.0f; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c index 6fd1b7d63..3c710f599 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c @@ -167,6 +167,8 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) float outputScale = 1.0f; float output_offset_asymmetric = 0.0f; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c index d7cb58d43..131111732 100644 --- a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c @@ -141,6 +141,8 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) int32_t srcFixPointPos = 0; int32_t dstFixPointPos = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c index 7fe19bc70..164ab495c 100644 --- a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c @@ -147,6 +147,8 @@ DEF_KERNEL_INITIALIZER(_preprocess_initializer) vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL}; int32_t width = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -212,6 +214,8 @@ DEF_KERNEL_INITIALIZER(_repeat_initializer) int32_t is1d = 0; int32_t axis = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -303,7 +307,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; int32_t is1d = inputs[0]->attr.dim_num == 1 ? 
1 : 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); @@ -453,6 +457,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank[2] = {0, 0}; int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + // Check if gpu can support the size if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -497,7 +504,7 @@ static vsi_nn_kernel_node_t _setup attr.size[1] = 1; attr.dim_num = 2; tensor_preprocess = vsi_nn_CreateTensor( graph, &attr ); - + CHECK_PTR_FAIL_GOTO( tensor_preprocess, "Create tensor fail.", final ); // preprocess tmp_node = vsi_nn_kernel_create_node( graph, kernel_preprocess ); if (tmp_node) diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c index f893feaf2..95c33b80b 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c @@ -35,7 +35,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" #include "utils/vsi_nn_dtype_util_prv.h" __BEGIN_DECLS @@ -855,7 +854,6 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) else if (F16 == output_dtype) { status = vsi_nn_kernel_gpu_add_param( node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertFp2FP32_left_4x4", &uniConvertFp2FP32_left_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertFp2FP32_right_4x4", @@ -1187,7 +1185,7 @@ static vsi_nn_tensor_t* _create_scale_tensor uint32_t dims = output->attr.dim_num; vsi_size_t batch = dims > 3 ? 
output->attr.size[3] : 1; vsi_size_t width = output->attr.size[0]; - vsi_size_t sizes[4] = {width * 2, 1, 1, batch}; + vsi_size_t sizes[4] = { 0, 0, 0, 0 }; vsi_size_t item_count = width * 2 * batch; vsi_size_t input_width = input->attr.size[0]; vsi_size_t x = 0; @@ -1195,6 +1193,10 @@ static vsi_nn_tensor_t* _create_scale_tensor float width_scale = 1.0f; uint16_t *scale_data_ptr = NULL; + sizes[0] = width * 2; + sizes[1] = 1; + sizes[2] = 1; + sizes[3] = batch; if (align_corners && width > 1) { width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(width - 1); @@ -1310,6 +1312,7 @@ static vsi_nn_kernel_node_t _setup if (is_run_opt_kernel) { scale = _create_scale_tensor(graph, inputs[0], outputs[0], align_corners, half_pixel_centers); + CHECK_PTR_FAIL_GOTO( scale, "Create tensor fail.", final ); node_params[SCALAR_TENSOR_SCALE] = (vsi_nn_kernel_node_param_t)(scale->t); node_params_num = _RESIZE_1D_BILINEAR_PARAM_NUM; } @@ -1325,16 +1328,18 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_TYPE] ); } + } + } - if (is_run_opt_kernel) - { - if (scale) - { - vsi_nn_ReleaseTensor(&scale); - } - } +final: + if (is_run_opt_kernel) + { + if (scale) + { + vsi_nn_ReleaseTensor(&scale); } } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c index be1cd0972..fddd1e381 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c @@ -144,6 +144,8 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) float half_pixel_value = 0.0f; float round_value = 0.0f; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index 1e79cbfe3..ebfe9ed38 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -868,6 +868,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer) vsi_bool is_4x_up_kernel = FALSE; vsi_bool is_8x_up_kernel = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -1167,6 +1169,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_align_corners_opt_initializer) uint32_t out_height = 0; vsi_bool is_8x_align_corners = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -1490,7 +1494,7 @@ static vsi_nn_tensor_t* _create_scale_tensor vsi_size_t width = output->attr.size[0]; vsi_size_t height = output->attr.size[1]; vsi_size_t batch = dims > 3 ? 
output->attr.size[3] : 1; - vsi_size_t sizes[4] = {width * 4, height, 1, batch}; + vsi_size_t sizes[4] = { 0, 0, 0, 0 }; vsi_size_t item_count = width * 4 * height * batch; vsi_size_t input_width = input->attr.size[0]; vsi_size_t input_height = input->attr.size[1]; @@ -1501,6 +1505,10 @@ static vsi_nn_tensor_t* _create_scale_tensor float height_scale = 1.0f; uint16_t *scale_data_ptr = NULL; + sizes[0] = width * 4; + sizes[1] = height; + sizes[2] = 1; + sizes[3] = batch; if (align_corners && width > 1) { width_scale = ((float)(input_width - 1) * 1.0f) / (float)(width - 1); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c index b8e634e4e..596d528f7 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c @@ -137,6 +137,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_nhwc_initializer) vsi_bool is_3x_up_kernel = FALSE; vsi_bool is_4x_up_kernel = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -433,6 +435,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_nhwc_bound_initializer) vsi_bool is_3x_up_kernel = FALSE; vsi_bool is_4x_up_kernel = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c index 4d0189327..6bf9ba87c 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c @@ -145,6 +145,8 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) float half_pixel_value = 0.0f; float round_value = 0.0f; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c index 9876ebc71..bba21eabb 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c @@ -188,6 +188,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) int32_t coord_dim = 0; int32_t offsetX = 0, offsetY = 0, offsetZ = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -345,6 +347,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer) int32_t coord_dim = 0; int32_t offsetX = 0, offsetY = 0, offsetZ = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -457,7 +461,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input1_dtype = U8; 
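Two mechanical changes recur throughout the hunks in this patch: initializer and setup functions gain VSI_UNREFERENCED(...) calls for parameters they never read, and kernel-map loop counters move from int to size_t so they compare cleanly against the unsigned element count produced by _cnt_of_array(). The short, self-contained sketch below illustrates that pattern only; the macro bodies, the array, and the function names in it are illustrative assumptions, not the driver's actual definitions.

#include <stddef.h>
#include <stdio.h>

/* Illustrative stand-ins; the real definitions live in the driver headers. */
#define VSI_UNREFERENCED(x)  ((void)(x))
#define _cnt_of_array(arr)   (sizeof(arr) / sizeof((arr)[0]))

static const char *kernel_names[] = { "reset", "update", "ref", "copy" };

/* A callback that ignores param_size: VSI_UNREFERENCED keeps
 * unused-parameter warnings quiet without changing behavior. */
static int example_initializer(void *node, size_t param_size)
{
    VSI_UNREFERENCED(node);
    VSI_UNREFERENCED(param_size);
    return 0;
}

int main(void)
{
    size_t i; /* size_t matches the unsigned result of _cnt_of_array(). */

    for (i = 0; i < _cnt_of_array(kernel_names); i++)
    {
        printf("%s\n", kernel_names[i]);
    }

    return example_initializer(NULL, 0);
}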
vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(coord_dim); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -517,6 +523,9 @@ static vsi_nn_kernel_node_t _setup vsi_size_t width = 0, area = 0; int32_t big_flg = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (coord_dim > 3) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index e9d6d5dd0..43ea15c3f 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -45,64 +45,82 @@ __BEGIN_DECLS #define KERNEL_SOURCE_2 "scatter_nd_update_big" #define KERNEL_SOURCE_3 "scatter_nd_update_atom" #define KERNEL_SOURCE_4 "scatter_nd_update_special" +#define KERNEL_SOURCE_5 "scatter_nd_update_qint" +#define KERNEL_SOURCE_6 "scatter_nd_update_fp" -#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _pre_op, _large_type) \ - ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_pre_op << 4) | (_large_type)) +#define HASH_SCATTER_ND_UPDATE_KEY(_in0_type, _in2_type, _out_type, _stage, _coord_type, _opt_flg) \ + ((_in0_type << 24) | (_in2_type << 16) | (_out_type << 8) | (_stage << 4) | (_coord_type << 2) | (_opt_flg)) -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(SRC0_TYPE, SRC2_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE#SRC2_TYPE"to"#DST_TYPE) +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_reset_"#SRC0_TYPE"to"#DST_TYPE) -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_BIG_NAME(SRC0_TYPE, SRC2_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE#SRC2_TYPE"to"#DST_TYPE"_big") +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(SRC2_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_update_"#SRC2_TYPE) -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PRE_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("evis.scatter_nd_update_"#SRC0_TYPE"_pre") +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_4X_NAME(SRC2_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_update_"#SRC2_TYPE"_4X") - #define HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME() \ - CVIVANTE_NAMESPACE("evis.scatter_nd_update_reset") +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(SRC2_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_ref_"#SRC2_TYPE"to"#DST_TYPE) -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(SRC0_TYPE, DST_TYPE) \ +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_4X_NAME(SRC2_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_ref_"#SRC2_TYPE"to"#DST_TYPE"_4X") + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_copy_"#DST_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_REF_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.scatter_nd_update_ref2out_"#SRC0_TYPE"to"#DST_TYPE) -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(SRC2_TYPE, DST_TYPE) \ +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_UPDATE_NAME(SRC2_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.scatter_nd_update_update2ref_"#SRC2_TYPE"to"#DST_TYPE"_16x") -#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(DST_TYPE) \ +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_COPY_NAME(DST_TYPE) \ 
CVIVANTE_NAMESPACE("evis.scatter_nd_update_cpy2out_"#DST_TYPE"to"#DST_TYPE) -#define TENSOR_SCATTER_ND_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0, 0), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 4, 1, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_REF_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 5, 1, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_UPDATE_NAME(IN2_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 6, 1, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_SPECIAL_COPY_NAME(IN0_TYPE), \ SOURCE }, -#define TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_BIG_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, OUT_TYPE, 0, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, -#define TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(IN0_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, I32, I32, 1, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_PRE_NAME(IN0_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(IN2_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(IN2_TYPE), \ SOURCE }, - #define TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(I32, I32, I32, 2, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME(), \ +#define TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(IN2_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, 0, 1), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_4X_NAME(IN2_TYPE), \ SOURCE }, -#define TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 3, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(IN0_TYPE, OUT_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, OUT_TYPE, 2, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(IN2_TYPE, OUT_TYPE), \ SOURCE }, -#define TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 4, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(IN2_TYPE, OUT_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, OUT_TYPE, 2, 0, 1), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_4X_NAME(IN2_TYPE, OUT_TYPE), \ SOURCE }, -#define TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ - { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 5, 1), \ - HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(IN0_TYPE), \ +#define TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(OUT_TYPE, SOURCE) \ + { 
HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 3, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(OUT_TYPE), \ SOURCE }, typedef struct @@ -112,93 +130,118 @@ typedef struct const char * source_name; } _kernel_map_type; -static const _kernel_map_type scatter_nd_update_map[] = +static const _kernel_map_type scatter_nd_update_reset_map[] = { - TENSOR_SCATTER_ND_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(BF16, I32, BF16, BF16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(U8, I32, U8, F16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(I8, I32, I8, F16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(I16, I32, I16, F16, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_KERNELS(F16, I32, F16, U8, KERNEL_SOURCE_1) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_2) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(F16, I32, F16, U8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(U8, U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I8, I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I16, I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(F16, F16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(BF16, BF16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(U8, F16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I8, F16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(I16, F16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(F16, U8, KERNEL_SOURCE_5) }; -static const _kernel_map_type scatter_nd_update_reset_map[] = +static const _kernel_map_type scatter_nd_update_update_map[] = { - TENSOR_SCATTER_ND_UPDATE_RESET_KERNELS(KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(F16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(BF16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(F16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_UPDATE_4X_KERNELS(BF16, KERNEL_SOURCE_6) }; -static const _kernel_map_type scatter_nd_update_pre_map[] = +static const _kernel_map_type scatter_nd_update_ref_map[] = { - TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(U8, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(I8, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_PRE_KERNELS(I16, KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I32, F16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(F32, F16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(F32, BF16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(I32, F16, 
KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(F32, F16, KERNEL_SOURCE_6) + TENSOR_SCATTER_ND_UPDATE_REF_4X_KERNELS(F32, BF16, KERNEL_SOURCE_6) }; -static const _kernel_map_type scatter_nd_update_post_map[] = +static const _kernel_map_type scatter_nd_update_copy_map[] = { - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(U8, I32, U8, F16, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I8, I32, I8, F16, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I16, I32, I16, F16, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_3) - TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_3) + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(U8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(I8, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(I16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(F16, KERNEL_SOURCE_5) + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(BF16, KERNEL_SOURCE_5) }; -static const _kernel_map_type scatter_nd_update_ref_map[] = +static const _kernel_map_type scatter_nd_update_special_ref_map[] = { - TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) - TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) }; -static const _kernel_map_type scatter_nd_update_update_map[] = +static const _kernel_map_type scatter_nd_update_special_update_map[] = { - TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) - TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) }; -static const _kernel_map_type scatter_nd_update_copy_map[] = +static const _kernel_map_type scatter_nd_update_special_copy_map[] = { - TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) - TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_SPECIAL_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) }; /* * Kernel params */ -static vx_param_description_t _scatter_nd_update_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_reset_kernel_param_def[] = { - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_reset_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_update_kernel_param_def[] = { + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, 
VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_pre_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_ref_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - //{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, @@ -206,22 +249,17 @@ static vx_param_description_t _scatter_nd_update_pre_kernel_param_def[] = // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_post_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_copy_kernel_param_def[] = { - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_ref_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_special_ref_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -229,7 +267,7 @@ static vx_param_description_t _scatter_nd_update_ref_kernel_param_def[] = // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_update_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_special_update_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -243,7 +281,7 @@ static vx_param_description_t _scatter_nd_update_update_kernel_param_def[] = // Add kererl parameters here }; -static vx_param_description_t _scatter_nd_update_copy_kernel_param_def[] = +static vx_param_description_t _scatter_nd_update_special_copy_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -251,13 +289,14 @@ static vx_param_description_t _scatter_nd_update_copy_kernel_param_def[] = // Add kererl parameters here 
}; -#define _SCATTER_ND_UPDATE_PARAM_NUM _cnt_of_array( _scatter_nd_update_kernel_param_def ) -#define _SCATTER_ND_UPDATE_PRE_PARAM_NUM _cnt_of_array( _scatter_nd_update_pre_kernel_param_def ) -#define _SCATTER_ND_UPDATE_POST_PARAM_NUM _cnt_of_array( _scatter_nd_update_post_kernel_param_def ) #define _SCATTER_ND_UPDATE_RESET_PARAM_NUM _cnt_of_array( _scatter_nd_update_reset_kernel_param_def ) -#define _SCATTER_ND_UPDATE_REF_PARAM_NUM _cnt_of_array( _scatter_nd_update_ref_kernel_param_def ) -#define _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM _cnt_of_array( _scatter_nd_update_update_kernel_param_def ) -#define _SCATTER_ND_UPDATE_COPY_PARAM_NUM _cnt_of_array( _scatter_nd_update_copy_kernel_param_def ) +#define _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM _cnt_of_array(_scatter_nd_update_update_kernel_param_def) +#define _SCATTER_ND_UPDATE_REF_PARAM_NUM _cnt_of_array(_scatter_nd_update_ref_kernel_param_def) +#define _SCATTER_ND_UPDATE_COPY_PARAM_NUM _cnt_of_array(_scatter_nd_update_copy_kernel_param_def) + +#define _SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM _cnt_of_array(_scatter_nd_update_special_ref_kernel_param_def) +#define _SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM _cnt_of_array(_scatter_nd_update_special_update_kernel_param_def) +#define _SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM _cnt_of_array(_scatter_nd_update_special_copy_kernel_param_def) static vsi_status get_scatter_nd_update_tensor_reshape_size ( @@ -265,24 +304,17 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, uint32_t coordDim, - vsi_size_t* width, - vsi_size_t* area, - vsi_size_t* vol, + vsi_size_t strides[VSI_NN_MAX_DIM_NUM], int32_t* newDim, int32_t* isBig ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; uint32_t dims_num = inputs[0]->attr.dim_num; vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; vsi_size_t elementCnt = 1; - if (coordDim != 0 && (width == NULL || area == NULL)) - { - return status; - } - #define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; @@ -305,36 +337,30 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size isBig[0] |= 1; } - if (coordDim == 1) // index shape - { - *width = 0; - *area = 0; - } - else if (coordDim == 2) + if (coordDim == 1 && strides) // index shape { - *width = input_size[dims_num - 2]; - *area = 0; - } - else if (coordDim == 3) - { - *width = input_size[dims_num - 3]; - *area = input_size[dims_num - 3] * input_size[dims_num - 2]; - } - else if (coordDim == 4) - { - *width = input_size[dims_num - 4]; - *area = input_size[dims_num - 4] * input_size[dims_num - 3]; - *vol = input_size[dims_num - 4] * input_size[dims_num - 3] * input_size[dims_num - 2]; + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } } - else if (coordDim == 5) + else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides) { - *width = input_size[dims_num - 5]; - *area = input_size[dims_num - 5] * input_size[dims_num - 4]; - *vol = input_size[dims_num - 5] * input_size[dims_num - 4] * input_size[dims_num - 3]; + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } + + strides[0] = input_size[dims_num - coordDim]; + for (i = 1; i < coordDim - 1; i++) + { + strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i]; + } } + #undef VSI_NN_MAX_IMAGE_WIDTH - return VSI_SUCCESS; + return status; } /* _get_EltOP_tensor_reshape_size */ static vsi_status check_scatter_nd_update_index_repeat @@ -458,7 +484,8 @@ static vsi_status check_scatter_nd_update_index_repeat /* * 
Kernel initializer */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_ref_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -474,157 +501,68 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL }; - int32_t block_size = 1; - int32_t height = 1; - int32_t index_num = 1; - int32_t width = 0, area = 0, vol = 0; - int32_t coord_dim = 0; - int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; - int32_t src0ZP = 0; - float src0Scale = 1; - int32_t src2ZP = 0; - float src2Scale = 1; - int32_t dstZP = 0; - float dstScale = 1; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + int32_t block_size = 1; + int32_t width = 0; + int32_t height = 0; + + int32_t input0_zp = 0; + float input0_scale = 1.0f; + int32_t output_zp = 0; + float output_scale = 1.0f; + + uint32_t pack_key = 0; + + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &width); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &area); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &vol); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &coord_dim); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - - block_size = (int32_t)(attr[3]->shape->data[0]); - height = (int32_t)(attr[3]->shape->data[1]); - index_num = (int32_t)(attr[1]->shape->data[1]); - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - src2ZP = attr[2]->asymm.zero_point; - src2Scale = attr[2]->asymm.scale; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - src2Scale = (1.0f / ((float) ((int64_t)1 << attr[2]->dfp.fl))); - } - else - { - src2Scale = ((float) ((int64_t)1 << -attr[2]->dfp.fl)); - } - } - - if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + block_size = (int32_t)(attr[0]->shape->data[0]); + height = (int32_t)(attr[0]->shape->data[1]); + width = (int32_t)(block_size * height); + if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16) { - dstZP = attr[3]->asymm.zero_point; - dstScale = attr[3]->asymm.scale; + width = (width + 7) / 8; } - else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP ) + else if (attr[0]->dtype == U8 || attr[0]->dtype == I8) { - if 
(attr[3]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[3]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl)); - } - dstScale = 1.0f/dstScale; + width = (width + 15) / 16; } - if (coord_dim == 5) - { - offset_idx = 1; - } - if (coord_dim == 4 || coord_dim == 5) - { - offsetX = vol; - offsetY = area; - offsetZ = width; - offsetW = 1; - } - else if (coord_dim == 3) - { - offsetX = area; - offsetY = width; - offsetZ = 1; - offsetW = 0; - } - else if (coord_dim == 2) - { - offsetX = width; - offsetY = 1; - offsetZ = 0; - offsetW = 0; - } - else if (coord_dim == 1) - { - offsetX = 1; - offsetY = 0; - offsetZ = 0; - offsetW = 0; - } + input0_zp = attr[0]->asymm.zero_point; + input0_scale = attr[0]->asymm.scale; + output_zp = attr[1]->asymm.zero_point; + output_scale = 1.0f / attr[1]->asymm.scale; - gpu_param.global_scale[0] = 8; + gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) - / gpu_param.global_scale[0], 4); - gpu_param.global_size[1] = height; + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = 1; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype ); + + switch( pack_key ) { - uint16_t M0 = 0; - uint16_t M1 = 0; - int32_t postShift0 = 0; - int32_t postShift1 = 0; - uint32_t multAndoutZP0[2] = {0}; - uint32_t multAndoutZP1[2] = {0}; - gpu_dp_inst_t uniAccumulateSum_2x8 = {{ - 0x55555555, // TCfg - 0x44444444, // ASelt - 0x33221100, 0x77665544, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( U8, U8 ): + { + uint16_t M0 = 0; + int32_t postShift0 = 0; + uint32_t multAndoutZP0[2] = {0}; + + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ 0xdddddddd, // TCfg 0x44444444, // ASelt 0x13121110, 0x17161514, // ABin @@ -633,80 +571,40 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) 0x00002600, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_1_Lo_2x8 = {{ + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ 0xdddddddd, // TCfg 0x44444444, // ASelt - 0x13121110, 0x17161514, // ABin + 0x1b1a1918, 0x1f1e1d1c, // ABin 0x11111111, // BSelt 0x00000000, 0x00000000, // BBin 0x00002600, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ - 0x11111111, // TCfg - 0x01010101, // ASelt - 0x01050004, 0x03070206, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ - 0x11111111, // TCfg - 0x01010101, // 
ASelt - 0x05050404, 0x07070606, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniExtractOddData_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x07050301, 0x07050301, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16}; + }, GPU_DP_TYPE_16 }; - gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); - gpu_quantize_multiplier_16bit( (double)src2Scale / dstScale, &M1, &postShift1); - multAndoutZP0[0] = (uint32_t)(M0); - multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); - multAndoutZP1[0] = (uint32_t)(M1); - multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src2ZP * M1); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 ); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_1_Lo_2x8, postShift1 ); + gpu_quantize_multiplier_16bit( (double)input0_scale * output_scale, &M0, &postShift0); - status = vsi_nn_kernel_gpu_add_param( node, - "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift_1_Lo_2x8", &uniU8MulAndPostShift_1_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); - status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); - status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); - CHECK_STATUS_FAIL_GOTO(status, OnError); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input0_zp * M0); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift0 ); + + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; } +#undef _PACK_SELECT_KEY + OnError: if (attr[0]) { @@ -718,20 +616,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) vsi_nn_kernel_tensor_attr_release( &attr[1] ); attr[1] = NULL; } - if (attr[2]) - { - vsi_nn_kernel_tensor_attr_release( &attr[2] ); - attr[2] = NULL; - } - if (attr[3]) - { - 
vsi_nn_kernel_tensor_attr_release( &attr[3] ); - attr[3] = NULL; - } return status; -} /* _scatter_nd_update_initializer() */ +} /* _scatter_nd_update_special_ref_initializer() */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) +DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_update_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -747,19 +635,20 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL }; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; int32_t block_size = 1; - int32_t height = 1; + int32_t update_width = 1; int32_t index_num = 1; int32_t width = 0, area = 0, vol = 0; int32_t coord_dim = 0; int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; - int32_t src0ZP = 0; - float src0Scale = 1; - int32_t src2ZP = 0; - float src2Scale = 1; - int32_t dstZP = 0; - float dstScale = 1; + int32_t input1_zp = 0; + float input1_scale = 1.0f; + int32_t output_zp = 0; + float output_scale = 1.0f; + uint32_t pack_key = 0; + + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -767,73 +656,24 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &width); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &width); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &area); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &area); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &vol); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &vol); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &coord_dim); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim); CHECK_STATUS_FAIL_GOTO(status, OnError ); - block_size = (int32_t)(attr[3]->shape->data[0]); - height = (int32_t)(attr[3]->shape->data[1]); - index_num = (int32_t)(attr[1]->shape->data[1]); - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - src2ZP = attr[2]->asymm.zero_point; - src2Scale = attr[2]->asymm.scale; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - src2Scale = (1.0f / ((float) ((int64_t)1 << attr[2]->dfp.fl))); - } - else - { - src2Scale = ((float) ((int64_t)1 << -attr[2]->dfp.fl)); - } - } + block_size = (int32_t)(attr[2]->shape->data[0]); + 
update_width = (int32_t)(attr[1]->shape->data[0]); + index_num = (int32_t)(attr[0]->shape->data[1]); - if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstZP = attr[3]->asymm.zero_point; - dstScale = attr[3]->asymm.scale; - } - else if ( attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[3]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[3]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl)); - } - dstScale = 1.0f / dstScale; - } + input1_zp = attr[1]->asymm.zero_point; + input1_scale = attr[1]->asymm.scale; + output_zp = attr[2]->asymm.zero_point; + output_scale = 1.0f / attr[2]->asymm.scale; if (coord_dim == 5) { @@ -865,35 +705,60 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) offsetZ = 0; } + if (attr[1]->dtype == F16 || attr[1]->dtype == I16 || attr[1]->dtype == U16) + { + update_width = (update_width + 7) / 8; + } + else if (attr[1]->dtype == U8 || attr[1]->dtype == I8) + { + update_width = (update_width + 15) / 16; + } + + if (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == U16) + { + block_size = (block_size + 7) / 8; + } + else if (attr[2]->dtype == U8 || attr[2]->dtype == I8) + { + block_size = (block_size + 15) / 16; + } + gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; gpu_param.global_size[0] = block_size; - gpu_param.global_size[1] = height; + gpu_param.global_size[1] = index_num; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); { - uint16_t M0 = 0; - uint16_t M1 = 0; - int32_t postShift0 = 0; - int32_t postShift1 = 0; - uint32_t multAndoutZP0[2] = {0}; - uint32_t multAndoutZP1[2] = {0}; - gpu_dp_inst_t uniAccumulateSum_2x8 = {{ - 0x55555555, // TCfg - 0x44444444, // ASelt - 0x33221100, 0x77665544, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); + status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[1]->dtype, attr[2]->dtype ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( U8, U8 ): + { + uint16_t M1 = 0; + int32_t postShift1 = 0; + uint32_t multAndoutZP1[2] = {0}; + + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ 0xdddddddd, // TCfg 0x44444444, // ASelt 0x13121110, 0x17161514, // ABin @@ -902,48 +767,38 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) 0x00002600, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_1_Lo_2x8 = {{ + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniU8MulAndPostShift_Hi_2x8 = {{ 0xdddddddd, // TCfg 0x44444444, // ASelt - 0x13121110, 0x17161514, // ABin + 0x1b1a1918, 0x1f1e1d1c, // ABin 0x11111111, // BSelt 0x00000000, 0x00000000, // BBin 0x00002600, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; + }, GPU_DP_TYPE_16 }; - gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); - gpu_quantize_multiplier_16bit( (double)src2Scale / dstScale, &M1, &postShift1); - multAndoutZP0[0] = (uint32_t)(M0); - multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); - multAndoutZP1[0] = (uint32_t)(M1); - multAndoutZP1[1] = (uint32_t)((dstZP << postShift1) - src2ZP * M1); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 ); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_1_Lo_2x8, postShift1 ); + gpu_quantize_multiplier_16bit( (double)input1_scale * output_scale, &M1, &postShift1); - status = vsi_nn_kernel_gpu_add_param( node, - "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); - if (attr[3]->quant != VSI_NN_KERNEL_QUANT_NONE) - { + multAndoutZP1[0] = (uint32_t)(M1); + multAndoutZP1[1] = (uint32_t)((output_zp << postShift1) - input1_zp * M1); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 ); + + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift_1_Lo_2x8", &uniU8MulAndPostShift_1_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); - status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); } - status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num ); - status |= vsi_nn_kernel_gpu_add_param( node, "update_width", &block_size ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); - status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); - CHECK_STATUS_FAIL_GOTO(status, OnError); + break; + default: + break; } +#undef _PACK_SELECT_KEY OnError: if (attr[0]) @@ -961,15 +816,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) vsi_nn_kernel_tensor_attr_release( &attr[2] ); attr[2] = NULL; } - if (attr[3]) - { - vsi_nn_kernel_tensor_attr_release( &attr[3] ); - attr[3] = NULL; - } return status; -} /* _scatter_nd_update_big_initializer() */ +} /* _scatter_nd_update_special_update_initializer() */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_pre_initializer) +DEF_KERNEL_INITIALIZER(_scatter_nd_update_special_copy_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -985,140 +835,50 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_pre_initializer) {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL 
}; int32_t block_size = 1; - int32_t update_width = 1; - int32_t index_num = 1; - int32_t width = 0, area = 0, vol = 0; - int32_t coord_dim = 0; - int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; - int32_t src0ZP = 0; - float src0Scale = 1; + int32_t width = 0; + int32_t height = 0; + + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &width); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &area); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &vol); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - block_size = (int32_t)(attr[2]->shape->data[0]); - update_width = (int32_t)(attr[1]->shape->data[0]); - index_num = (int32_t)(attr[0]->shape->data[1]); - - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - src0ZP = attr[1]->asymm.zero_point; - src0Scale = attr[1]->asymm.scale; - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); - } - } + block_size = (int32_t)(attr[0]->shape->data[0]); + height = (int32_t)(attr[0]->shape->data[1]); + width = (int32_t)(block_size * height); - if (coord_dim == 5) - { - offset_idx = 1; - } - if (coord_dim == 4 || coord_dim == 5) - { - offsetX = vol; - offsetY = area; - offsetZ = width; - offsetW = 1; - } - else if (coord_dim == 3) - { - offsetX = area; - offsetY = width; - offsetZ = 1; - } - else if (coord_dim == 2) + if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16) { - offsetX = width; - offsetY = 1; - offsetZ = 0; + width = (width + 7) / 8; } - else if (coord_dim == 1) + else if (attr[0]->dtype == U8 || attr[0]->dtype == I8) { - offsetX = 1; - offsetY = 0; - offsetZ = 0; + width = (width + 15) / 16; } gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = block_size; - gpu_param.global_size[1] = index_num; + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = 1; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); - { - gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - status = vsi_nn_kernel_gpu_add_param( node, - "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 ); - status |= vsi_nn_kernel_gpu_add_param( node, "update_width", 
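/*
 * Sketch (assumed semantics, helper name hypothetical): the special copy kernel
 * walks ref/output as a flat buffer, one vector per work item, so the global work
 * size is the element count divided by the lane count of the dtype: 8 lanes for
 * 16-bit types (F16/I16/U16), 16 lanes for 8-bit types (U8/I8), rounded up.
 */
static size_t copy_work_items(size_t element_count, size_t bytes_per_element)
{
    size_t lanes = (bytes_per_element == 1) ? 16 : 8;     /* 8-bit vs 16-bit packing   */
    return (element_count + lanes - 1) / lanes;           /* round up to whole vectors */
}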
&update_width ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); - status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &src0ZP ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &src0Scale ); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } - OnError: if (attr[0]) { vsi_nn_kernel_tensor_attr_release( &attr[0] ); attr[0] = NULL; } - if (attr[1]) - { - vsi_nn_kernel_tensor_attr_release( &attr[1] ); - attr[1] = NULL; - } - if (attr[2]) - { - vsi_nn_kernel_tensor_attr_release( &attr[2] ); - attr[2] = NULL; - } return status; -} /* _scatter_nd_update_pre_initializer() */ +} /* _scatter_nd_update_special_copy_initializer() */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer) +DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -1127,132 +887,56 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer) { vsi_status status = VSI_FAILURE; gpu_param_t gpu_param = { - 3, + 1, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - int32_t block_size = 1; - int32_t height = 1; - int32_t width = 0, area = 0, vol = 0; - int32_t coord_dim = 0; - int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; - int32_t src0ZP = 0; - float src0Scale = 1; - float src2Scale = 1; - int32_t dstZP = 0; - float dstScale = 1; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + int32_t width = 0; + int32_t element_size = 1; + int32_t input_zp0 = 0; + float input_scale0 = 1; + int32_t output_zp = 0; + float output_scale = 1; + int32_t i = 0; - attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); // ref + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); // update + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[5] ); // output - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &width); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &area); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &vol); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &coord_dim); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - - block_size = (int32_t)(attr[2]->shape->data[0]); - height = (int32_t)(attr[2]->shape->data[1]); - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) { - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - } - else if ( attr[0]->quant == 
VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } + element_size *= (int32_t)attr[0]->shape->data[i]; } + width = element_size / 8; + + input_zp0 = attr[0]->asymm.zero_point; + input_scale0 = attr[0]->asymm.scale; + output_zp = attr[1]->asymm.zero_point; + output_scale = attr[1]->asymm.scale; - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) { - src2Scale = attr[1]->asymm.scale; + input_scale0 = 1.0f; } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) { - if (attr[1]->dfp.fl > 0) - { - src2Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); - } - else - { - src2Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); - } - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstZP = attr[2]->asymm.zero_point; - dstScale = attr[2]->asymm.scale; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - dstScale = 1.0f / dstScale; - dstZP = 0; - } - - if (coord_dim == 5) - { - offset_idx = 1; - } - if (coord_dim == 4 || coord_dim == 5) - { - offsetX = vol; - offsetY = area; - offsetZ = width; - offsetW = 1; - } - else if (coord_dim == 3) - { - offsetX = area; - offsetY = width; - offsetZ = 1; - } - else if (coord_dim == 2) - { - offsetX = width; - offsetY = 1; - offsetZ = 0; - } - else if (coord_dim == 1) - { - offsetX = 1; - offsetY = 0; - offsetZ = 0; + output_scale = 1.0f; } gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = block_size; - gpu_param.global_size[1] = height; + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = 1; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -1272,38 +956,15 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - float output_zp = (float)dstZP; - float scaleInOut = src2Scale / dstScale; - gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift0); + gpu_quantize_multiplier_16bit( (double)input_scale0 / output_scale, &M0, &postShift0); multAndoutZP0[0] = (uint32_t)(M0); - multAndoutZP0[1] = (uint32_t)((dstZP << postShift0) - src0ZP * M0); + multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input_zp0 * M0); gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift0 ); status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); - status |= vsi_nn_kernel_gpu_add_param( node, 
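/*
 * Sketch (assumptions noted, helper name hypothetical): the reset kernel flattens
 * the whole tensor and lets each work item handle 8 elements; gpu_align_p2(x, 4)
 * is assumed to round the resulting work-item count up to a multiple of 4.
 */
static size_t reset_global_size(const size_t *dims, size_t rank)
{
    size_t element_count = 1;
    size_t i;
    for (i = 0; i < rank; i++)
    {
        element_count *= dims[i];
    }
    /* 8 elements per work item, then align the count up to a multiple of 4 */
    return ((element_count / 8) + 3) & ~(size_t)3;
}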
"offsetX", &offsetX ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); - status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &src2Scale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp ); - status |= vsi_nn_kernel_gpu_add_param( node, "scaleInOut", &scaleInOut ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); CHECK_STATUS_FAIL_GOTO(status, OnError); } @@ -1318,15 +979,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer) vsi_nn_kernel_tensor_attr_release( &attr[1] ); attr[1] = NULL; } - if (attr[2]) - { - vsi_nn_kernel_tensor_attr_release( &attr[2] ); - attr[2] = NULL; - } return status; -} /* _scatter_nd_update_post_initializer() */ +} /* _scatter_nd_update_reset_initializer() */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer) +DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -1335,168 +991,137 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer) { vsi_status status = VSI_FAILURE; gpu_param_t gpu_param = { - 3, + 2, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0} }; - vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; int32_t block_size = 1; + int32_t update_width = 1; + int32_t index_num = 1; int32_t width = 0; - int32_t height = 0; - int32_t count_width = 0; - - attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - - block_size = (int32_t)(attr[0]->shape->data[0]); - height = (int32_t)(attr[0]->shape->data[1]); - width = (int32_t)(block_size * height); - count_width = (int32_t)((height + 3) / 4); - - gpu_param.global_scale[0] = 1; - gpu_param.global_scale[1] = 1; - gpu_param.global_scale[2] = 1; - - gpu_param.global_size[0] = (width + 3) / 4; - gpu_param.global_size[1] = 1; - gpu_param.global_size[2] = 1; - - status = vsi_nn_kernel_gpu_config( node, &gpu_param ); - CHECK_STATUS_FAIL_GOTO(status, OnError); - - status = vsi_nn_kernel_gpu_add_param( node, "count_width", &count_width ); - CHECK_STATUS_FAIL_GOTO(status, OnError); - -OnError: - if (attr[0]) - { - vsi_nn_kernel_tensor_attr_release( &attr[0] ); - attr[0] = NULL; - } - return status; -} /* _scatter_nd_update_reset_initializer() */ - -DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - gpu_param_t gpu_param = { - 3, - {0, 0, 0}, - {0, 0, 0}, - {0, 0, 0}, - {0, 0, 0} - }; - - vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - int32_t block_size = 1; - int32_t width = 0; - int32_t height = 0; - - int32_t input0_zp = 0; - float input0_scale = 1.0f; - int32_t output_zp = 0; - float output_scale = 1.0f; + int32_t coord_dim = 0; + int32_t strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t coord_strides[8] = {0}; + int32_t *coord_strides1 = coord_strides + 4; + int32_t input2_zp = 0; + int32_t i = 0; - uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", 
OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - block_size = (int32_t)(attr[0]->shape->data[0]); - height = (int32_t)(attr[0]->shape->data[1]); - width = (int32_t)(block_size * height); - if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16) + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &strides[0]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &strides[1]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &strides[2]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &strides[3]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &strides[4]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &strides[5]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &strides[6]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &coord_dim); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + block_size = (int32_t)(attr[2]->shape->data[0]); + update_width = (int32_t)(attr[1]->shape->data[0]); + index_num = (int32_t)(attr[0]->shape->data[1]); + width = block_size; + if (block_size % 4 == 0) { - width = (width + 7) / 8; + update_width = update_width / 4; + width = block_size / 4; } - else if (attr[0]->dtype == U8 || attr[0]->dtype == I8) + + input2_zp = attr[1]->asymm.zero_point; + + coord_strides[coord_dim - 1] = 1; + for (i = 0; i < coord_dim - 1; i++) { - width = (width + 15) / 16; + coord_strides[i] = strides[coord_dim - 2 - i]; } - input0_zp = attr[0]->asymm.zero_point; - input0_scale = attr[0]->asymm.scale; - output_zp = attr[1]->asymm.zero_point; - output_scale = 1.0f / attr[1]->asymm.scale; - gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; gpu_param.global_size[0] = width; - gpu_param.global_size[1] = 1; + gpu_param.global_size[1] = index_num; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); CHECK_STATUS_FAIL_GOTO(status, OnError); -#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ - (IN0_TYPE | ( OUT_TYPE << 16)) - - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype ); - - switch( pack_key ) { - case _PACK_SELECT_KEY( I8, I8 ): - case _PACK_SELECT_KEY( U8, U8 ): - { - uint16_t M0 = 0; - int32_t postShift0 = 0; - uint32_t multAndoutZP0[2] = {0}; - - gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ - 0xdddddddd, // TCfg - 0x44444444, // ASelt - 0x13121110, 0x17161514, // ABin - 0x11111111, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ - 0xdddddddd, // TCfg - 0x44444444, // ASelt - 0x1b1a1918, 0x1f1e1d1c, // ABin - 0x11111111, // BSelt + gpu_dp_inst_t 
uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; - gpu_quantize_multiplier_16bit( (double)input0_scale * output_scale, &M0, &postShift0); + gpu_dp_inst_t uniConvertFp16ToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; - multAndoutZP0[0] = (uint32_t)(M0); - multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input0_zp * M0); + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 ); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift0 ); + status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride", &coord_strides ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride1", &coord_strides1 ); + CHECK_STATUS_FAIL_GOTO(status, OnError); - status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); + if (attr[1]->dtype == U8 || attr[1]->dtype == I8 || attr[1]->dtype == I16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &input2_zp ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[1]->dtype == F16 || attr[1]->dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertFp16ToFp32_4x4", &uniConvertFp16ToFp32_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - break; - default: - break; } -#undef _PACK_SELECT_KEY - OnError: if (attr[0]) { @@ -1508,10 +1133,15 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer) vsi_nn_kernel_tensor_attr_release( &attr[1] ); attr[1] = NULL; } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } return status; -} /* _scatter_nd_update_ref_initializer() */ +} /* _scatter_nd_update_update_initializer() */ -DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer) +DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ 
-1531,164 +1161,127 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer) int32_t block_size = 1; int32_t update_width = 1; int32_t index_num = 1; - int32_t width = 0, area = 0, vol = 0; + int32_t width = 0; int32_t coord_dim = 0; - int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; - int32_t input1_zp = 0; - float input1_scale = 1.0f; - int32_t output_zp = 0; + int32_t strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t coord_strides[8] = {0}; + int32_t *coord_strides1 = coord_strides + 4; + float output_zp = 0; + float input_scale = 1.0f; float output_scale = 1.0f; - uint32_t pack_key = 0; + float inout_scale = 1.0f; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &width); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &strides[0]); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &area); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &strides[1]); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &vol); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &strides[2]); CHECK_STATUS_FAIL_GOTO(status, OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &strides[3]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &strides[4]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &strides[5]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &strides[6]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &coord_dim); CHECK_STATUS_FAIL_GOTO(status, OnError ); block_size = (int32_t)(attr[2]->shape->data[0]); update_width = (int32_t)(attr[1]->shape->data[0]); index_num = (int32_t)(attr[0]->shape->data[1]); - input1_zp = attr[1]->asymm.zero_point; - input1_scale = attr[1]->asymm.scale; - output_zp = attr[2]->asymm.zero_point; - output_scale = 1.0f / attr[2]->asymm.scale; - - if (coord_dim == 5) - { - offset_idx = 1; - } - if (coord_dim == 4 || coord_dim == 5) - { - offsetX = vol; - offsetY = area; - offsetZ = width; - offsetW = 1; - } - else if (coord_dim == 3) - { - offsetX = area; - offsetY = width; - offsetZ = 1; - } - else if (coord_dim == 2) + input_scale = attr[1]->asymm.scale; + output_scale = attr[2]->asymm.scale; + output_zp = (float)attr[2]->asymm.zero_point; + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) { - offsetX = width; - offsetY = 1; - offsetZ = 0; + input_scale = 1.0f; } - else if (coord_dim == 1) + 
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE) { - offsetX = 1; - offsetY = 0; - offsetZ = 0; + output_scale = 1.0f; } + inout_scale = input_scale / output_scale; - if (attr[1]->dtype == F16 || attr[1]->dtype == I16 || attr[1]->dtype == U16) + coord_strides[coord_dim - 1] = 1; + for (i = 0; i < coord_dim - 1; i++) { - update_width = (update_width + 7) / 8; - } - else if (attr[1]->dtype == U8 || attr[1]->dtype == I8) - { - update_width = (update_width + 15) / 16; + coord_strides[i] = strides[coord_dim - 2 - i]; } - if (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == U16) - { - block_size = (block_size + 7) / 8; - } - else if (attr[2]->dtype == U8 || attr[2]->dtype == I8) + width = block_size; + if (block_size % 4 == 0) { - block_size = (block_size + 15) / 16; + width = block_size / 4; } gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = block_size; + gpu_param.global_size[0] = width; gpu_param.global_size[1] = index_num; gpu_param.global_size[2] = 1; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); - CHECK_STATUS_FAIL_GOTO(status, OnError); - - { - status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); - status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); - status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); - CHECK_STATUS_FAIL_GOTO(status, OnError); - } -#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ - (IN0_TYPE | ( OUT_TYPE << 16)) - - pack_key = _PACK_SELECT_KEY( attr[1]->dtype, attr[2]->dtype ); - - switch( pack_key ) - { - case _PACK_SELECT_KEY( I8, I8 ): - case _PACK_SELECT_KEY( U8, U8 ): - { - uint16_t M1 = 0; - int32_t postShift1 = 0; - uint32_t multAndoutZP1[2] = {0}; - - gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ - 0xdddddddd, // TCfg - 0x44444444, // ASelt - 0x13121110, 0x17161514, // ABin - 0x11111111, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ - 0xdddddddd, // TCfg - 0x44444444, // ASelt - 0x1b1a1918, 0x1f1e1d1c, // ABin - 0x11111111, // BSelt + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt 0x00000000, 0x00000000, // BBin - 0x00002600, // AccumType, ConstantType, and PostShift + 0x00002400, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_quantize_multiplier_16bit( (double)input1_scale * output_scale, &M1, &postShift1); - - multAndoutZP1[0] = (uint32_t)(M1); - multAndoutZP1[1] = (uint32_t)((output_zp << postShift1) - input1_zp * M1); + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 
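/*
 * Sketch (restating the coord_stride setup above, helper name hypothetical): the
 * per-axis strides read from the scalar parameters are reversed so coord_strides[]
 * lines up with the coordinate order stored in the index tensor (outermost axis
 * first, innermost stride of 1). On the GPU side each index row is then assumed to
 * be turned into a flat element offset by a dot product, while
 * inout_scale = input_scale / output_scale folds both quantization scales into a
 * single multiplier for the requantizing store.
 */
static int32_t flat_offset_from_coords(const int32_t *coords, const int32_t *coord_strides, int32_t coord_dim)
{
    int32_t offset = 0;
    int32_t i;
    for (i = 0; i < coord_dim; i++)
    {
        offset += coords[i] * coord_strides[i];   /* coords[0] is the outermost axis */
    }
    return offset;
}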
0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 ); - gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 ); + status = vsi_nn_kernel_gpu_add_param( node, "output_stride", &width ); + status |= vsi_nn_kernel_gpu_add_param( node, "ref_stride", &update_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride", &coord_strides ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride1", &coord_strides1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp ); + status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inout_scale ); + CHECK_STATUS_FAIL_GOTO(status, OnError); - status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); - CHECK_STATUS_FAIL_GOTO(status, OnError ); + if (attr[1]->dtype == U8 || attr[1]->dtype == I8 || attr[1]->dtype == I16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + else if (attr[1]->dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError); } - break; - default: - break; } -#undef _PACK_SELECT_KEY OnError: if (attr[0]) @@ -1707,7 +1300,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer) attr[2] = NULL; } return status; -} /* _scatter_nd_update_update_initializer() */ +} /* _scatter_nd_update_ref_initializer() */ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer) ( @@ -1718,7 +1311,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer) { vsi_status status = VSI_FAILURE; gpu_param_t gpu_param = { - 3, + 1, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, @@ -1726,31 +1319,27 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - int32_t block_size = 1; - int32_t width = 0; - int32_t height = 0; + int32_t width = 0; + int32_t element_size = 1; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - block_size = (int32_t)(attr[0]->shape->data[0]); - height = (int32_t)(attr[0]->shape->data[1]); - width = (int32_t)(block_size * height); - - if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16) - { - width = (width + 7) / 8; - } - else if (attr[0]->dtype == U8 || attr[0]->dtype == I8) + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) { - width = (width + 15) / 16; + element_size *= (int32_t)attr[0]->shape->data[i]; } + width = element_size / 8; gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = width; + gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = 1; gpu_param.global_size[2] = 1; @@ -1766,166 +1355,151 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer) return status; } /* _scatter_nd_update_copy_initializer() */ -/* - * Query kernel - */ static vsi_status _query_kernel ( vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* 
const* const outputs, - vsi_nn_kernel_t* kernel, - int32_t coord_dim, - int32_t isBig + vsi_nn_kernel_t* kernel_reset, + vsi_nn_kernel_t* kernel_update, + vsi_nn_kernel_t* kernel_ref, + vsi_nn_kernel_t* kernel_copy, + int32_t coord_flg, + int32_t opt_flg ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; vsi_nn_kernel_dtype_e input0_dtype = U8; - vsi_nn_kernel_dtype_e input2_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_dtype_e acc_dtype = I32; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0, isBig ); + if (input2_dtype == F16) + { + acc_dtype = F32; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, 0, output_dtype, 0, 0, 0); - for ( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reset_map); i ++ ) { - if ( scatter_nd_update_map[i].key == key ) + if ( scatter_nd_update_reset_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_map) ) + + if ( i < _cnt_of_array(scatter_nd_update_reset_map) ) { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_map[i].function_name ); - kernel->info.parameters = _scatter_nd_update_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _scatter_nd_update_kernel_param_def ); - if (isBig) - { - kernel->info.initialize = _scatter_nd_update_big_initializer; - } - else - { - kernel->info.initialize = _scatter_nd_update_initializer; - } + snprintf( kernel_reset->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reset_map[i].function_name ); + kernel_reset->info.parameters = _scatter_nd_update_reset_kernel_param_def; + kernel_reset->info.numParams = _SCATTER_ND_UPDATE_RESET_PARAM_NUM; + kernel_reset->info.initialize = _scatter_nd_update_reset_initializer; - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_map[i].source_name ); - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_map[i].source_name ); - status = VSI_SUCCESS; + scatter_nd_update_reset_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reset_map[i].source_name ); + } + else + { + status = VSI_FAILURE; } - return status; -} /* _query_kernel() */ - -static vsi_status _query_kernel_large - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel_reset, - vsi_nn_kernel_t* kernel_pre, - vsi_nn_kernel_t* kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_dtype_e input0_dtype = U8; - vsi_nn_kernel_dtype_e input2_dtype = F16; - vsi_nn_kernel_dtype_e output_dtype = U8; - uint32_t key = 0; - int i = 0; - - input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); - output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, I32, I32, 1, 1 ); + key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, coord_flg, opt_flg); - for ( i = 0; i < 
_cnt_of_array(scatter_nd_update_pre_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_update_map); i ++ ) { - if ( scatter_nd_update_pre_map[i].key == key ) + if ( scatter_nd_update_update_map[i].key == key ) { break; } } - - if ( i < _cnt_of_array(scatter_nd_update_pre_map) ) + if ( i < _cnt_of_array(scatter_nd_update_update_map) ) { - snprintf( kernel_pre->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_pre_map[i].function_name ); - kernel_pre->info.parameters = _scatter_nd_update_pre_kernel_param_def; - kernel_pre->info.numParams = _SCATTER_ND_UPDATE_PRE_PARAM_NUM; - kernel_pre->info.initialize = _scatter_nd_update_pre_initializer; + snprintf( kernel_update->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_update_map[i].function_name ); + kernel_update->info.parameters = _scatter_nd_update_update_kernel_param_def; + kernel_update->info.numParams = _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM; + kernel_update->info.initialize = _scatter_nd_update_update_initializer; - vsi_nn_kernel_add_source( kernel_pre, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_pre_map[i].source_name ); - vsi_nn_kernel_add_source( kernel_pre, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_pre_map[i].source_name ); + scatter_nd_update_update_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_update_map[i].source_name ); } else { - status = VSI_FAILURE; + status |= VSI_FAILURE; } + key = HASH_SCATTER_ND_UPDATE_KEY( 0, acc_dtype, output_dtype, 2, coord_flg, opt_flg); - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0, 1 ); - - for ( i = 0; i < _cnt_of_array(scatter_nd_update_post_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_ref_map); i ++ ) { - if ( scatter_nd_update_post_map[i].key == key ) + if ( scatter_nd_update_ref_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_post_map) ) + + if ( i < _cnt_of_array(scatter_nd_update_ref_map) ) { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_post_map[i].function_name ); - kernel->info.parameters = _scatter_nd_update_post_kernel_param_def; - kernel->info.numParams = _SCATTER_ND_UPDATE_POST_PARAM_NUM; - kernel->info.initialize = _scatter_nd_update_post_initializer; + snprintf( kernel_ref->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_ref_map[i].function_name ); + kernel_ref->info.parameters = _scatter_nd_update_ref_kernel_param_def; + kernel_ref->info.numParams = _SCATTER_ND_UPDATE_REF_PARAM_NUM; + kernel_ref->info.initialize = _scatter_nd_update_ref_initializer; - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_post_map[i].source_name ); - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_post_map[i].source_name ); + scatter_nd_update_ref_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_ref_map[i].source_name ); } else { - status |= VSI_FAILURE; + status = VSI_FAILURE; } - key = HASH_SCATTER_ND_UPDATE_KEY( I32, I32, I32, 2, 1 ); + key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 3, 0, 0); - for ( i = 0; i < _cnt_of_array(scatter_nd_update_reset_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_copy_map); i ++ 
) { - if ( scatter_nd_update_reset_map[i].key == key ) + if ( scatter_nd_update_copy_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_reset_map) ) + if ( i < _cnt_of_array(scatter_nd_update_copy_map) ) { - snprintf( kernel_reset->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_reset_map[i].function_name ); - kernel_reset->info.parameters = _scatter_nd_update_reset_kernel_param_def; - kernel_reset->info.numParams = _SCATTER_ND_UPDATE_RESET_PARAM_NUM; - kernel_reset->info.initialize = _scatter_nd_update_reset_initializer; + snprintf( kernel_copy->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_copy_map[i].function_name ); + kernel_copy->info.parameters = _scatter_nd_update_copy_kernel_param_def; + kernel_copy->info.numParams = _SCATTER_ND_UPDATE_COPY_PARAM_NUM; + kernel_copy->info.initialize = _scatter_nd_update_copy_initializer; - vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + vsi_nn_kernel_add_source( kernel_copy, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_reset_map[i].source_name ); - vsi_nn_kernel_add_source( kernel_reset, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_reset_map[i].source_name ); + scatter_nd_update_copy_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_copy, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_copy_map[i].source_name ); } else { status |= VSI_FAILURE; } + return status; -} /* _query_kernel_large() */ +} /* _query_kernel() */ static vsi_status _query_kernel_special ( @@ -1941,34 +1515,35 @@ static vsi_status _query_kernel_special vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 3, 1 ); + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 4, 1, 0); - for ( i = 0; i < _cnt_of_array(scatter_nd_update_ref_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_ref_map); i ++ ) { - if ( scatter_nd_update_ref_map[i].key == key ) + if ( scatter_nd_update_special_ref_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_ref_map) ) + if ( i < _cnt_of_array(scatter_nd_update_special_ref_map) ) { - snprintf( kernel_ref->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_ref_map[i].function_name ); - kernel_ref->info.parameters = _scatter_nd_update_ref_kernel_param_def; - kernel_ref->info.numParams = _SCATTER_ND_UPDATE_REF_PARAM_NUM; - kernel_ref->info.initialize = _scatter_nd_update_ref_initializer; + snprintf( kernel_ref->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_special_ref_map[i].function_name ); + kernel_ref->info.parameters = _scatter_nd_update_special_ref_kernel_param_def; + kernel_ref->info.numParams = _SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM; + kernel_ref->info.initialize = _scatter_nd_update_special_ref_initializer; vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_ref_map[i].source_name ); + scatter_nd_update_special_ref_map[i].source_name ); vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_ref_map[i].source_name ); + scatter_nd_update_special_ref_map[i].source_name ); } else 
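/*
 * Sketch of the selection pattern used by _query_kernel()/_query_kernel_special():
 * a packed key built from dtypes and flags is matched against a table of
 * {key, function_name, source_name} entries, and the first hit supplies the kernel
 * name, parameter list, initializer and program sources. The struct and helper
 * below are illustrative stand-ins, not the real definitions from this file.
 */
typedef struct
{
    uint32_t     key;
    const char * function_name;
    const char * source_name;
} kernel_map_entry_t;

static const kernel_map_entry_t * lookup_kernel_entry
    (
    const kernel_map_entry_t * map,
    size_t                     count,
    uint32_t                   key
    )
{
    size_t i;
    for (i = 0; i < count; i++)
    {
        if (map[i].key == key)
        {
            return &map[i];    /* first matching key wins */
        }
    }
    return NULL;               /* caller reports VSI_FAILURE */
}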
{ @@ -1976,54 +1551,56 @@ static vsi_status _query_kernel_special } - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 4, 1 ); + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 5, 1, 0); - for ( i = 0; i < _cnt_of_array(scatter_nd_update_update_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_update_map); i ++ ) { - if ( scatter_nd_update_update_map[i].key == key ) + if ( scatter_nd_update_special_update_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_update_map) ) + if ( i < _cnt_of_array(scatter_nd_update_special_update_map) ) { - snprintf( kernel_update->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_update_map[i].function_name ); - kernel_update->info.parameters = _scatter_nd_update_update_kernel_param_def; - kernel_update->info.numParams = _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM; - kernel_update->info.initialize = _scatter_nd_update_update_initializer; + snprintf( kernel_update->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_special_update_map[i].function_name ); + kernel_update->info.parameters = _scatter_nd_update_special_update_kernel_param_def; + kernel_update->info.numParams = _SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM; + kernel_update->info.initialize = _scatter_nd_update_special_update_initializer; vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_update_map[i].source_name ); + scatter_nd_update_special_update_map[i].source_name ); vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_update_map[i].source_name ); + scatter_nd_update_special_update_map[i].source_name ); } else { status |= VSI_FAILURE; } - key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 5, 1 ); + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 6, 1, 0); - for ( i = 0; i < _cnt_of_array(scatter_nd_update_copy_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_special_copy_map); i ++ ) { - if ( scatter_nd_update_copy_map[i].key == key ) + if ( scatter_nd_update_special_copy_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(scatter_nd_update_copy_map) ) + if ( i < _cnt_of_array(scatter_nd_update_special_copy_map) ) { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_copy_map[i].function_name ); - kernel->info.parameters = _scatter_nd_update_copy_kernel_param_def; - kernel->info.numParams = _SCATTER_ND_UPDATE_COPY_PARAM_NUM; - kernel->info.initialize = _scatter_nd_update_copy_initializer; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_special_copy_map[i].function_name ); + kernel->info.parameters = _scatter_nd_update_special_copy_kernel_param_def; + kernel->info.numParams = _SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM; + kernel->info.initialize = _scatter_nd_update_special_copy_initializer; vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - scatter_nd_update_copy_map[i].source_name ); + scatter_nd_update_special_copy_map[i].source_name ); vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - scatter_nd_update_copy_map[i].source_name ); + scatter_nd_update_special_copy_map[i].source_name ); } else { @@ -2044,41 +1621,37 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t tmp_params[_SCATTER_ND_UPDATE_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; 
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0}; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" ); - vsi_size_t *input_size = inputs[2]->attr.size; - uint32_t dims_num = inputs[2]->attr.dim_num; int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; - vsi_size_t width = 0, area = 0, vol = 0; int32_t big_flg = 0; vsi_nn_kernel_dtype_e update_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type); vsi_nn_kernel_dtype_e ref_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); vsi_nn_kernel_dtype_e output_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type); int32_t type_flg = ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16) && (update_dtype == ref_dtype && update_dtype == output_dtype)) ? 1 : 0; - int32_t special_flg = (block_size % 16 == 0 && type_flg) ? 1 : 0; + int32_t special_flg = (block_size % 16 == 0 && type_flg && coord_dim <= 4) ? 1 : 0; + int32_t coord_flg = 0; + int32_t opt_flg = (block_size % 4 == 0) ? 1 : 0; int32_t i = 0; int32_t isRepeat = 0; + vsi_nn_tensor_t * tensors[4] = { NULL }; + vsi_nn_kernel_t * ikernels[3] = { NULL }; - if (coord_dim > 4 && input_size[dims_num - 1] > 1) - { - return NULL; - } + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); status = get_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0, - NULL, NULL, NULL, &rs_idx_dim, &big_flg); + NULL, &rs_idx_dim, &big_flg); status |= get_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0, - NULL, NULL, NULL, &rs_in_dim, &big_flg); + NULL, &rs_in_dim, &big_flg); status |= get_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, - &width, &area, &vol, &rs_out_dim, &big_flg); - if (status != VSI_SUCCESS) - { - return NULL; - } + strides, &rs_out_dim, &big_flg); + CHECK_STATUS_FAIL_GOTO( status, final ); check_scatter_nd_update_index_repeat(inputs, coord_dim, block_size, idx_num, &isRepeat); @@ -2087,11 +1660,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_attr_t attr; vsi_nn_kernel_node_t tmp_node = NULL; vsi_nn_kernel_node_t ref_node = NULL; - vsi_nn_kernel_node_param_t ref_params[_SCATTER_ND_UPDATE_REF_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_UPDATE_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_param_t cpy_params[_SCATTER_ND_UPDATE_COPY_PARAM_NUM] = { NULL }; - vsi_nn_kernel_t * ikernels[2] = { NULL }; - vsi_nn_tensor_t * tensors[3] = { NULL }; + vsi_nn_kernel_node_param_t ref_params[_SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t cpy_params[_SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM] = { NULL }; ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); ikernels[0]->unique_id = kernel->unique_id; @@ -2127,7 +1698,8 @@ static vsi_nn_kernel_node_t _setup ref_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; - status = vsi_nn_kernel_node_pass_param( ref_node, ref_params, _SCATTER_ND_UPDATE_REF_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( ref_node, ref_params, + _SCATTER_ND_UPDATE_SPECIAL_REF_PARAM_NUM ); 
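/*
 * Sketch (restating the dispatch conditions in _setup(), helper name hypothetical):
 * the three-kernel "special" path requires matching 8/16-bit quantized dtypes on
 * ref/update/output, a block size that is a multiple of 16, and at most 4 index
 * coordinates; otherwise the general reset/update/ref/copy chain is used, with the
 * vectorized variant whenever the block size is a multiple of 4.
 */
static void scatter_nd_update_choose_path
    (
    vsi_nn_kernel_dtype_e ref_dtype,
    vsi_nn_kernel_dtype_e update_dtype,
    vsi_nn_kernel_dtype_e output_dtype,
    int32_t               block_size,
    int32_t               coord_dim,
    int32_t *             special_flg,
    int32_t *             opt_flg
    )
{
    int32_t type_flg = ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16) &&
                        update_dtype == ref_dtype && update_dtype == output_dtype) ? 1 : 0;

    *special_flg = (block_size % 16 == 0 && type_flg && coord_dim <= 4) ? 1 : 0;
    *opt_flg     = (block_size % 4 == 0) ? 1 : 0;
}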
CHECK_STATUS(status); vsi_nn_kernel_tensor_release( &ref_params[0] ); } @@ -2143,11 +1715,12 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); - status = vsi_nn_kernel_node_pass_param( tmp_node, node_params, _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( tmp_node, node_params, + _SCATTER_ND_UPDATE_SPECIAL_UPDATE_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_tensor_release( &node_params[0] ); vsi_nn_kernel_tensor_release( &node_params[1] ); @@ -2166,7 +1739,7 @@ static vsi_nn_kernel_node_t _setup cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; cpy_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); - status = vsi_nn_kernel_node_pass_param( node, cpy_params, _SCATTER_ND_UPDATE_COPY_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( node, cpy_params, _SCATTER_ND_UPDATE_SPECIAL_COPY_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_tensor_release( &cpy_params[2] ); } @@ -2195,106 +1768,159 @@ static vsi_nn_kernel_node_t _setup if (ref_node) {vsi_nn_kernel_node_release( &ref_node );} if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} } - else if ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16)) + else { vsi_nn_tensor_attr_t attr; - vsi_nn_kernel_node_t tmp_node = NULL; vsi_nn_kernel_node_t reset_node = NULL; - vsi_nn_kernel_node_param_t pre_params[_SCATTER_ND_UPDATE_PRE_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_POST_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t update_node = NULL; + vsi_nn_kernel_node_t ref_node = NULL; vsi_nn_kernel_node_param_t reset_params[_SCATTER_ND_UPDATE_RESET_PARAM_NUM] = { NULL }; - vsi_nn_kernel_t * ikernels[2] = { NULL }; - vsi_nn_tensor_t * tensors[3] = { NULL }; + vsi_nn_kernel_node_param_t ref_params[_SCATTER_ND_UPDATE_REF_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t update_params[_SCATTER_ND_UPDATE_UPDATE_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t cpy_params[_SCATTER_ND_UPDATE_COPY_PARAM_NUM] = { NULL }; + int32_t width = 1; + int32_t res = 0; ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); ikernels[0]->unique_id = kernel->unique_id; ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); ikernels[1]->unique_id = kernel->unique_id; + ikernels[2] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + ikernels[2]->unique_id = kernel->unique_id; memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); - attr.dtype.vx_type = VSI_NN_TYPE_INT32; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype = outputs[0]->attr.dtype; attr.is_const = FALSE; attr.vtl = TRUE; for (i = 0; i < rs_out_dim; i++) { attr.size[i] = shapes[2][i]; + width *= (int32_t)shapes[2][i]; } attr.dim_num = rs_out_dim; - 
tensors[0] = vsi_nn_CreateTensor( graph, &attr ); + res = width % 8; + width = (width >> 3) << 3; + + tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref' + attr.dtype = inputs[2]->attr.dtype; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + if (update_dtype == F16) + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } + tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // temp_buf_int attr.size[0] = 1; - tensors[1] = vsi_nn_CreateTensor( graph, &attr ); attr.size[1] = 1; - tensors[2] = vsi_nn_CreateTensor( graph, &attr ); + tensors[2] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0 + tensors[3] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer1 - status = _query_kernel_large( inputs, outputs, ikernels[0], ikernels[1], kernel); + status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], ikernels[2], kernel, coord_flg, opt_flg); if ( VSI_SUCCESS == status) { - // reset count + // convert ref to output reset_node = vsi_nn_kernel_create_node( graph, ikernels[0] ); if (reset_node) { uint32_t index = 0; /* Pass parameters to node. */ - reset_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + reset_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); reset_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; reset_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + reset_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + reset_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); status = vsi_nn_kernel_node_pass_param( reset_node, reset_params, _SCATTER_ND_UPDATE_RESET_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_tensor_release( &reset_params[0] ); + vsi_nn_kernel_scalar_release( &reset_params[3] ); + vsi_nn_kernel_scalar_release( &reset_params[4] ); } - // pre-process - tmp_node = vsi_nn_kernel_create_node( graph, ikernels[1] ); - if (tmp_node) + // update + update_node = vsi_nn_kernel_create_node( graph, ikernels[1] ); + if (update_node) + { + uint32_t index = 0; + /* Pass parameters to node. 
*/ + update_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + update_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + update_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + update_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[3] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[4] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[5] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[6] ); + update_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( update_node, update_params, + _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &update_params[0] ); + vsi_nn_kernel_tensor_release( &update_params[1] ); + vsi_nn_kernel_scalar_release( &update_params[4] ); + vsi_nn_kernel_scalar_release( &update_params[5] ); + vsi_nn_kernel_scalar_release( &update_params[6] ); + vsi_nn_kernel_scalar_release( &update_params[7] ); + vsi_nn_kernel_scalar_release( &update_params[8] ); + vsi_nn_kernel_scalar_release( &update_params[9] ); + vsi_nn_kernel_scalar_release( &update_params[10] ); + vsi_nn_kernel_scalar_release( &update_params[11] ); + } + + // ref + ref_node = vsi_nn_kernel_create_node( graph, ikernels[2] ); + if (ref_node) { uint32_t index = 0; /* Pass parameters to node. 
*/ - pre_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); - pre_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); - pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; - pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; - pre_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; - pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); - pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); - pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); - pre_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); - status = vsi_nn_kernel_node_pass_param( tmp_node, pre_params, _SCATTER_ND_UPDATE_PRE_PARAM_NUM ); + ref_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + ref_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; + ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[3]->t; + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[3] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[4] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[5] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[6] ); + ref_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( ref_node, ref_params, _SCATTER_ND_UPDATE_REF_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_tensor_release( &pre_params[0] ); - vsi_nn_kernel_tensor_release( &pre_params[1] ); - vsi_nn_kernel_scalar_release( &pre_params[5] ); - vsi_nn_kernel_scalar_release( &pre_params[6] ); - vsi_nn_kernel_scalar_release( &pre_params[7] ); - vsi_nn_kernel_scalar_release( &pre_params[8] ); + vsi_nn_kernel_tensor_release( &ref_params[0] ); + vsi_nn_kernel_tensor_release( &ref_params[1] ); + vsi_nn_kernel_scalar_release( &ref_params[6] ); + vsi_nn_kernel_scalar_release( &ref_params[7] ); + vsi_nn_kernel_scalar_release( &ref_params[8] ); + vsi_nn_kernel_scalar_release( &ref_params[9] ); + vsi_nn_kernel_scalar_release( &ref_params[10] ); + vsi_nn_kernel_scalar_release( &ref_params[11] ); + vsi_nn_kernel_scalar_release( &ref_params[12] ); + vsi_nn_kernel_scalar_release( &ref_params[13] ); } + // copy to output node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { uint32_t index = 0; /* Pass parameters to node. 
*/ - node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); - node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; - node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; - node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; - node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); - node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ND_UPDATE_POST_PARAM_NUM ); + cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[3]->t; + cpy_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + cpy_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + cpy_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); + status = vsi_nn_kernel_node_pass_param( node, cpy_params, _SCATTER_ND_UPDATE_COPY_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_tensor_release( &node_params[0] ); - vsi_nn_kernel_tensor_release( &node_params[4] ); - vsi_nn_kernel_tensor_release( &node_params[5] ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - vsi_nn_kernel_scalar_release( &node_params[7] ); - vsi_nn_kernel_scalar_release( &node_params[8] ); - vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_tensor_release( &cpy_params[2] ); + vsi_nn_kernel_scalar_release( &cpy_params[3] ); + vsi_nn_kernel_scalar_release( &cpy_params[4] ); } } @@ -2306,6 +1932,10 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_kernel_release( &ikernels[1] ); } + if ( ikernels[2] ) + { + vsi_nn_kernel_release( &ikernels[2] ); + } if ( tensors[0] ) { vsi_nn_ReleaseTensor( &tensors[0] ); @@ -2318,41 +1948,33 @@ static vsi_nn_kernel_node_t _setup { vsi_nn_ReleaseTensor( &tensors[2] ); } + if ( tensors[3] ) + { + vsi_nn_ReleaseTensor( &tensors[3] ); + } + if (ref_node) {vsi_nn_kernel_node_release( &ref_node );} if (reset_node) {vsi_nn_kernel_node_release( &reset_node );} - if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + if (update_node) {vsi_nn_kernel_node_release( &update_node );} } - else + +final: + if (ikernels[0]) { - status = _query_kernel( inputs, outputs, kernel, coord_dim, big_flg); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - uint32_t index = 0; - /* Pass parameters to node. 
*/ - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); - //tmp_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; - tmp_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); - tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); - status = vsi_nn_kernel_node_pass_param( node, tmp_params, _SCATTER_ND_UPDATE_PARAM_NUM ); - CHECK_STATUS(status); - vsi_nn_kernel_tensor_release( &tmp_params[0] ); - vsi_nn_kernel_tensor_release( &tmp_params[1] ); - vsi_nn_kernel_tensor_release( &tmp_params[2] ); - vsi_nn_kernel_tensor_release( &tmp_params[3] ); - vsi_nn_kernel_scalar_release( &tmp_params[4] ); - vsi_nn_kernel_scalar_release( &tmp_params[5] ); - vsi_nn_kernel_scalar_release( &tmp_params[6] ); - vsi_nn_kernel_scalar_release( &tmp_params[7] ); - } - } + vsi_nn_kernel_release(&ikernels[0]); + } + if (ikernels[1]) + { + vsi_nn_kernel_release(&ikernels[1]); + } + if (ikernels[2]) + { + vsi_nn_kernel_release(&ikernels[2]); } + vsi_safe_release_tensor(tensors[0]); + vsi_safe_release_tensor(tensors[1]); + vsi_safe_release_tensor(tensors[2]); + vsi_safe_release_tensor(tensors[3]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c index fae6ad78c..b918e2c08 100644 --- a/src/tim/vx/internal/src/kernel/evis/select_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c @@ -34,6 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" __BEGIN_DECLS @@ -61,6 +62,10 @@ typedef enum _internal_img_dim_e CVIVANTE_NAMESPACE("evis.select_"STR(COND_DTYPE)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ _SELECT_KERNEL_SOURCE} +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + typedef struct { uint32_t key; @@ -138,7 +143,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer) (( IN0_TYPE << 24) | ( IN1_TYPE << 16) | ( OUT_TYPE << 8)) #define MAX_MULTIPLIER_NUM (65535) #define MAX_POST_SHIFT_BITS (31) - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. 
gpu_param_t gpu_param = { 3, @@ -166,6 +171,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer) uint16_t in1_M0 = 0; int32_t in1_postShift = 0; uint32_t pack_key = 0; + + VSI_UNREFERENCED(param_size); input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1); @@ -444,15 +451,67 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_SELECT_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t* shapes_ptr[_IO_NUM]; + vsi_size_t* shapes_in[_INPUT_NUM]; + vsi_size_t rank_in[_INPUT_NUM]; + uint32_t new_rank = 0; + uint32_t i = 0; + vsi_bool ret = FALSE; + + VSI_UNREFERENCED(params); + + for (i = 0; i < _IO_NUM; i++) + { + shapes_ptr[i] = shapes[i]; + } + + for (i = 0; i < _INPUT_NUM; i++) + { + shapes_in[i] = inputs[i]->attr.size; + rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num; + } + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const vsi_size_t**)shapes_in, rank_in, _INPUT_NUM, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes_ptr, shapes[_INPUT_NUM], &new_rank); + + if ( ret ) + { + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], shapes[i], new_rank ); + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[i], shapes[i + _INPUT_NUM], new_rank ); + } + } + else + { + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = inputs[i]; + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + reshape_tensors[i + _INPUT_NUM] = outputs[i]; + } + } - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[3]->attr.size, + reshape_tensors[3]->attr.dim_num ) ) { return NULL; } - image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( kernel, inputs, outputs, image_2d); + image_2d = (reshape_tensors[3]->attr.dim_num == 2); + status = _query_kernel( kernel, inputs, &reshape_tensors[3], image_2d); if ( VSI_SUCCESS == status) { @@ -460,12 +519,22 @@ static vsi_nn_kernel_node_t _setup if ( node ) { /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM, - inputs, input_num, outputs, output_num ); + &reshape_tensors[0], input_num, &reshape_tensors[3], output_num ); + /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, _SELECT_PARAM_NUM ); } } + if (ret) + { + for (i = 0; i < _IO_NUM; i++) + { + vsi_safe_release_tensor( reshape_tensors[i] ); + } + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c index 5d7e2d6cf..b2e22ed7c 100644 --- a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c @@ -123,6 +123,8 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) int32_t output_zp = 0; int32_t input_zp = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -252,7 +254,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype = U8; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; + size_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -299,9 +301,11 @@ static int32_t _optimize_mask_shape vsi_status status = VSI_SUCCESS; vsi_size_t in_shape[VSI_NN_MAX_DIM_NUM] = {0}; vsi_size_t new_rank = 0; - uint32_t i = 0; + vsi_size_t i = 0; + + VSI_UNREFERENCED(outputs); - for(i = 0; i < inputs[0]->attr.dim_num; i++) + for (i = 0; i < (vsi_size_t)inputs[0]->attr.dim_num; i++) { in_shape[i] = inputs[0]->attr.size[i]; } @@ -313,7 +317,7 @@ static int32_t _optimize_mask_shape } opt_shape_out[0] = max_len; - for(i = 0; i < (uint32_t)new_rank; i++) + for (i = 0; i < new_rank; i++) { opt_shape_out[i + 1] = opt_shape_in[i]; } @@ -344,6 +348,9 @@ static vsi_nn_kernel_node_t _setup int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); int32_t is2Dflg = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c b/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c index bcfe0d01c..6fca37fce 100644 --- a/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c @@ -95,7 +95,10 @@ DEF_KERNEL_INITIALIZER(_signal_frame_initializer) vsi_nn_kernel_tensor_attr_t * attr = NULL; vsi_size_array_t * out_shape = NULL; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); out_shape = attr->shape; gpu_param.global_scale[0] = 16; diff --git a/src/tim/vx/internal/src/kernel/evis/slice_evis.c b/src/tim/vx/internal/src/kernel/evis/slice_evis.c index 883947073..773d38b0d 100644 --- a/src/tim/vx/internal/src/kernel/evis/slice_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/slice_evis.c @@ -162,6 +162,8 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) int32_t is_samefl = 0; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); @@ -409,6 +411,8 @@ static vsi_nn_kernel_node_t _setup vsi_size_t output_batch = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; vsi_bool is_same_quant = FALSE; + VSI_UNREFERENCED(params); + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, shapes[0], &rank[0]); vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, diff --git a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c index 2b9d53e94..f95405aca 100644 --- a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c @@ -125,6 +125,8 @@ DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer) uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -273,7 +275,9 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; + size_t i = 0; + + VSI_UNREFERENCED(params); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); @@ -323,6 +327,9 @@ static vsi_nn_kernel_node_t _setup int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 1 : 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c index 46595a170..f31de5495 100644 --- a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c @@ -165,6 +165,8 @@ DEF_KERNEL_INITIALIZER(_get_matrix_initializer) float output_h = 1.0f; float scale[4] = {0}; + VSI_UNREFERENCED(param_size); + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -256,6 +258,8 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer) float output_scale = 1.0f; float output_zp = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); @@ -309,7 +313,6 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer) gpu_param.global_size[1] = out_shape->data[1]; gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; - do { gpu_dp_inst_t uniConvertDatatoF32_0_4x4 = {{ 0x01010101, // TCfg @@ -369,7 +372,7 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer) "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); } CHECK_STATUS_FAIL_GOTO(status, final ); - }while(0); + } status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -502,6 +505,9 @@ static vsi_nn_kernel_node_t _setup float output_h = (float)outputs[0]->attr.size[1]; int32_t i = 0; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + if (align_corners && output_w > 1) { output_w = output_w - 1; @@ -565,42 +571,46 @@ static vsi_nn_kernel_node_t _setup // Get Matrix node = vsi_nn_kernel_create_node( graph, ikernels[MATRIX_INDEX] ); - vsi_nn_kernel_node_pack_io( node_params, _GET_MATRIX_PARAM_NUM, - &inputs[1], 1, &tensors[0], 1 ); - node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_1 ); - node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 ); - node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 ); - node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 ); - node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 ); - node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 ); - node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 ); - node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 ); - node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 ); - node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 ); - node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 ); - node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 ); - node_params[I_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &input_w ); - node_params[I_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &input_h ); - node_params[O_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &output_w ); - node_params[O_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &output_h ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _GET_MATRIX_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] ); - vsi_nn_kernel_scalar_release( &node_params[I_WIDTH] ); - vsi_nn_kernel_scalar_release( &node_params[I_HEIGHT] ); - vsi_nn_kernel_scalar_release( &node_params[O_WIDTH] ); - vsi_nn_kernel_scalar_release( &node_params[O_HEIGHT] ); - vsi_nn_kernel_node_release( &node ); + + if (node) + { + vsi_nn_kernel_node_pack_io( node_params, _GET_MATRIX_PARAM_NUM, + &inputs[1], 1, &tensors[0], 1 ); + node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_1 ); + node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 ); + 
node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 ); + node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 ); + node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 ); + node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 ); + node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 ); + node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 ); + node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 ); + node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 ); + node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 ); + node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 ); + node_params[I_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &input_w ); + node_params[I_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &input_h ); + node_params[O_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &output_w ); + node_params[O_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &output_h ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _GET_MATRIX_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] ); + vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] ); + vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] ); + vsi_nn_kernel_scalar_release( &node_params[I_WIDTH] ); + vsi_nn_kernel_scalar_release( &node_params[I_HEIGHT] ); + vsi_nn_kernel_scalar_release( &node_params[O_WIDTH] ); + vsi_nn_kernel_scalar_release( &node_params[O_HEIGHT] ); + vsi_nn_kernel_node_release( &node ); + } // Warp Affine node = vsi_nn_kernel_create_node( graph, ikernels[WARP_AFFINE_INDEX] ); @@ -617,19 +627,26 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; } status = vsi_nn_kernel_node_set_border( node, &border ); - VSI_ASSERT( status == VSI_SUCCESS ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + vsi_nn_kernel_node_pack_io( warp_affine_node_params, _WARP_AFFINE_PARAM_NUM, + warp_affine_tensors, 2, outputs, 1 ); + status = vsi_nn_kernel_node_pass_param( node, warp_affine_node_params, _WARP_AFFINE_PARAM_NUM ); + if ( VSI_SUCCESS != status ) + { + goto final; + } } - vsi_nn_kernel_node_pack_io( warp_affine_node_params, _WARP_AFFINE_PARAM_NUM, - warp_affine_tensors, 2, outputs, 1 ); - status = vsi_nn_kernel_node_pass_param( node, warp_affine_node_params, _WARP_AFFINE_PARAM_NUM ); final: - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { - if( ikernels[i] ) + if ( ikernels[i] ) { vsi_nn_kernel_release( &ikernels[i] ); } - if( tensors[i] ) + if ( tensors[i] ) { vsi_nn_ReleaseTensor( &tensors[i] ); } diff --git a/src/tim/vx/internal/src/kernel/evis/swish_evis.c b/src/tim/vx/internal/src/kernel/evis/swish_evis.c index 724037575..befe6ac74 100644 --- 
a/src/tim/vx/internal/src/kernel/evis/swish_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/swish_evis.c @@ -154,7 +154,7 @@ DEF_KERNEL_INITIALIZER(_swish_initializer) size_t param_size ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -177,6 +177,8 @@ DEF_KERNEL_INITIALIZER(_swish_initializer) vsi_size_array_t *out_shape = NULL; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -365,7 +367,7 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer) size_t param_size ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; // Alignment with a power of two value. gpu_param_t gpu_param = { 3, @@ -387,6 +389,8 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer) vsi_size_array_t *out_shape = NULL; uint32_t pack_key = 0; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -649,6 +653,9 @@ static vsi_nn_kernel_node_t _setup int32_t swish_type = vsi_nn_kernel_param_get_int32( params, "type" ); float beta = 1.0f; vsi_bool ret = FALSE; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); #if (VX_ACTIVATION_EXT_SUPPORT) if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) { diff --git a/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c b/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c index 15854526a..4a57905ce 100644 --- a/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c @@ -123,6 +123,8 @@ DEF_KERNEL_INITIALIZER(_tensorstackconcat_initializer) vsi_size_array_t * in_shape = NULL; // Add initializer + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -225,6 +227,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(params); + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); status = _query_kernel( kernel, inputs, outputs, image_2d ); if ( VSI_SUCCESS == status) diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index 50e43cf81..f46941aff 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -272,6 +272,8 @@ DEF_KERNEL_INITIALIZER(_tile_initializer) int32_t output_ZP = 0; int32_t input_ZP = 0; + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -408,7 +410,7 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int32_t i = 0; + size_t i = 0; int32_t dim0_size1 = inputs[0]->attr.size[0] == 1 ? 
1 : 0; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -497,6 +499,11 @@ static vsi_nn_kernel_node_t _setup uint32_t dim = inputs[0]->attr.dim_num; vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 0 }; + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + + for ( i = 0; i < dim; i++) { multiples[i] = outputs[0]->attr.size[i] / inputs[0]->attr.size[i]; @@ -515,10 +522,34 @@ static vsi_nn_kernel_node_t _setup return NULL; } - reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], shapes[0], new_rank ); - reshape_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], shapes[2], new_rank ); + if ( new_rank == 4) + { + vsi_size_t newshapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + newshapes[0][0] = shapes[0][0]; + newshapes[2][0] = shapes[2][0]; + newshapes[0][1] = shapes[0][1]; + newshapes[2][1] = shapes[2][1]; + newshapes[0][2] = shapes[0][2] * shapes[0][3]; + newshapes[2][2] = shapes[2][2] * shapes[2][3]; + + if (newshapes[0][2] >= GPU_TENSOR_MAX_WIDTH || + newshapes[2][2] >= GPU_TENSOR_MAX_WIDTH) + { + return NULL; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], newshapes[0], 3 ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], newshapes[2], 3 ); + } + else + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + } } else { @@ -532,7 +563,7 @@ static vsi_nn_kernel_node_t _setup } remainder = reshape_tensors[0]->attr.size[0] % 8; - image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1); + image_2d = reshape_tensors[0]->attr.dim_num == 2; status = _query_kernel( &reshape_tensors[0], &reshape_tensors[1], image_2d, remainder, kernel ); if( VSI_SUCCESS == status) { @@ -540,9 +571,9 @@ static vsi_nn_kernel_node_t _setup if( node ) { /* Pass parameters to node. */ - vsi_size_t depthIn = new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1; - vsi_size_t depthOut = new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1; - vsi_size_t batchIn = new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1; + vsi_size_t depthIn = new_rank > 2 ? shapes[0][2] : 1; + vsi_size_t depthOut = new_rank > 2 ? shapes[2][2] : 1; + vsi_size_t batchIn = new_rank > 3 ? shapes[0][3] : 1; shapes[1][2] = shapes[1][2] == 0 ? 1 : shapes[1][2]; shapes[1][3] = shapes[1][3] == 0 ? 
1 : shapes[1][3]; diff --git a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c index 0ac1b6d28..fb78c4905 100644 --- a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c @@ -162,6 +162,8 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer) float factorOut = 1.0f; vsi_bool image_2d = FALSE; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); axis_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); diff --git a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c index 27a478b0e..6bc113f3c 100644 --- a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c @@ -152,6 +152,8 @@ DEF_KERNEL_INITIALIZER(_upsamplescale_initializer) uint32_t pack_key = 0; _internal_upscale_e flag = UP_ORG; + VSI_UNREFERENCED(param_size); + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); @@ -344,7 +346,7 @@ static vsi_status _query_kernel _internal_upscale_e flag = (stride == 2 && scale >= 0 ) ? UP_K2 : UP_ORG; uint32_t key = 0; - int i; + size_t i; in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index aa05c359d..83334269c 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -136,6 +136,10 @@ static vsi_status VX_CALLBACK _kernel_validator vx_meta_format metas[] ) { + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(parameters); + VSI_UNREFERENCED(num); + VSI_UNREFERENCED(metas); return VSI_SUCCESS; } /* _kernel_validator() */ @@ -146,6 +150,9 @@ static vsi_status VX_CALLBACK _kernel_initializer uint32_t paraNum ) { + VSI_UNREFERENCED(nodObj); + VSI_UNREFERENCED(paramObj); + VSI_UNREFERENCED(paraNum); return VSI_SUCCESS; } /* _kernel_initializer() */ @@ -156,6 +163,9 @@ static vsi_status VX_CALLBACK _kernel_deinitializer uint32_t paraNum ) { + VSI_UNREFERENCED(nodObj); + VSI_UNREFERENCED(paraObj); + VSI_UNREFERENCED(paraNum); return VSI_SUCCESS; } /* _kernel_deinitializer() */ @@ -287,6 +297,9 @@ static const uint8_t* _load_internal_executable vsi_nn_kernel_type_e type ) { + VSI_UNREFERENCED(source_name); + VSI_UNREFERENCED(size); + VSI_UNREFERENCED(type); #if VSI_USE_VXC_BINARY switch( type ) { @@ -518,8 +531,10 @@ static vx_program _create_program_from_executable program_info.data = _load_internal_executable( source_info->data[0], &program_info.size, kernel->type); + CHECK_PTR_FAIL_GOTO( program_info.data, "Create buffer fail.", final ); program = vxCreateProgramWithBinary( graph->ctx->c, (const vx_uint8 *)program_info.data, program_info.size ); +final: return program; } /* _create_program_from_executable() */ diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c index ecbdccf06..26c918079 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c +++ 
b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c @@ -113,6 +113,12 @@ static vsi_size_t eltwise_fill_dim vsi_size_t divisor = 0; vsi_size_t remainder = 0; compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); + if (divisor == 0) + { + VSILOGE( "divisor might be used in a division by zero." ); + cost_size = (vsi_size_t)-1; + goto final; + } remainder = size_output / divisor; if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) { @@ -152,6 +158,7 @@ static vsi_size_t eltwise_fill_dim shape_output[rank + 1] = remainder; } } +final: return cost_size; } /* eltwise_fill_dim() */ @@ -177,11 +184,11 @@ vsi_bool vsi_nn_kernel_optimize_eltwise_shape eltwise_broadcast_state_e prv_state = ELTWISE_BROADCAST_STATE_EMPTY; #define _swap_size(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } for( i = 0; i < rank_output; i++ ) { sx = i < rank_x ? shape_x[i] : 1; @@ -352,6 +359,12 @@ static vsi_size_t broadcast_fill_dim vsi_size_t divisor = 0; vsi_size_t remainder = 0; compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); + if (divisor == 0) + { + VSILOGE( "divisor might be used in a division by zero." ); + cost_size = (vsi_size_t)-1; + goto final; + } remainder = size_output / divisor; if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) { @@ -386,6 +399,7 @@ static vsi_size_t broadcast_fill_dim shape_output[rank + 1] = remainder; } } +final: return cost_size; } /* broadcast_fill_dim() */ @@ -412,11 +426,11 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape int32_t prv_state_mask = -1; #define _swap_size(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) + } if (input_num > MAX_INPUT_NUM) { diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index f3a8f4fce..18919b4d5 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -28,6 +28,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_math.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "kernel/vsi_nn_kernel_eltwise.h" static vsi_bool compute_gpu_divisor ( @@ -84,6 +85,12 @@ static vsi_size_t element_fill_dim vsi_size_t divisor = 0; vsi_size_t remainder = 0; compute_gpu_divisor( size_x, max_rank, 1, &divisor ); + if (divisor == 0) + { + VSILOGE( "divisor might be used in a division by zero." ); + cost_size = (vsi_size_t)-1; + goto final; + } remainder = size_x / divisor; if ( remainder > max_rank || rank_x >= max_rank) { @@ -109,6 +116,7 @@ static vsi_size_t element_fill_dim } } } +final: return cost_size; } /* element_fill_dim() */ @@ -132,6 +140,9 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape vsi_size_t outerSize = 1; vsi_size_t axisSize = 1; + VSI_UNREFERENCED(shape_output); + VSI_UNREFERENCED(rank_output); + for (i = 0; i < axis_size; i++) { axisSize *= shape_x[axis[i]]; @@ -391,6 +402,12 @@ static vsi_size_t tile_fill_dim vsi_size_t divisor = 0; vsi_size_t remainder = 0; compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); + if (divisor == 0) + { + VSILOGE( "divisor might be used in a division by zero." 
); + cost_size = (vsi_size_t)-1; + goto final; + } remainder = size_output / divisor; if ( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) { @@ -430,6 +447,7 @@ static vsi_size_t tile_fill_dim shape_output[rank + 1] = remainder; } } +final: return cost_size; } /* eltwise_fill_dim() */ @@ -442,35 +460,126 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape vsi_size_t* out_shape_output, vsi_size_t* out_rank_output ) { - vsi_bool ret = TRUE; - vsi_bool append_dim = FALSE; - vsi_size_t i = 0; - vsi_size_t dims = 0; + vsi_bool ret = TRUE; + vsi_bool append_dim = FALSE; + vsi_size_t i = 0; + vsi_size_t j = 0; + vsi_size_t dims = 0; vsi_size_t effective_size_x = 1; vsi_size_t effective_size_y = 1; vsi_size_t effective_size_z = 1; vsi_size_t sx = 0; vsi_size_t sy = 0; vsi_size_t sz = 0; + int32_t idx_start = -1; + int32_t idx_end = 0; tile_axis_state_e state = TILE_STATE_EMPTY; tile_axis_state_e next_state = TILE_STATE_EMPTY; + vsi_size_t* temp_shape_x = NULL; + vsi_size_t* temp_shape_y = NULL; + vsi_size_t* temp_shape_output = NULL; + vsi_size_t temp_rank = 0; #define _swap_size(a, b, tmp) \ - do { \ + { \ tmp = a; \ a = b; \ b = tmp; \ - } while(0) - for( i = 0; i < rank_output; i++ ) + } + + VSI_UNREFERENCED(rank_x); + VSI_UNREFERENCED(rank); + + temp_shape_x = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t)); + if (temp_shape_x == NULL) { - sx = shape_x[i]; - sy = multiples[i]; - sz = shape_output[i]; + VSILOGE( "malloc temp_shape_x error." ); + ret = FALSE; + goto final; + } + + temp_shape_y = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t)); + if (temp_shape_y == NULL) + { + VSILOGE( "malloc temp_shape_y error." ); + ret = FALSE; + goto final; + } + + temp_shape_output = (vsi_size_t*)malloc(rank * sizeof(vsi_size_t)); + if (temp_shape_output == NULL) + { + VSILOGE( "malloc temp_shape_output error." 
); + ret = FALSE; + goto final; + } + memcpy(temp_shape_x, shape_x, rank * sizeof(vsi_size_t)); + memcpy(temp_shape_y, multiples, rank * sizeof(vsi_size_t)); + memcpy(temp_shape_output, shape_output, rank * sizeof(vsi_size_t)); + + for (i = 0, temp_rank = 0; i < rank_output; i++) + { + if (i == rank_output - 1 && temp_shape_x[i] == 1) + { + if (idx_start >= 0) + { + sx = 1; + sy = temp_shape_y[idx_start]; + sz = temp_shape_output[idx_start]; + idx_end = (int32_t)i ; + for (j = (vsi_size_t)idx_start + 1; j <= (vsi_size_t)idx_end; j++) + { + sy *= temp_shape_y[j]; + sz *= temp_shape_output[j]; + } + temp_rank += tile_fill_dim( temp_shape_x, temp_shape_y, temp_shape_output, + temp_rank, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); + idx_start = -1; + } + else + { + temp_shape_x[temp_rank] = temp_shape_x[i]; + temp_shape_y[temp_rank] = temp_shape_y[i]; + temp_shape_output[temp_rank++] = temp_shape_output[i]; + } + } + else if (temp_shape_x[i] != 1) + { + idx_end = (int32_t)i - 1; + if (idx_start >= 0) + { + sx = 1; + sy = temp_shape_y[idx_start]; + sz = temp_shape_output[idx_start]; + for (j = (vsi_size_t)idx_start + 1; j <= (vsi_size_t)idx_end; j++) + { + sy *= temp_shape_y[j]; + sz *= temp_shape_output[j]; + } + temp_rank += tile_fill_dim( temp_shape_x, temp_shape_y, temp_shape_output, + temp_rank, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); + idx_start = -1; + } + temp_shape_x[temp_rank] = temp_shape_x[i]; + temp_shape_y[temp_rank] = temp_shape_y[i]; + temp_shape_output[temp_rank++] = temp_shape_output[i]; + } + else if (idx_start == -1) + { + idx_start = (int32_t)i; + } + } + + for( i = 0; i < temp_rank; i++ ) + { + sx = temp_shape_x[i]; + sy = temp_shape_y[i]; + sz = temp_shape_output[i]; /* * Skip dim if the size is equal to 1 * Also skip if ( sx == 1 && sy == 1 ) */ - if ( shape_output[i] == 1 ) + if ( temp_shape_output[i] == 1 ) { continue; } @@ -490,8 +599,8 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape VSI_ASSERT( FALSE ); } - next_state = (i + 1) < rank_output ? - (multiples[i + 1] == 1 ? TILE_STATE_NO_AXIS : TILE_STATE_AXIS_X) : TILE_STATE_EMPTY; + next_state = (i + 1) < temp_rank ? + (temp_shape_y[i + 1] == 1 ? TILE_STATE_NO_AXIS : TILE_STATE_AXIS_X) : TILE_STATE_EMPTY; append_dim = FALSE; #define _pack_state( cur_state, next_state ) (next_state << 16 | cur_state) @@ -507,9 +616,13 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape * ...,x1,x2,... * ...,y1,y2,... 
*/ + case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_EMPTY ): + effective_size_x = sx; + effective_size_y = sy; + effective_size_z = sz; + break; case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_AXIS_X ): case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_NO_AXIS ): - case _pack_state( TILE_STATE_AXIS_X, TILE_STATE_EMPTY ): append_dim = TRUE; break; /* @@ -548,7 +661,7 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape if ( ret ) { /* Append the last dim */ - if ( i == rank_output ) + if ( i == temp_rank ) { sx = effective_size_x; sy = effective_size_y; @@ -573,6 +686,23 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape *out_rank_output = (uint32_t)dims; } #undef _swap_size +final: + if (temp_shape_x) + { + free( temp_shape_x); + temp_shape_x = NULL; + } + if (temp_shape_y) + { + free( temp_shape_y); + temp_shape_y = NULL; + } + if (temp_shape_output) + { + free( temp_shape_output); + temp_shape_output = NULL; + } + return ret; } /* vsi_nn_kernel_optimize_eltwise_shape() */ @@ -612,7 +742,7 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape return TRUE; } -static vsi_bool vsi_nn_kernel_optimize_element_shape_with_max_rank +vsi_bool vsi_nn_kernel_optimize_element_shape_with_max_rank ( const vsi_size_t* shape_x, const vsi_size_t rank_x, vsi_size_t* out_shape_x, vsi_size_t* out_rank_x, vsi_size_t max_rank @@ -755,3 +885,415 @@ vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape return ret; } /* vsi_nn_kernel_optimize_scatter_elements_shape() */ + + +vsi_bool vsi_nn_kernel_optimize_matrixmul_broadcast_shape + ( + const vsi_size_t * shape_x, + const vsi_size_t * shape_y, + const vsi_size_t * shape_output, + vsi_size_t dim_x, + vsi_size_t dim_y, + vsi_size_t dim_out, + vsi_size_t* out_shape_x, + vsi_size_t* out_shape_y, + vsi_size_t* out_shape_output, + uint32_t* new_rank_out, + uint32_t* cross_flg, + uint32_t* size_axis_inner_outer, + uint32_t* strides_axis_inner_outer + ) +{ + vsi_bool ret = FALSE; + vsi_size_t rank_in[2] = {0, 0}; + vsi_size_t rank_out = 0; + vsi_size_t shapes_in_broadcast_part[2][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t* shapes_in_broadcast_part_ptr[2] = {NULL, NULL}; + vsi_size_t shapes_out_broadcast_part[VSI_NN_MAX_DIM_NUM] = {1}; + vsi_size_t out_shape_in[2][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t* out_shape_in_ptr[2] = {NULL, NULL}; + vsi_size_t out_shape_boradcast_output[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t new_rank = 0; + uint32_t i = 0; + vsi_size_t outer0 = 1; + vsi_size_t outer1 = 1; + vsi_size_t outer2 = 1; + vsi_size_t axis_size = 0; + vsi_size_t inner_size = 1; + vsi_size_t outer_size = 1; + vsi_size_t axis_size0 = 1; + vsi_size_t axis_size1 = 1; + vsi_size_t axis_size2 = 1; + vsi_size_t inner_size0 = 0; + vsi_size_t inner_size1 = 0; + vsi_size_t inner_size2 = 0; + vsi_size_t outer_size0 = 0; + vsi_size_t outer_size1 = 0; + vsi_size_t outer_size2 = 0; + uint32_t ne_flg = 0; + uint32_t axis = 0; + uint32_t outer_flg = 0; + uint32_t outer_axis = 0; + uint32_t first_flg = 0; + cross_flg[0] = 0; + + if (dim_x > 2 && dim_y > 2) + { + for (i = 2; i < dim_x; i++) + { + outer0 *= shape_x[i]; + } + for (i = 2; i < dim_y; i++) + { + outer1 *= shape_y[i]; + } + for (i = 2; i < dim_out; i++) + { + outer2 *= shape_output[i]; + } + + for (i = 2; i < vsi_nn_min(dim_x, dim_y); i++) + { + if (shape_x[i] != shape_y[i] && first_flg == 0) + { + if (shape_x[i] == 1) + { + ne_flg = 1; + inner_size = shape_y[i]; + } + else + { + ne_flg = 2; + inner_size = shape_x[i]; + } + first_flg = 1; + continue; + } + else if (ne_flg == 1 && shape_x[i] != shape_y[i] && shape_x[i] == 1 && first_flg == 1) + 
{ + inner_size *= shape_y[i]; + } + else if (ne_flg == 2 && shape_x[i] != shape_y[i] && shape_y[i] == 1 && first_flg == 1) + { + inner_size *= shape_x[i]; + } + else if (ne_flg == 1 && shape_x[i] != shape_y[i] && shape_x[i] != 1 && first_flg == 1) + { + outer_flg = 1; + outer_axis = i; + break; + } + else if (ne_flg == 2 && shape_x[i] != shape_y[i] && shape_y[i] != 1 && first_flg == 1) + { + outer_flg = 2; + outer_axis = i; + break; + } + else if (i > 2 && shape_x[i] == shape_y[i] && shape_y[i] != 1 && first_flg == 1) + { + first_flg = 2; + } + else if (shape_x[i] != shape_y[i] && shape_x[i] != 1 && first_flg == 2) + { + outer_flg = 1; + outer_axis = i; + break; + } + else if (shape_x[i] != shape_y[i] && shape_y[i] != 1 && first_flg == 2) + { + outer_flg = 2; + outer_axis = i; + break; + } + else if (i == 2 && shape_x[i] == shape_y[i] && shape_y[i] != 1) + { + /*axis = 2; + axis_size = shape_x[i];*/ + } + } + + if (ne_flg > 0 && outer0 > 1 && outer1 > 1) + { + for (i = 2; i < vsi_nn_min(dim_x, dim_y); i++) + { + if (shape_x[i] == shape_y[i] && shape_x[i] != 1) + { + cross_flg[0] = 1; + axis = i; + axis_size = shape_x[i]; + break; + } + } + } + + if (cross_flg[0] == 1) // cross + { + if (outer_flg == 1) + { + for (i = outer_axis; i < dim_x; i++) + { + outer_size *= shape_x[i]; + } + } + else if (outer_flg == 2) + { + for (i = outer_axis; i < dim_y; i++) + { + outer_size *= shape_y[i]; + } + } + else + { + outer_size = 1; + } + + axis_size0 = 1; + axis_size1 = 1; + axis_size2 = 1; + if (axis > 2 && ne_flg == 1) + { + axis_size1 = inner_size; + axis_size2 = inner_size; + } + else if (axis > 2 && ne_flg == 2) + { + axis_size0 = inner_size; + axis_size2 = inner_size; + } + + inner_size0 = 0; + inner_size1 = 0; + inner_size2 = 1; + if (axis == 2 && ne_flg == 1) + { + inner_size1 = axis_size; + inner_size2 = axis_size; + } + else if (axis > 2 && ne_flg == 1) + { + inner_size1 = 1; + } + else if (axis == 2 && ne_flg == 2) + { + inner_size0 = axis_size; + inner_size2 = axis_size; + } + else if (axis > 2 && ne_flg == 2) + { + inner_size0 = 1; + } + + outer_size0 = 0; + outer_size1 = 0; + outer_size2 = axis_size * inner_size; + if (outer_flg == 1) + { + outer_size0 = axis_size0 * axis_size; + } + else if (outer_flg == 2) + { + outer_size1 = axis_size1 * axis_size; + } + + for (i = 0; i < 2; i++) + { + out_shape_x[i] = shape_x[i]; + out_shape_y[i] = shape_y[i]; + out_shape_output[i] = shape_output[i]; + } + out_shape_x[2] = outer0; + out_shape_x[3] = 1; + out_shape_y[2] = outer1; + out_shape_output[2] = outer2; + new_rank_out[0] = 4; + new_rank_out[1] = 3; + new_rank_out[2] = 3; + + size_axis_inner_outer[0] = (uint32_t)axis_size; + size_axis_inner_outer[1] = (uint32_t)inner_size; + size_axis_inner_outer[2] = (uint32_t)outer_size; + + strides_axis_inner_outer[0] = (uint32_t)axis_size0; + strides_axis_inner_outer[1] = (uint32_t)inner_size0; + strides_axis_inner_outer[2] = (uint32_t)outer_size0; + + strides_axis_inner_outer[3] = (uint32_t)axis_size1; + strides_axis_inner_outer[4] = (uint32_t)inner_size1; + strides_axis_inner_outer[5] = (uint32_t)outer_size1; + + strides_axis_inner_outer[6] = (uint32_t)axis_size2; + strides_axis_inner_outer[7] = (uint32_t)inner_size2; + strides_axis_inner_outer[8] = (uint32_t)outer_size2; + + return TRUE; + } + else if (outer0 > 1 && outer1 > 1 && ne_flg > 0 && cross_flg[0] == 0) + { + cross_flg[0] = 2; + } + } + + if (cross_flg[0] == 2) // merge + { + for (i = 0; i < 2; i++) + { + out_shape_x[i] = shape_x[i]; + out_shape_y[i] = shape_y[i]; + out_shape_output[i] = 
shape_output[i]; + } + out_shape_output[2] = outer2; + new_rank_out[2] = 3; + if (ne_flg == 1) + { + out_shape_x[2] = outer0; + out_shape_x[3] = 1; + out_shape_y[2] = outer1; + + new_rank_out[0] = 4; + new_rank_out[1] = 3; + } + else if (ne_flg == 2) + { + out_shape_x[2] = outer0; + out_shape_y[2] = outer1; + out_shape_y[3] = 1; + + new_rank_out[0] = 3; + new_rank_out[1] = 4; + } + + return TRUE; + } + else if (dim_x == 1 && dim_y > 1) + { + out_shape_x[0] = shape_x[0]; + out_shape_x[1] = 1; + + out_shape_y[0] = shape_y[0]; + out_shape_y[1] = shape_y[1]; + + out_shape_output[0] = shape_output[0]; + out_shape_output[1] = 1; + + if (dim_y > 2) + { + shapes_in_broadcast_part[0][0] = 1; + rank_in[0] = 1; + + for (i = 2; i <= dim_y; i++) + { + shapes_in_broadcast_part[1][i - 2] = shape_y[i]; + } + rank_in[1] = dim_y - 2; + + for(i = 1; i <= dim_out; i++) + { + shapes_out_broadcast_part[i - 1] = shape_output[i]; + } + rank_out = dim_out - 1; + } + } + else if (dim_y == 1 && dim_x > 1) + { + out_shape_y[0] = 1; + out_shape_y[1] = shape_y[0]; + + out_shape_x[0] = shape_x[0]; + out_shape_x[1] = shape_x[1]; + + out_shape_output[0] = 1; + out_shape_output[1] = shape_output[0]; + + if (dim_x > 2) + { + shapes_in_broadcast_part[1][0] = 1; + rank_in[1] = 1; + + for (i = 2; i <= dim_x; i++) + { + shapes_in_broadcast_part[0][i - 2] = shape_x[i]; + } + rank_in[0] = dim_x - 2; + + for(i = 1; i <= dim_out; i++) + { + shapes_out_broadcast_part[i - 1] = shape_output[i]; + } + rank_out = dim_out - 1; + } + } + else + { + out_shape_x[0] = shape_x[0]; + out_shape_x[1] = shape_x[1]; + + out_shape_y[0] = shape_y[0]; + out_shape_y[1] = shape_y[1]; + + out_shape_output[0] = shape_output[0]; + out_shape_output[1] = shape_output[1]; + + for (i = 2; i < dim_x; i++) + { + shapes_in_broadcast_part[0][i - 2] = shape_x[i]; + } + for (i = 2; i < dim_y; i++) + { + shapes_in_broadcast_part[1][i - 2] = shape_y[i]; + } + for (i = 2; i < dim_out; i++) + { + shapes_out_broadcast_part[i - 2] = shape_output[i]; + } + rank_in[0] = dim_x - 2; + rank_in[1] = dim_y - 2; + rank_out = dim_out - 2; + + } + + shapes_in_broadcast_part_ptr[0] = shapes_in_broadcast_part[0]; + shapes_in_broadcast_part_ptr[1] = shapes_in_broadcast_part[1]; + out_shape_in_ptr[0] = out_shape_in[0]; + out_shape_in_ptr[1] = out_shape_in[1]; + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const vsi_size_t **)shapes_in_broadcast_part_ptr, rank_in, 2, + shapes_out_broadcast_part, rank_out, + (vsi_size_t **)out_shape_in_ptr, out_shape_boradcast_output, &new_rank); + + if (ret) + { + int32_t j = 0; + + new_rank_out[0] = new_rank + 2; + new_rank_out[1] = new_rank + 2; + new_rank_out[2] = new_rank + 2; + + j = new_rank - 1; + while (out_shape_in[0][j] == 1 && j >= 0) { + new_rank_out[0]--; + j--; + } + + j = new_rank - 1; + while (out_shape_in[1][j] == 1 && j >= 0) { + new_rank_out[1]--; + j--; + } + + j = new_rank - 1; + while (out_shape_boradcast_output[j] == 1 && j >= 0) { + new_rank_out[2]--; + j--; + } + + for (i = 0; i < new_rank; i++) + { + out_shape_x[i + 2] = out_shape_in[0][i]; + out_shape_y[i + 2] = out_shape_in[1][i]; + out_shape_output[i + 2] = out_shape_boradcast_output[i]; + } + } + + return ret; +} diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c index c5b640c55..426dacf16 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c @@ -62,13 +62,13 @@ typedef struct } _param_type; #define CHECK_PARAM_NULL( ptr, rval, 
... ) \ - do { \ + { \ if( ptr == NULL ) { \ VSILOGE(__VA_ARGS__); \ VSI_ASSERT(FALSE); \ return rval; \ } \ - } while(0) + } #define _PARAM_ADD_TEMPLATE(TYPE_NAME, TYPE, PARAM_DTYPE) \ vsi_bool vsi_nn_kernel_param_add_##TYPE_NAME \ diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 7b0c6ca67..6c6dda92c 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -68,6 +68,12 @@ KERNEL_SELECTOR( depthwise_conv1d ) { VSI_NN_KERNEL_TYPE_CL, 3 }, { VSI_NN_KERNEL_TYPE_CPU, 2 }, }; + + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + dilation = dilation == 0 ? 0 : dilation - 1; real_kernel = (kernel - 1) * dilation + kernel; @@ -101,6 +107,12 @@ static vsi_status _select { VSI_NN_KERNEL_TYPE_CL, 1 }, { VSI_NN_KERNEL_TYPE_CPU, 0 }, }; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); return vsi_nn_kernel_pirority_set( selector, pirority, _cnt_of_array(pirority) ); } /* _select */ @@ -141,5 +153,8 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(atan) REGISTER_VX_FIRST_KERNEL_SELECTOR(atanh) REGISTER_VX_FIRST_KERNEL_SELECTOR(acosh) REGISTER_VX_FIRST_KERNEL_SELECTOR(inverse_sigmoid) +#if (VX_TENSOR_SELECT_VX_SUPPORT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(select) +#endif __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index a1680edbf..55a61001a 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -30,6 +30,7 @@ #include "vsi_nn_error.h" #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" typedef enum { @@ -73,6 +74,11 @@ vsi_status vsi_nn_kernel_copy_tensor_veiw_patch vx_trensor_addressing addr = NULL; vx_size dim_sizes[VSI_NN_MAX_DIM_NUM], strides[VSI_NN_MAX_DIM_NUM]; addr = (vx_trensor_addressing)malloc(sizeof(vx_tensorpatch_addressing_t)); + if ( NULL == addr ) + { + VSILOGE("Call vxCreateTensorAddressing fail"); + return status; + } addr->num_of_dims = (vx_uint32)attr->shape->size; for (i = 0; i < dim; i++) @@ -138,6 +144,7 @@ vsi_status vsi_nn_kernel_copy_tensor_veiw_patch } } #endif + return status; } /* vsi_nn_kernel_copy_tensor_veiw_patch() */ @@ -153,6 +160,9 @@ vsi_status vsi_nn_kernel_copy_tensor_patch vsi_size_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM]; vsi_status status = VSI_FAILURE; uint32_t i; + + VSI_UNREFERENCED(buffer_size); + if (NULL == tensor || NULL == user_ptr) { VSILOGE("Invalid parameter"); @@ -377,10 +387,12 @@ vsi_status vsi_nn_kernel_tensor_write_from_float vsi_size_t sz = 0; sz = vsi_nn_kernel_tensor_attr_get_size( attr ); internal_buffer0 = malloc( sz ); + CHECK_PTR_FAIL_GOTO( internal_buffer0, "Create buffer fail.", final ); } else { internal_buffer0 = malloc( bytes ); + CHECK_PTR_FAIL_GOTO( internal_buffer0, "Create buffer fail.", final ); internal_buffer = internal_buffer0; } @@ -422,6 +434,7 @@ vsi_status vsi_nn_kernel_tensor_write_from_float if ( attr->dtype == I4 || attr->dtype == U4 ) { internal_buffer = malloc( bytes ); + CHECK_PTR_FAIL_GOTO( internal_buffer, "Create buffer fail.", final ); status = vsi_nn_kernel_pack_4bit_data(attr, (uint8_t*)internal_buffer0, (uint8_t*)internal_buffer); } } @@ -442,7 
+455,7 @@ vsi_status vsi_nn_kernel_tensor_write_from_float { vsi_nn_kernel_tensor_attr_release( &internal_attr ); } - if ( attr->dtype == I4 || attr->dtype == U4 ) + if ( attr && (attr->dtype == I4 || attr->dtype == U4) ) { vsi_nn_safe_free(internal_buffer0); } @@ -562,6 +575,8 @@ static void _convert_tensor_attr_to_vx_tensor_param MAP_TYPE( p->data_format, F64, VSI_NN_TYPE_FLOAT64 ); MAP_TYPE( p->data_format, BF16, VSI_NN_TYPE_BFLOAT16 ); MAP_TYPE( p->data_format, BOOL8, VSI_NN_TYPE_BOOL8 ); + MAP_TYPE( p->data_format, FP8_E4M3, VSI_NN_TYPE_FLOAT8_E4M3 ); + MAP_TYPE( p->data_format, FP8_E5M2, VSI_NN_TYPE_FLOAT8_E5M2 ); default: VSI_ASSERT( FALSE ); break; @@ -577,6 +592,12 @@ static void _convert_tensor_attr_to_vx_tensor_param MAP_TYPE( p->quant_format, VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL, VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC ); + MAP_TYPE(p->quant_format, + VSI_NN_KERNEL_QUANT_FLOAT8, + VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8); + MAP_TYPE(p->quant_format, + VSI_NN_KERNEL_QUANT_FLOAT8_PERCHANNEL, + VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8); default: VSI_ASSERT( FALSE ); break; @@ -615,11 +636,11 @@ vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_create //convert attr->shape->data to correct data type for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_vxsize[i] = -1 == attr->shape->data[i] ? -1 : (vx_size)attr->shape->data[i]; + size_vxsize[i] = (vsi_size_t)-1 == attr->shape->data[i] ? (vx_size)-1 : (vx_size)attr->shape->data[i]; } for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_u32[i] = -1 == attr->shape->data[i] ? -1 : (vx_uint32)attr->shape->data[i]; + size_u32[i] = (vsi_size_t)-1 == attr->shape->data[i] ? (vx_uint32)-1 : (vx_uint32)attr->shape->data[i]; } #ifdef VSI_40BIT_VA_SUPPORT params.sizes = size_vxsize; @@ -672,6 +693,8 @@ vsi_nn_tensor_t* vsi_nn_pad_tensor vsi_nn_dtype_t dst_type; vsi_nn_tensor_t *output = NULL; + VSI_UNREFERENCED(mode); + input_data_ptr = vsi_nn_ConvertTensorToFloat32Data(graph, input); CHECK_PTR_FAIL_GOTO( input_data_ptr, "Create data ptr fail.", final ); @@ -764,6 +787,7 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias uint32_t i, j; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); weight_data = vsi_nn_ConvertTensorToData(graph, weight); + CHECK_PTR_FAIL_GOTO( weight_data, "Create buffer fail.", final ); if (bias == NULL) { @@ -787,9 +811,11 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias attr.dim_num = 2; } bias_data = (int32_t *)vsi_nn_ConvertTensorToData(graph, bias); + CHECK_PTR_FAIL_GOTO( new_bias_data_ptr, "Create buffer fail.", final ); } new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t)); + CHECK_PTR_FAIL_GOTO( new_bias_data_ptr, "Create buffer fail.", final ); memset((void *)new_bias_data_ptr, 0, sizeof(int32_t) * attr.size[0]); if (input->attr.dtype.zero_point != 0) @@ -815,6 +841,7 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias new_bias = vsi_nn_CreateTensorFromData(graph, (uint8_t *)new_bias_data_ptr, &attr); +final: vsi_nn_safe_free( new_bias_data_ptr ); vsi_nn_safe_free( bias_data ); vsi_nn_safe_free( weight_data ); diff --git a/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c b/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c index 6756e3a16..a40bd81ba 100644 --- a/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c @@ -29,6 +29,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" #define REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( kernel_name ) \ static 
vsi_nn_kernel_node_t _##kernel_name##setup \ @@ -62,6 +63,11 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) vsi_nn_tensor_t * a_times_b = NULL; vsi_nn_tensor_attr_t attr; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + scale = 1.0; overflow_policy = VX_CONVERT_POLICY_SATURATE; rounding_policy = VX_ROUND_POLICY_TO_ZERO; @@ -70,7 +76,7 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) if(!scale_s) { VSILOGE("CreateScalar fail\n"); - goto OnError; + goto final; } memset(&attr, 0, sizeof(attr)); @@ -79,6 +85,7 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) attr.vtl = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; a_times_b = vsi_nn_CreateTensor(graph, &attr); + CHECK_PTR_FAIL_GOTO( a_times_b, "Create tensor fail.", final ); node = vxTensorMultiplyNode( graph->g, inputs[0]->t, inputs[1]->t, @@ -89,7 +96,7 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) if( NULL == node ) { VSILOGE("Call vxTensorMultiplyNode fail.(a_times_b_plus_c)"); - goto OnError; + goto final; } node = vxTensorAddNode( graph->g, a_times_b->t, inputs[2]->t, @@ -97,10 +104,10 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) if( NULL == node ) { VSILOGE("Call vxTensorAddNode fail.(a_times_b_plus_c)"); - goto OnError; + goto final; } -OnError: +final: if (scale_s) vxReleaseScalar(&scale_s); if (a_times_b) vsi_nn_ReleaseTensor(&a_times_b); diff --git a/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c index 955c61d2c..5fd98c2a9 100644 --- a/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/batch_norm_vx.c @@ -48,6 +48,10 @@ static vsi_nn_kernel_node_t _setup vx_node node = NULL; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxBatchNormalizationLayer( graph->g, eps, diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index 2f9be4903..d77719477 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -293,6 +293,14 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) vx_tensor temp_tensors[3] = { NULL }; uint32_t i = 0; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + _build_vx_conv2d_param( &vxparam, 1, vsi_nn_kernel_param_get_int32(params, "stride"), @@ -310,7 +318,9 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, (vsi_ssize_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 1 ); @@ -369,6 +379,14 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) uint32_t i = 0; vsi_bool need_explicit_padding = FALSE; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + 
VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + _build_vx_conv2d_param( &vxparam, 1, vsi_nn_kernel_param_get_int32(params, "stride"), @@ -387,7 +405,9 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { vsi_size_t new_w_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; uint32_t new_w_rank = 4; @@ -486,6 +506,14 @@ REGISTER_CONV_OPENVX_KERNEL( conv2d ) vx_node node = NULL; vx_nn_convolution_params_ext2_t vxparam; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + _build_vx_conv2d_param( &vxparam, vsi_nn_kernel_param_get_int32(params, "stride_h"), @@ -518,6 +546,14 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv2d ) vx_node node = NULL; vx_nn_convolution_params_ext2_t vxparam; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + _build_vx_conv2d_param( &vxparam, vsi_nn_kernel_param_get_int32(params, "stride_h"), @@ -552,6 +588,14 @@ REGISTER_CONV_OPENVX_KERNEL( deconvolution1d ) vx_tensor temp_tensors[2] = { NULL }; int i; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + _build_vx_deconv2d_param( &vxparam, 1, vsi_nn_kernel_param_get_int32(params, "stride"), @@ -595,6 +639,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d ) vx_node node = NULL; #if VX_CONV_3D_API_SUPPORT vx_nn_convolution_3d_params_t vxparam; + memset(&vxparam, 0, sizeof(vxparam)); _build_vx_conv3d_param( @@ -625,14 +670,23 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d ) outputs[0]->t ); #endif + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); return (vsi_nn_kernel_node_t)node; } /* depthwise_conv2d*/ REGISTER_CONV_OPENVX_KERNEL( deconv3d ) { vx_node node = NULL; + #if VX_DECONV_3D_API_SUPPORT vx_nn_deconvolution_3d_params_t vxparam; + memset(&vxparam, 0, sizeof(vxparam)); _build_vx_deconv3d_param( @@ -648,7 +702,7 @@ REGISTER_CONV_OPENVX_KERNEL( deconv3d ) vsi_nn_kernel_param_get_int32(params, "pad_right"), vsi_nn_kernel_param_get_int32(params, "outpadding_w"), vsi_nn_kernel_param_get_int32(params, "outpadding_h"), - vsi_nn_kernel_param_get_int32(params, "outpadding_w"), + vsi_nn_kernel_param_get_int32(params, "outpadding_d"), vsi_nn_kernel_param_get_int32(params, "group"), vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), @@ -662,7 +716,14 @@ REGISTER_CONV_OPENVX_KERNEL( deconv3d ) outputs[0]->t ); #endif + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); return (vsi_nn_kernel_node_t)node; } /* deconv3d */ 
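/* The VSI_UNREFERENCED() calls added throughout these setup functions keep the fixed
 * kernel-setup signature warning-free when a feature macro (e.g. VX_CONV_3D_API_SUPPORT
 * or VX_DECONV_3D_API_SUPPORT) compiles the body out and the parameters go unused.
 * A minimal sketch of the idiom, assuming the macro reduces to a plain void cast; the
 * function and EXAMPLE_FEATURE_SUPPORTED below are illustrative names, not part of the
 * patch, and the real macro definition lives in the ovxlib headers. */

#include <stddef.h>

#ifndef VSI_UNREFERENCED
#define VSI_UNREFERENCED(param) ((void)(param))  /* no runtime effect; silences -Wunused-parameter */
#endif

static void *example_setup(void *graph, void **inputs, size_t input_num)
{
    void *node = NULL;
#if defined(EXAMPLE_FEATURE_SUPPORTED)
    node = inputs[0];            /* the real node construction would happen here */
#endif
    /* parameters may be unused when the feature is compiled out, so "touch" them all */
    VSI_UNREFERENCED(graph);
    VSI_UNREFERENCED(inputs);
    VSI_UNREFERENCED(input_num);
    return node;
}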
-#undef REGISTER_CONV_OPENVX_KERNEL \ No newline at end of file +#undef REGISTER_CONV_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index 9e299da26..09514d316 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -106,6 +106,10 @@ static vsi_nn_kernel_node_t _setup goto final; } + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + final: if (lut1) { @@ -120,6 +124,14 @@ static vsi_nn_kernel_node_t _setup return (vsi_nn_kernel_node_t)node; #else + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(lut_type); return NULL; #endif } /* _setup() */ @@ -190,6 +202,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( abs ) vx_tensor input = NULL, input0 = NULL; vx_tensor output = NULL, output0 = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + if (inputs[0]->attr.dim_num > 4) { input_size[0] = vsi_nn_GetElementNum(inputs[0]) / @@ -231,6 +248,10 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( linear ) float a_v = vsi_nn_kernel_param_get_float32( params, "a_v" ); float b_v = vsi_nn_kernel_param_get_float32( params, "b_v" ); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -247,6 +268,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sigmoid ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -265,6 +291,10 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( tanh ) float scale_a = vsi_nn_kernel_param_get_float32( params, "scale_a" ); float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b" ); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -281,6 +311,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu1 ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -297,6 +332,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu6 ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -313,6 +353,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( rsqrt ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -329,6 +374,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sqrt ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, @@ -345,6 +395,11 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( softrelu ) { vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, diff --git 
a/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c index 3c9947d40..d81a55563 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_vx.c @@ -57,6 +57,12 @@ REGISTER_ELTWISE_OPENVX_KERNEL( add ) { vx_node node = vxTensorAddNode( graph->g, inputs[0]->t, inputs[1]->t, VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + return (vsi_nn_kernel_node_t)node; } /* add() */ @@ -65,6 +71,11 @@ REGISTER_ELTWISE_OPENVX_KERNEL( sub ) vx_node node = vxTensorSubtractNode( graph->g, inputs[0]->t, inputs[1]->t, VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + return (vsi_nn_kernel_node_t)node; } /* sub() */ @@ -75,6 +86,10 @@ REGISTER_ELTWISE_OPENVX_KERNEL( div ) vx_scalar scale_s = NULL; vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + scale = vsi_nn_kernel_param_get_float32(params, "scale"); overflow_policy = vsi_nn_kernel_param_get_int32(params, "overflow_policy"); rounding_policy = vsi_nn_kernel_param_get_int32(params, "rounding_policy"); @@ -105,6 +120,10 @@ REGISTER_ELTWISE_OPENVX_KERNEL( mul ) vx_scalar scale_s = NULL; vx_node node = NULL; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + scale = vsi_nn_kernel_param_get_float32(params, "scale"); overflow_policy = vsi_nn_kernel_param_get_int32(params, "overflow_policy"); rounding_policy = vsi_nn_kernel_param_get_int32(params, "rounding_policy"); diff --git a/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c b/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c index 5133dabc4..af68dd210 100644 --- a/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c @@ -116,6 +116,10 @@ REGISTER_L2_NORMALIZE_OPENVX_KERNEL( l2_norm ) if (vx_output) vxReleaseTensor(&vx_output); #endif + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + if( NULL == node ) { VSILOGE("Call vxSoftmaxLayer2 fail.(softmax)"); diff --git a/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c b/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c index 3f5bfa1f4..5279543dc 100644 --- a/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c @@ -63,6 +63,10 @@ REGISTER_BATCH_GEMM_OPENVX_KERNEL( matrixmul ) vx_scalar trans_a = vxCreateScalar(graph->ctx->c, VX_TYPE_BOOL, &transposeA); vx_scalar trans_b = vxCreateScalar(graph->ctx->c, VX_TYPE_BOOL, &transposeB); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxBatchGemmNode(graph->g, inputs[0]->t, inputs[1]->t, diff --git a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c index a458e3800..c9a2c845c 100644 --- a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c @@ -30,6 +30,7 @@ #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_error.h" #define REGISTER_PAD2_OPENVX_KERNEL( kernel_name ) \ static vsi_nn_kernel_node_t _##kernel_name##setup \ @@ -68,6 +69,10 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) vsi_bool release_intermediate_tensor = TRUE; float const_val = 
vsi_nn_kernel_param_get_float32(params, "const_val"); + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + memset(¶m, 0, sizeof(param)); memset(pad_front_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); memset(pad_back_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); @@ -90,6 +95,7 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) attr.is_const = FALSE; convert_tensor = vsi_nn_CreateTensor(graph, &attr); + CHECK_PTR_FAIL_GOTO( convert_tensor, "Create tensor fail.", final ); node = vxTensorCopyNode( graph->g, @@ -105,6 +111,7 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) node = vxTensorPadNode( graph->g, convert_tensor->t, outputs[0]->t, ¶m, sizeof(param) ); +final: vxReleaseScalar( ¶m.pad_const ); if (release_intermediate_tensor) diff --git a/src/tim/vx/internal/src/kernel/vx/prelu_vx.c b/src/tim/vx/internal/src/kernel/vx/prelu_vx.c index 4728ad651..ebf381256 100644 --- a/src/tim/vx/internal/src/kernel/vx/prelu_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/prelu_vx.c @@ -95,6 +95,10 @@ REGISTER_PRELU_OPENVX_KERNEL( prelu ) vx_node node = NULL; int32_t is_per_channel_alpha = 0; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); if (!is_per_channel_alpha) diff --git a/src/tim/vx/internal/src/kernel/vx/resize_vx.c b/src/tim/vx/internal/src/kernel/vx/resize_vx.c index 3b2b16778..fdea91a43 100644 --- a/src/tim/vx/internal/src/kernel/vx/resize_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/resize_vx.c @@ -121,6 +121,9 @@ static vsi_nn_kernel_node_t _setup sizeof(param), outputs[0]->t ); #endif + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); if ( NULL == node ) { VSILOGI("Call vxTensorScaleNode fail.(resize)"); diff --git a/src/tim/vx/internal/src/kernel/vx/select_vx.c b/src/tim/vx/internal/src/kernel/vx/select_vx.c new file mode 100644 index 000000000..d50a99504 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/select_vx.c @@ -0,0 +1,86 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_TENSOR_SELECT_VX_SUPPORT) + +#define REGISTER_SELECTOPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_SELECTOPENVX_KERNEL( select ) +{ + vx_node node = NULL; + vx_tensor input_list[3] = {NULL}; + uint32_t i = 0; + uint32_t input_count = (uint32_t)input_num; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + + for ( i = 0; i < input_count; i++ ) + { + input_list[i] = inputs[i]->t; + } + + node = vxTensorSelectLayer( + graph->g, + input_list, + input_count, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* select() */ + +#undef REGISTER_SELECTOPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/softmax_vx.c b/src/tim/vx/internal/src/kernel/vx/softmax_vx.c index f097fbbb9..1d1d445e5 100644 --- a/src/tim/vx/internal/src/kernel/vx/softmax_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/softmax_vx.c @@ -59,10 +59,12 @@ REGISTER_SOFTMAX_OPENVX_KERNEL( softmax ) vx_node node = NULL; float beta = vsi_nn_kernel_param_get_float32(params, "beta"); vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; +#if !VX_STREAM_PROCESSOR_SUPPORT vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; uint32_t rank_in = 0; - int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); int32_t new_axis = 0; +#endif + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); size_t size = sizeof(vx_nn_softmax_params_t); #ifdef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT vx_nn_softmax_params_ext_t paramExt; @@ -78,6 +80,17 @@ REGISTER_SOFTMAX_OPENVX_KERNEL( softmax ) base.beta = beta; #endif + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + +#if VX_STREAM_PROCESSOR_SUPPORT + node = vxSoftmaxLayer2( graph->g, + inputs[0]->t, + param, + size, + outputs[0]->t); +#else vsi_nn_kernel_optimize_softmax_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, shapes[0], &rank_in, &new_axis); @@ -108,13 +121,14 @@ REGISTER_SOFTMAX_OPENVX_KERNEL( softmax ) param, size, reshape_tensors[1]->t); +#endif if( NULL == node ) { VSILOGE("Call vxSoftmaxLayer2 fail.(softmax)"); } - vsi_nn_ReleaseTensor( &reshape_tensors[0] ); - vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); return (vsi_nn_kernel_node_t)node; } /* softmax() */ diff --git a/src/tim/vx/internal/src/kernel/vx/square_vx.c b/src/tim/vx/internal/src/kernel/vx/square_vx.c index 5ae1499da..778557331 100644 --- a/src/tim/vx/internal/src/kernel/vx/square_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/square_vx.c @@ -46,6 +46,11 @@ static vsi_nn_kernel_node_t _setup { vx_node node = NULL; + 
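/* Several hunks above (vsi_nn_kernel_util.c, a_times_b_plus_c_vx.c, pad2_vx.c) add
 * CHECK_PTR_FAIL_GOTO() after each allocation and funnel failures to a single `final:`
 * cleanup label instead of the old `OnError:`. A minimal sketch of that check-and-goto
 * idiom, assuming the macro simply logs and jumps on NULL; the real definition is in
 * vsi_nn_error.h, and example_create plus the messages below are illustrative only. */

#include <stdio.h>
#include <stdlib.h>

#ifndef CHECK_PTR_FAIL_GOTO
#define CHECK_PTR_FAIL_GOTO(ptr, msg, label) \
    do { if ((ptr) == NULL) { fprintf(stderr, "%s\n", (msg)); goto label; } } while (0)
#endif

static int example_create(size_t bytes)
{
    int   status  = -1;
    void *buffer0 = NULL;
    void *buffer1 = NULL;

    buffer0 = malloc(bytes);
    CHECK_PTR_FAIL_GOTO(buffer0, "Create buffer fail.", final);
    buffer1 = malloc(bytes);
    CHECK_PTR_FAIL_GOTO(buffer1, "Create buffer fail.", final);

    status = 0;                  /* both allocations succeeded */
final:
    free(buffer1);               /* free(NULL) is a no-op, so one cleanup path suffices */
    free(buffer0);
    return status;
}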
VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + node = vxActivationLayer( graph->g, inputs[0]->t, diff --git a/src/tim/vx/internal/src/kernel/vx/swish_vx.c b/src/tim/vx/internal/src/kernel/vx/swish_vx.c index 7557d9b11..9b458c62d 100644 --- a/src/tim/vx/internal/src/kernel/vx/swish_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/swish_vx.c @@ -62,6 +62,10 @@ REGISTER_SWISH_OPENVX_KERNEL( swish ) vx_enum function = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SWISH; float beta = 1.0f; + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) { swish_type = (vsi_nn_swish_type)vsi_nn_kernel_param_get_int32(params, "type"); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl index 49d04e2d4..755c809e3 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather.cl @@ -15,6 +15,8 @@ __kernel void gather_U8toU8( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; uint4 data = read_imageui(input0, coord_in.zw); @@ -40,6 +42,8 @@ __kernel void gather_F16toF16( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; float4 data = read_imagef(input0, coord_in.zw); @@ -65,6 +69,8 @@ __kernel void gather_I32toI32( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; int4 data = read_imagei(input0, coord_in.zw); @@ -90,6 +96,8 @@ __kernel void gather_F32toF32( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; float4 data = read_imagef(input0, coord_in.zw); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl index 15a466443..574dd6b3f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl @@ -15,6 +15,7 @@ __kernel void gather_array_U8toU8( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 1); @@ -43,6 +44,7 @@ __kernel void gather_array_F16toF16( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 2); @@ -71,6 +73,7 @@ __kernel void gather_array_I32toI32( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 4); @@ -99,6 +102,7 @@ __kernel void gather_array_F32toF32( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? 
indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 4); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl index 4ff6ec158..bfc88d0ed 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_batch.cl @@ -20,6 +20,7 @@ __kernel void gather_batch_U8toU8( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; uint4 data = read_imageui(input0, coord_in); @@ -51,6 +52,7 @@ __kernel void gather_batch_F16toF16( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; float4 data = read_imagef(input0, coord_in); @@ -82,6 +84,7 @@ __kernel void gather_batch_I32toI32( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; int4 data = read_imagei(input0, coord_in); @@ -113,6 +116,7 @@ __kernel void gather_batch_F32toF32( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; float4 data = read_imagef(input0, coord_in); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl index 323f69417..58403f9a3 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_elements.cl @@ -1,3 +1,11 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +_viv_uniform uint width0; +_viv_uniform uint height0; +_viv_uniform uint width1; +_viv_uniform uint height1; +_viv_uniform uint width_out; +_viv_uniform uint height_out; #define GATHER_ELEMENTS_AXIS0_2D(name, data_type, read_func, write_func, conv_func) \ __kernel void gather_elements_axis0_##name##_I32to##name##_2D \ @@ -133,3 +141,159 @@ __kernel void gather_elements_axis2_##name##_I32to##name \ GATHER_ELEMENTS_AXIS2(F32, float4, read_imagef, write_imagef, convert_float4) GATHER_ELEMENTS_AXIS2(I32, int4, read_imagei, write_imagei, convert_int4_rte) GATHER_ELEMENTS_AXIS2(U32, uint4, read_imageui, write_imageui, convert_uint4_rte) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + float input_scale, \ + float input_tail, \ + int axis_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * 
width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F32, float, float*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I32, int, int*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + float input_scale, \ + float input_tail, \ + int axis_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F32, float, float*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I32, int, int*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + float input_scale, \ + float input_tail, \ + int axis_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F32, float, float*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I32, int, int*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1) + + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \ +__kernel void 
gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \ + ( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + float input_scale, \ + float input_tail, \ + int axis_size \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + Image index_img = create_image_from_image2d(input1, 4); \ + int* index_ptr = (int*)index_img.ptr; \ + int index = index_ptr[coord.x + coord.y * width1]; \ + \ + Image input_img = create_image_from_image2d(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \ + data_type data = input_ptr[index + coord.y * width0]; \ + \ + Image output_img = create_image_from_image2d(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \ + output_ptr[coord.x + coord.y * width_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F32, float, float*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I32, int, int*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \ + ( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + float input_scale, \ + float input_tail, \ + int axis_size \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + Image index_img = create_image_from_image2d(input1, 4); \ + int* index_ptr = (int*)index_img.ptr; \ + int index = index_ptr[coord.x + coord.y * width1]; \ + \ + Image input_img = create_image_from_image2d(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \ + data_type data = input_ptr[coord.x + index * width0]; \ + \ + Image output_img = create_image_from_image2d(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \ + output_ptr[coord.x + coord.y * width_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F32, float, float*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I32, int, int*, 4) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl index 02e430922..1cf59759f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl @@ -1,124 +1,133 @@ __kernel void gather_nd_batch_U8toU8_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(gidx, gidy, 0, 0); - int4 indice = read_imagei(input1, coord.wy); - coord.z = indice.x * block_size + gidx; + int4 coord = (int4)(gidx, gidy, gidz, 0); + int4 indice = read_imagei(input1, coord.wyzw); + int2 coord0 = (int2)(indice.x * 
block_size + gidx, gidz); - uint4 data = read_imageui(input0, coord.zy); - write_imageui(output, coord.xy, data); + uint4 data = read_imageui(input0, coord0); + write_imageui(output, coord, data); } __kernel void gather_nd_batch_F16toF16_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(gidx, gidy, 0, 0); - int4 indice = read_imagei(input1, coord.wy); - coord.z = indice.x * block_size + gidx; + int4 coord = (int4)(gidx, gidy, gidz, 0); + int4 indice = read_imagei(input1, coord.wyzw); + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); - float4 data = read_imagef(input0, coord.zy); - write_imagef(output, coord.xy, data); + float4 data = read_imagef(input0, coord0); + write_imagef(output, coord, data); } __kernel void gather_nd_batch_I8toI8_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(gidx, gidy, 0, 0); - int4 indice = read_imagei(input1, coord.wy); - coord.z = indice.x * block_size + gidx; + int4 coord = (int4)(gidx, gidy, gidz, 0); + int4 indice = read_imagei(input1, coord.wyzw); + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); - int4 data = read_imagei(input0, coord.zy); - write_imagei(output, coord.xy, data); + int4 data = read_imagei(input0, coord0); + write_imagei(output, coord, data); } //2D __kernel void gather_nd_batch_U8toU8_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(0, gidy, gidx, 1); - int4 indice = read_imagei(input1, coord.xy); - int4 indice1 = read_imagei(input1, coord.wy); + int4 coord = (int4)(1, gidy, gidz, 0); + int4 indice = read_imagei(input1, coord.wyzw); + int4 indice1 = read_imagei(input1, coord.xyzw); indice.x = indice.x * block_size + gidx; indice.y = indice1.x; - indice.zw = coord.yx; + indice.zw = coord.zw; uint4 data = read_imageui(input0, indice); - write_imageui(output, coord.zy, data); + coord.x = gidx; + write_imageui(output, coord, data); } __kernel void gather_nd_batch_F16toF16_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(0, gidy, gidx, 1); - int4 indice = read_imagei(input1, coord.xy); - int4 indice1 = read_imagei(input1, coord.wy); + int4 coord = (int4)(1, gidy, gidz, 0); + int4 indice = read_imagei(input1, 
coord.wyzw); + int4 indice1 = read_imagei(input1, coord.xyzw); indice.x = indice.x * block_size + gidx; indice.y = indice1.x; - indice.zw = coord.yx; + indice.zw = coord.zw; float4 data = read_imagef(input0, indice); - write_imagef(output, coord.zy, data); + coord.x = gidx; + write_imagef(output, coord, data); } __kernel void gather_nd_batch_I8toI8_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch_num + int gidy = get_global_id(1); // index_num + int gidz = get_global_id(2); // batch_num - int4 coord = (int4)(0, gidy, gidx, 1); - int4 indice = read_imagei(input1, coord.xy); - int4 indice1 = read_imagei(input1, coord.wy); + int4 coord = (int4)(1, gidy, gidz, 0); + int4 indice = read_imagei(input1, coord.wyzw); + int4 indice1 = read_imagei(input1, coord.xyzw); indice.x = indice.x * block_size + gidx; indice.y = indice1.x; indice.y = indice1.x; - indice.zw = coord.yx; + indice.zw = coord.zw; int4 data = read_imagei(input0, indice); - write_imagei(output, coord.zy, data); + coord.x = gidx; + write_imagei(output, coord, data); } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_cross.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_cross.cl new file mode 100644 index 000000000..e36f10353 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_cross.cl @@ -0,0 +1,95 @@ +__kernel void gemm_F32F32toF32_merge( + __read_only image2d_array_t inputA, + __read_only image2d_array_t inputB, + __write_only image2d_array_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out, + int outer) +{ + for(int i = 0; i < outer; i++) + { + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); + + float4 sum = (float4)(0); + + for(; coord_a.x < K;) + { + float4 tempA0; + float4 tempB0; + + tempA0 = read_imagef(inputA, coord_a); + tempB0 = read_imagef(inputB, coord_b); + coord_a.x++; + coord_b.y++; + + sum = sum + tempA0 * tempB0; + } + + coord_b.y = get_global_id(1); + coord_b.z = get_global_id(2) + i * get_global_size(2); + write_imagef(output, coord_b, sum); + } +} + +#define GEMM_MERGE(name, dst_type, read_image_type, convert_type, write_image_type) \ +__kernel void gemm_##name##_merge( \ + __read_only image2d_array_t inputA, \ + __read_only image2d_array_t inputB, \ + __write_only image2d_array_t output, \ + int M, \ + int K, \ + int N, \ + int ac2zero, \ + int bc2zero, \ + float scale_a, \ + float zp_a, \ + float scale_b, \ + float zp_b, \ + float scale_out, \ + float zp_out, \ + int outer) \ +{ \ + for(int i = 0; i < outer; i++) \ + { \ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0); \ + float4 sum = (float4)(0); \ + dst_type dst; \ + \ + for(; coord_a.x < K;) \ + { \ + float4 tempA0; \ + float4 tempB0; \ + \ + tempA0 = convert_float4(read_image_type(inputA, coord_a)); \ + tempB0 = convert_float4(read_image_type(inputB, coord_b)); \ + tempA0.x = (tempA0.x - zp_a) * scale_a; \ + tempB0.x = (tempB0.x - zp_b) * scale_b; \ + \ + coord_a.x++; \ + coord_b.y++; \ + \ + sum = sum + tempA0 * tempB0; \ + } \ + sum.x = sum.x * scale_out + zp_out; \ + dst = convert_type(sum); \ + \ + coord_b.y = get_global_id(1); \ + coord_b.z = get_global_id(2) + i * get_global_size(2); \ + write_image_type(output, coord_b, dst); \ + } \ +} +GEMM_MERGE(I8I8toI8,int4,read_imagei,convert_int4,write_imagei); +GEMM_MERGE(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui); +GEMM_MERGE(U8U8toF32,float4,read_imageui,convert_float4,write_imagef); + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/nearest_grid_sample.cl b/src/tim/vx/internal/src/libnnext/ops/cl/nearest_grid_sample.cl new file mode 100644 index 000000000..e427fe414 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/nearest_grid_sample.cl @@ -0,0 +1,77 @@ +__kernel void nearest_grid_sample_F32_F32toF32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float half_input0_w, + float half_input0_h, + float add_float_value_w, + float add_float_value_h, + int depth + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1)); + + float fx = read_imagef(input1, coord_in1).x; + coord_in1.x = coord_in1.x + 1; + float fy = read_imagef(input1, coord_in1).x; + + fx = fx * half_input0_w + add_float_value_w; + fy = fy * half_input0_h + add_float_value_h; + int x_index = convert_int(fx); + int y_index = convert_int(fy); + int4 coord_in = (int4)(x_index, y_index, 0, 0); + + float4 dst; + + while (coord_in.z < depth){ + dst = read_imagef(input0, coord_in); + write_imagef(output, coord_out, dst); + coord_in.z++; + coord_out.z++; + } +} + + +__kernel void nearest_grid_sample_U8_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float half_input0_w, + float half_input0_h, + float add_float_value_w, + float add_float_value_h, + int depth, + float in0_scale, + float in0_tail, + float in1_scale, + float in1_tail, + float out_scale, + float out_tail + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1)); + + float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail; + coord_in1.x = coord_in1.x + 1; + float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail; + + fx = fx * half_input0_w + add_float_value_w; + fy = fy * half_input0_h + add_float_value_h; + int x_index = convert_int(fx); + int y_index = convert_int(fy); + int4 coord_in = (int4)(x_index, y_index, 0, 0); + + float4 val; + uint4 dst; + + while (coord_in.z < depth){ + val = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + dst = convert_uint4_rte(val * out_scale + out_tail); + write_imageui(output, coord_out, dst); + coord_in.z++; + coord_out.z++; + } + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_bilinear.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_bilinear.cl new file mode 100644 index 000000000..f835db5e5 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_bilinear.cl @@ -0,0 +1,161 @@ +#pragma OPENCL EXTENSION CL_VIV_asm : enable + +#define RESIZE_3D(in_name, out_name, read_image_type, dst_type, convert_type, write_image_type) \ +__kernel void resize_3d_bilinear_##in_name##to##out_name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float scale_x, \ + float scale_y, \ + float scale_z, \ + float half_pixel_value, \ + uint in_width, \ + uint in_height, \ + uint in_depth, \ + float in_scale, \ + float in_tail, \ + float out_scale, \ + float out_tail \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; \ + float left_x_f = fmax(floor(in_x), 0); \ + float x_lerp = in_x - left_x_f; \ + int left_x_idx = convert_int(left_x_f); \ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; \ + float top_y_f = fmax(floor(in_y), 0); \ + float y_lerp = in_y - top_y_f; \ + int top_y_idx = convert_int(top_y_f); \ + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value; \ + float front_z_f = fmax(floor(in_z), 0); \ + float z_lerp = in_z - front_z_f; \ + int front_z_idx = convert_int(front_z_f); \ + int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0); \ + float4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111; \ + dst_type dst; \ + \ + int dx, dy, dz; \ + dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0); \ + dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0); \ + dz = in_z < 0 ? 0 : (front_z_idx < in_depth - 1 ? 1 : 0); \ + \ + data_000 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.y = coord_in.y + dy; \ + data_010 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.x = coord_in.x + dx; \ + data_110 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.y = coord_in.y - dy; \ + data_100 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.z = coord_in.z + dz; \ + data_101 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.y = coord_in.y + dy; \ + data_111 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.x = coord_in.x - dx; \ + data_011 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + coord_in.y = coord_in.y - dy; \ + data_001 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \ + \ + data_000 = data_000 + (data_100 - data_000) * x_lerp; \ + data_010 = data_010 + (data_110 - data_010) * x_lerp; \ + data_000 = data_000 + (data_010 - data_000) * y_lerp; \ + \ + data_001 = data_001 + (data_101 - data_001) * x_lerp; \ + data_011 = data_011 + (data_111 - data_011) * x_lerp; \ + data_001 = data_001 + (data_011 - data_001) * y_lerp; \ + data_000 = data_000 + (data_001 - data_000) * z_lerp; \ + \ + dst = convert_type(data_000 * out_scale + out_tail); \ + \ + write_image_type(output, coord_out, dst); \ +} +RESIZE_3D(F32, F32, read_imagef, float4, convert_float4, write_imagef) +RESIZE_3D(F32, U8, read_imagef, uint4, convert_uint4, write_imageui) +RESIZE_3D(U8, F32, read_imageui, float4, convert_float4, write_imagef) +RESIZE_3D(U8, U8, read_imageui, uint4, convert_uint4, write_imageui) +RESIZE_3D(I8, I8, read_imagei, int4, convert_int4, write_imagei) + +__kernel void 
resize_3d_bilinear_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + uint in_width, + uint in_height, + uint in_depth, + float in_scale, + float in_tail, + float out_scale, + float out_tail + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; + float left_x_f = fmax(floor(in_x), 0); + float x_lerp = in_x - left_x_f; + int left_x_idx = convert_int(left_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; + float top_y_f = fmax(floor(in_y), 0); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value; + float front_z_f = fmax(floor(in_z), 0); + float z_lerp = in_z - front_z_f; + int front_z_idx = convert_int(front_z_f); + int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0); + uint4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111; + float4 data_000_f, data_100_f, data_010_f, data_110_f, data_001_f, data_011_f, data_101_f, data_111_f; + uint4 dst; + + int dx, dy, dz; + dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0); + dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0); + dz = in_z < 0 ? 0 : (front_z_idx < in_depth - 1 ? 1 : 0); + + data_000 = read_imageui(input, coord_in); + data_000 = data_000 << 16; + coord_in.y = coord_in.y + dy; + data_010 = read_imageui(input, coord_in); + data_010 = data_010 << 16; + coord_in.x = coord_in.x + dx; + data_110 = read_imageui(input, coord_in); + data_110 = data_110 << 16; + coord_in.y = coord_in.y - dy; + data_100 = read_imageui(input, coord_in); + data_100 = data_100 << 16; + coord_in.z = coord_in.z + dz; + data_101 = read_imageui(input, coord_in); + data_101 = data_101 << 16; + coord_in.y = coord_in.y + dy; + data_111 = read_imageui(input, coord_in); + data_111 = data_111 << 16; + coord_in.x = coord_in.x - dx; + data_011 = read_imageui(input, coord_in); + data_011 = data_011 << 16; + coord_in.y = coord_in.y - dy; + data_001 = read_imageui(input, coord_in); + data_001 = data_001 << 16; + + _viv_asm(COPY, data_000_f, data_000, 16); + _viv_asm(COPY, data_010_f, data_010, 16); + _viv_asm(COPY, data_110_f, data_110, 16); + _viv_asm(COPY, data_100_f, data_100, 16); + _viv_asm(COPY, data_101_f, data_101, 16); + _viv_asm(COPY, data_111_f, data_111, 16); + _viv_asm(COPY, data_011_f, data_011, 16); + _viv_asm(COPY, data_001_f, data_001, 16); + + data_000_f = data_000_f + (data_100_f - data_000_f) * x_lerp; + data_010_f = data_010_f + (data_110_f - data_010_f) * x_lerp; + data_000_f = data_000_f + (data_010_f - data_000_f) * y_lerp; + + data_001_f = data_001_f + (data_101_f - data_001_f) * x_lerp; + data_011_f = data_011_f + (data_111_f - data_011_f) * x_lerp; + data_001_f = data_001_f + (data_011_f - data_001_f) * y_lerp; + data_000_f = data_000_f + (data_001_f - data_000_f) * z_lerp; + + _viv_asm(COPY, dst, data_000_f, 16); + dst = dst >> 16; + write_imageui(output, coord_out, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_nearest.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_nearest.cl new file mode 100644 index 000000000..220acd351 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/resize_3d_nearest.cl @@ -0,0 +1,119 @@ + +#define NEAREST_INDEX_PROCESS() \ + int4 coord_out = 
(int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x + round_value; \ + int in_x_idx = convert_int(in_x); \ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y + round_value; \ + int in_y_idx = convert_int(in_y); \ + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z + round_value; \ + int in_z_idx = convert_int(in_z); \ + +__kernel void resize_3d_nearest_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + float4 dst; + dst = read_imagef(input, coord_in); + write_imagef(output, coord_out, dst); +} + + +__kernel void resize_3d_nearest_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + uint4 dst; + dst = convert_uint4(convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail); + write_imageui(output, coord_out, dst); +} + +__kernel void resize_3d_nearest_U8toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + float4 dst; + dst = convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail; + write_imagef(output, coord_out, dst); +} + +__kernel void resize_3d_nearest_F32toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + uint4 dst; + dst = convert_uint4(read_imagef(input, coord_in) * output_scale + output_tail); + write_imageui(output, coord_out, dst); +} + +__kernel void resize_3d_nearest_I8toI8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + int4 dst; + dst = convert_int4(convert_float4(read_imagei(input, coord_in)) * output_scale); + write_imagei(output, coord_out, dst); +} + +__kernel void resize_3d_nearest_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float scale_z, + float half_pixel_value, + float round_value, + float output_scale, + float output_tail) +{ + NEAREST_INDEX_PROCESS() + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0); + uint4 dst; + dst = read_imageui(input, coord_in); + write_imageui(output, coord_out, dst); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl b/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl index 117d6d25e..87a9df7d2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl +++ 
b/src/tim/vx/internal/src/libnnext/ops/cl/tile.cl @@ -1,5 +1,5 @@ -#define TILE_3D(name0, name1, data_type, read_image_func, write_image_func) \ +#define TILE_3D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \ __kernel void tile_##name0##to##name1 \ ( \ __read_only image2d_array_t input, \ @@ -10,7 +10,9 @@ __kernel void tile_##name0##to##name1 \ int multiples_0, \ int multiples_1, \ int multiples_2, \ - int multiples_3 \ + int multiples_3, \ + float inoutscale, \ + float inouttail \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -18,7 +20,9 @@ __kernel void tile_##name0##to##name1 \ int width = get_image_width(input); \ int height = get_image_height(input); \ \ - data_type src; \ + src_type src; \ + dst_type dst; \ + \ read_image_func(src, input, coord); \ \ int batch_id = (short)coord.z / (short)depthIn; \ @@ -40,17 +44,19 @@ __kernel void tile_##name0##to##name1 \ for (int x = 0; x < multiples_0; x++) \ { \ coord_out.x = coord.x + x * width; \ - write_image_func(output, coord_out.xyzw, src); \ + dst = conv_type(convert_float4(src) * inoutscale + inouttail); \ + write_image_func(output, coord_out.xyzw, dst); \ } \ } \ } \ } \ } -TILE_3D(I32, I32, int4, READ_IMAGEI_2DARRAY, write_imagei) -TILE_3D(U32, U32, uint4, READ_IMAGEUI_2DARRAY, write_imageui) -TILE_3D(F32, F32, float4, READ_IMAGEF_2DARRAY, write_imagef) +TILE_3D(I32, I32, int4, int4, convert_int4_rte, READ_IMAGEI_2DARRAY, write_imagei) +TILE_3D(U32, U32, uint4, uint4, convert_uint4_rte, READ_IMAGEUI_2DARRAY, write_imageui) +TILE_3D(F32, F32, float4, float4,convert_float4_rte,READ_IMAGEF_2DARRAY, write_imagef) +TILE_3D(F32, U32, float4, uint4, convert_uint4_rte, READ_IMAGEF_2DARRAY, write_imageui) -#define TILE_2D(name0, name1, data_type, read_image_func, write_image_func) \ +#define TILE_2D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \ __kernel void tile_##name0##to##name1##_2D \ ( \ __read_only image2d_t input, \ @@ -61,7 +67,9 @@ __kernel void tile_##name0##to##name1##_2D \ int multiples_0, \ int multiples_1, \ int multiples_2, \ - int multiples_3 \ + int multiples_3, \ + float inoutscale, \ + float inouttail \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -70,22 +78,25 @@ __kernel void tile_##name0##to##name1##_2D \ int output_width = get_image_width(output); \ int output_height = get_image_height(output); \ \ - data_type src = read_image_func(input, coord); \ + src_type src = read_image_func(input, coord); \ + dst_type dst; \ \ do \ { \ do \ { \ - write_image_func(output, coord, src); \ + dst = conv_type(convert_float4(src) * inoutscale + inouttail); \ + write_image_func(output, coord, dst); \ coord.x += width; \ } while (coord.x < output_width); \ coord.x = get_global_id(0); \ coord.y += height; \ } while (coord.y < output_height); \ } -TILE_2D(I32, I32, int4, read_imagei, write_imagei) -TILE_2D(U32, U32, uint4, read_imageui, write_imageui) -TILE_2D(F32, F32, float4, read_imagef, write_imagef) +TILE_2D(I32, I32, int4, int4, convert_int4_rte, read_imagei, write_imagei) +TILE_2D(U32, U32, uint4, uint4, convert_uint4_rte, read_imageui, write_imageui) +TILE_2D(F32, F32, float4, float4,convert_float4_rte,read_imagef, write_imagef) +TILE_2D(F32, U32, float4, uint4, convert_uint4_rte, read_imagef, write_imageui) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis0.vx new file mode 100644 index 000000000..a20f024a3 
--- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis0.vx @@ -0,0 +1,191 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8; + +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4; + + +_viv_uniform int width; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float output_zp; + +__kernel void cumsum_ex_rev_F16toF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + if(exclusive == 0 && rev) + { + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniSumHorzRevF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev == 0) + { + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + for(; coord.x < width - 8;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x = coord.x + 1; + coord.x += 8; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev) + { + coord.x = width - 8; + coord_out.x = width - 1; + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, 
coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + for(; coord.x > 0;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x = coord.x - 1; + coord.x -= 8; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniSumHorzRevF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } +} + +#define CUMSUM_QINT_EX_REV_AXIS0(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \ + short zp = (short)input_zp; \ + \ + if(exclusive == 0 && rev) \ + { \ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + for(coord.x = -1; coord.x < width - 8;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.x = coord.x + 1; \ + coord.x += 8; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev) \ + { \ + for(coord.x = width - 7; coord.x > 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.x = coord.x - 1; \ + coord.x -= 8; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +CUMSUM_QINT_EX_REV_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_QINT_EX_REV_AXIS0(I8, I8, vxc_char16, vxc_char16) +CUMSUM_QINT_EX_REV_AXIS0(I16, I16, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis1.vx new file mode 100644 index 000000000..631964c5f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis1.vx @@ -0,0 +1,255 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int height; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +__kernel void cumsum_ex_rev_F16toF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + if(exclusive == 0 && rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev == 0) + { + dst ^= dst; + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + for(; coord.y < height - 1;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, data, src, 16); + + 
VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev) + { + dst ^= dst; + coord.y = height - 1; + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + for(; coord.y > 0;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y--; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } +} + +#define CUMSUM_8BITS_EX_REV_AXIS1(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + if(exclusive == 0 && rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.y < height - 1;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + 
VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev) \ + { \ + coord.y = height - 1; \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.y > 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \ + coord.y--; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +CUMSUM_8BITS_EX_REV_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_EX_REV_AXIS1(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_ex_rev_I16toI16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + if(exclusive == 0 && rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev == 0) + { + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + for(; coord.y < height - 1;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev) + { + coord.y = height - 1; + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + for(; coord.y > 0;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; + coord.y--; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis2.vx new file mode 100644 index 000000000..e8a8d2790 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_ex_rev_axis2.vx @@ -0,0 +1,252 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits 
uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int channel; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +__kernel void cumsum_ex_rev_F16toF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + if(rev && exclusive == 0) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(rev == 0 && exclusive) + { + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + for(; coord.z < channel - 1;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(rev && exclusive) + { + _viv_asm(COPY, dst, sum, 16); + coord.z = channel - 1; + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + for(; coord.z > 0;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z--; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } +} + +#define CUMSUM_8BITS_EX_REV_AXIS2(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + if(rev && exclusive == 0) \ + { \ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = 
convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ + uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.z < channel - 1;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(rev && exclusive) \ + { \ + coord.z = channel - 1; \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.z > 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \ + coord.z--; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * 
in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +CUMSUM_8BITS_EX_REV_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_EX_REV_AXIS2(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_ex_rev_I16toI16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + if(exclusive == 0 && rev) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev == 0) + { + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + for(; coord.z < channel - 1;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.z++; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } + else if(exclusive && rev) + { + coord.z = channel - 1; + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + for(; coord.z > 0;) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; + coord.z--; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx index b9f4e1754..60159d98a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx @@ -176,3 +176,135 @@ __kernel void cumsum_F16to##out_name##_axis0_2D( \ CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16) CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8) CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16) + +#define CUMSUM_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_F16to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + if(exclusive == 0 && rev) \ + { \ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.z < channel - 1;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z++; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + coord.z = channel - 1; \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.z > 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z--; \ + _viv_asm(COPY, 
data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +CUMSUM_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16) + +#define CUMSUM_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_F16to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + if(exclusive == 0 && rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.y < height - 1;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + else if(exclusive && rev) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + coord.y = height - 1; \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + for(; coord.y > 0;) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y--; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +CUMSUM_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb.vx b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb.vx new file mode 100644 index 000000000..2088285dd --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/custom_warp_affine_rgb.vx @@ -0,0 +1,316 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float4 matrix0; +_viv_uniform float2 matrix1; +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + vxc_uchar16 dst; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_in.x = coord_in.x + 1; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = coord_in.x + 1; + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.z = coord_in.z + 1; + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + coord_in.z = coord_in.z + 1; + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_bilinear_U8toU8_rgb_2D +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + vxc_uchar16 src0, src1, src_0, src_1, dst; + VXC_ReadImage(src_0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src_1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; + +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, 
src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_ReadImage(src_0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src_1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 dst; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); + coord_input.x = coord_input.x + 1; + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, 
VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void custom_warp_affine_bilinear_U8toU8_rgb +( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float _m0, + float _m1, + float _m2, + float _m3, + float _m4, + float _m5 +) +{ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1)); + + float4 coord_f = convert_float4(coord_in); + + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy; + + coord_in.x = floor(coord_f.x) * 3; + coord_in.y = floor(coord_f.y); + coord_in.z = floor(coord_f.z) * 3; + coord_in.w = floor(coord_f.w); + + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2)); + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_input.w, baseAddr); + + vxc_uchar16 src0, src1, src_0, src_1, dst; + VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; + +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); +#endif + + coord_input.xy = coord_in.zw; + VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + src0.x = src_0.s0; + src0.y = src_0.s3; + src1.x = src_1.s0; + src1.y = src_1.s3; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s1; + src0.y = src_0.s4; + src1.x = src_1.s1; + src1.y = src_1.s4; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, 
coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); +#endif + + src0.x = src_0.s2; + src0.y = src_0.s5; + src1.x = src_1.s2; + src1.y = src_1.s5; +#if (VX_VERSION==1) + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#else + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + src1.s0 = src0.s1; + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); +#endif + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx index 3a1661e85..73171a8b0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx @@ -18,6 +18,7 @@ __kernel void gather_I8toI8( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; vxc_char16 src; @@ -42,6 +43,7 @@ __kernel void gather_U8toU8( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; vxc_uchar16 src; @@ -66,8 +68,8 @@ __kernel void gather_I16toI16( int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -92,6 +94,7 @@ __kernel void gather_F16toF16( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -112,6 +115,7 @@ __kernel void gather_I8toI8_axis0( { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 indices = read_imagei(input1, coord.xx); + indices = indices >= 0 ? indices : indices + axis_num; int2 coord_in = (int2)(indices.x, get_global_id(1)); vxc_char16 src, dst; @@ -138,6 +142,7 @@ __kernel void gather_U8toU8_axis0( { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 indices = read_imagei(input1, coord.xx); + indices = indices >= 0 ? indices : indices + axis_num; int2 coord_in = (int2)(indices.x, get_global_id(1)); vxc_uchar16 src, dst; @@ -164,6 +169,7 @@ __kernel void gather_I16toI16_axis0( { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 indices = read_imagei(input1, coord.xx); + indices = indices >= 0 ? indices : indices + axis_num; int2 coord_in = (int2)(indices.x, get_global_id(1)); vxc_short8 src, dst; @@ -190,6 +196,7 @@ __kernel void gather_F16toF16_axis0( { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 indices = read_imagei(input1, coord.xx); + indices = indices >= 0 ? 
indices : indices + axis_num; int2 coord_in = (int2)(indices.x, get_global_id(1)); vxc_short8 src, dst; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx index 9ed287631..9c21fd131 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx @@ -18,6 +18,7 @@ __kernel void gather_I8toI8_array( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 1); @@ -46,6 +47,7 @@ __kernel void gather_U8toU8_array( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 1); @@ -74,8 +76,8 @@ __kernel void gather_I16toI16_array( int4 coord_in = (int4)(gidy, 0, gidx, 0); - int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 2); @@ -105,6 +107,7 @@ __kernel void gather_F16toF16_array( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; Image img1 = create_image_from_image2d(input0, 2); @@ -142,6 +145,7 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \ uchar* output_ptr = get_image_ptr_from_coord(img2, coord.xy); \ __global data_type* data_ptr = (__global data_type*)input_ptr; \ __global write_type* out_ptr = (__global write_type*)output_ptr; \ + indices = indices >= 0 ? indices : indices + axis_num; \ src.s0 = data_ptr[indices.x]; \ src.s1 = data_ptr[indices.y]; \ src.s2 = data_ptr[indices.z]; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx index 8d09d50d4..47f1db609 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx @@ -24,6 +24,7 @@ __kernel void gather_batch_I8toI8( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; vxc_char16 src; @@ -54,6 +55,7 @@ __kernel void gather_batch_U8toU8( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; vxc_uchar16 src; @@ -84,6 +86,7 @@ __kernel void gather_batch_I16toI16( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; vxc_short8 src; @@ -114,6 +117,7 @@ __kernel void gather_batch_F16toF16( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; vxc_short8 src; @@ -135,6 +139,7 @@ __kernel void gather_batch_I8toI8_axis0( { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 indices = read_imagei(input1, coord.xz); + indices = indices >= 0 ? 
indices : indices + axis_num; int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); vxc_char16 src, dst; @@ -163,6 +168,7 @@ __kernel void gather_batch_U8toU8_axis0( { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 indices = read_imagei(input1, coord.xz); + indices = indices >= 0 ? indices : indices + axis_num; int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); vxc_uchar16 src, dst; @@ -191,6 +197,7 @@ __kernel void gather_batch_I16toI16_axis0( { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 indices = read_imagei(input1, coord.xz); + indices = indices >= 0 ? indices : indices + axis_num; int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); vxc_short8 src, dst; @@ -219,6 +226,7 @@ __kernel void gather_batch_F16toF16_axis0( { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 indices = read_imagei(input1, coord.xz); + indices = indices >= 0 ? indices : indices + axis_num; int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); vxc_short8 src, dst; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx index 39a8a990d..9f962c410 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_elements.vx @@ -1,6 +1,12 @@ #include "cl_viv_vx_ext.h" _viv_uniform int axis_size; +_viv_uniform uint width0; +_viv_uniform uint height0; +_viv_uniform uint width1; +_viv_uniform uint height1; +_viv_uniform uint width_out; +_viv_uniform uint height_out; #define GATHER_ELEMENTS_AXIS0_2D(name, data_type) \ __kernel void gather_elements_axis0_##name##_I32to##name##_2D \ @@ -151,3 +157,141 @@ GATHER_ELEMENTS_AXIS2(F16, vxc_short4) GATHER_ELEMENTS_AXIS2(I16, vxc_short4) GATHER_ELEMENTS_AXIS2(I8, vxc_char4) GATHER_ELEMENTS_AXIS2(U8, vxc_uchar4) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t 
input1, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + int axis \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \ + int* index_ptr = (int*)index_tensor.ptr; \ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \ + \ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \ + data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \ + \ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1) + + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \ + ( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int axis \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + Image index_img = create_image_from_image2d(input1, 4); \ + int* index_ptr = (int*)index_img.ptr; \ + int index = index_ptr[coord.x + coord.y * width1]; \ + \ + Image input_img = create_image_from_image2d(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \ + data_type data = input_ptr[index + coord.y * width0]; \ + \ + Image output_img = create_image_from_image2d(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \ + output_ptr[coord.x + coord.y * width_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, 
uchar, uchar*, 1) + +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \ + ( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int axis \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + Image index_img = create_image_from_image2d(input1, 4); \ + int* index_ptr = (int*)index_img.ptr; \ + int index = index_ptr[coord.x + coord.y * width1]; \ + \ + Image input_img = create_image_from_image2d(input0, stride); \ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \ + data_type data = input_ptr[coord.x + index * width0]; \ + \ + Image output_img = create_image_from_image2d(output, stride); \ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \ + output_ptr[coord.x + coord.y * width_out] = data; \ +} +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1) +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx index e9b8fd14e..87825fd13 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx @@ -24,6 +24,7 @@ __kernel void gather_##src0_type_name##toF16( \ \ int4 coord_in = (int4)(gidy, 0, gidx, 0); \ int4 indice = read_imagei(input1, coord_in.xy); \ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \ coord_in.w = gidz * axis_num + indice.x; \ \ read_type src; \ @@ -60,6 +61,7 @@ __kernel void gather_F16to##src1_type_name( \ int4 coord_in = (int4)(gidy, 0, gidx, 0); \ \ int4 indice = read_imagei(input1, coord_in.xy); \ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \ coord_in.w = gidz * axis_num + indice.x; \ \ vxc_short8 src; \ @@ -92,6 +94,7 @@ __kernel void gather_I16toF16( int4 coord_in = (int4)(gidy, 0, gidx, 0); int4 indice = read_imagei(input1, coord_in.xy); + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.w = gidz * axis_num + indice.x; vxc_short8 src; @@ -122,6 +125,7 @@ __kernel void gather_##src0_type_name##toF16_axis0( \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ int4 indices = read_imagei(input1, coord.xx); \ + indices = indices >= 0 ? indices : indices + axis_num; \ int2 coord_in = (int2)(indices.x, get_global_id(1)); \ \ read_type src; \ @@ -153,6 +157,7 @@ __kernel void gather_F16to##src1_type_name##_axis0( \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ int4 indices = read_imagei(input1, coord.xx); \ + indices = indices >= 0 ? indices : indices + axis_num; \ int2 coord_in = (int2)(indices.x, get_global_id(1)); \ \ vxc_short8 src; \ @@ -184,6 +189,7 @@ __kernel void gather_I16toF16_axis0( { int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 indices = read_imagei(input1, coord.xx); + indices = indices >= 0 ? 
indices : indices + axis_num; int2 coord_in = (int2)(indices.x, get_global_id(1)); vxc_short8 src; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx index 0e94445ca..988c81183 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx @@ -33,6 +33,7 @@ __kernel void gather_batch_##src0_type_name##toF16( \ { \ int4 indice = read_imagei(input1, coord_idx); \ coord_idx.y++; \ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \ coord_in.y = gidz * axis_num + indice.x; \ \ read_type src; \ @@ -78,6 +79,7 @@ __kernel void gather_batch_F16to##src1_type_name( \ { \ int4 indice = read_imagei(input1, coord_idx); \ coord_idx.y++; \ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \ coord_in.y = gidz * axis_num + indice.x; \ \ vxc_short8 src; \ @@ -120,6 +122,7 @@ __kernel void gather_batch_I16toF16( { int4 indice = read_imagei(input1, coord_idx); coord_idx.y++; + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; coord_in.y = gidz * axis_num + indice.x; vxc_short8 src; @@ -145,6 +148,7 @@ __kernel void gather_batch_##src0_type_name##toF16_axis0( \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ int4 indices = read_imagei(input1, coord.xz); \ + indices = indices >= 0 ? indices : indices + axis_num; \ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \ \ read_type src; \ @@ -179,6 +183,7 @@ __kernel void gather_batch_F16to##src1_type_name##_axis0( \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ int4 indices = read_imagei(input1, coord.xz); \ + indices = indices >= 0 ? indices : indices + axis_num; \ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \ \ vxc_short8 src; \ @@ -213,6 +218,7 @@ __kernel void gather_batch_I16toF16_axis0( { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); int4 indices = read_imagei(input1, coord.xz); + indices = indices >= 0 ? 
indices : indices + axis_num; int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); vxc_short8 src, dst; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx index c479a3b58..e467f252e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx @@ -2,93 +2,96 @@ __kernel void gather_nd_batch_I8toI8_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, gidy, 0, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; - - coord.z = indice.x * block_size + gidx; + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); vxc_char16 src; - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_batch_U8toU8_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, gidy, 0, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; - coord.z = indice.x * block_size + gidx; + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); vxc_uchar16 src; - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_batch_I16toI16_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, gidy, 0, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + int4 coord = 
(int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; - coord.z = indice.x * block_size + gidx; + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); vxc_short8 src; - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_batch_F16toF16_1D( __read_only image2d_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, gidy, 0, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; - coord.z = indice.x * block_size + gidx; + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); vxc_short8 src; - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx index acc6c4cfc..58c2af349 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx @@ -2,18 +2,19 @@ __kernel void gather_nd_batch_I8toI8_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, 0, gidy, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; indice.x = indice.x * block_size + gidx; @@ -22,23 +23,24 @@ __kernel void gather_nd_batch_I8toI8_2D( vxc_char16 src; VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_U8toU8_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only 
image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, 0, gidy, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; indice.x = indice.x * block_size + gidx; @@ -46,23 +48,24 @@ __kernel void gather_nd_U8toU8_2D( vxc_uchar16 src; VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_I16toI16_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, 0, gidy, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; indice.x = indice.x * block_size + gidx; @@ -70,23 +73,24 @@ __kernel void gather_nd_I16toI16_2D( vxc_short8 src; VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } __kernel void gather_nd_F16toF16_2D( __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_t output, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, int block_size, int coord_dim ) { int gidx = get_global_id(0); // block_size - int gidy = get_global_id(1); // batch num + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num - int4 coord = (int4)(gidx, 0, gidy, 0); - Image img = create_image_from_image2d(input1, 4); - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); int4 indice = ((int4 *)indice_ptr)[0]; indice.x = indice.x * block_size + gidx; @@ -94,5 +98,5 @@ __kernel void gather_nd_F16toF16_2D( vxc_short8 src; VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx index 
c1b970d43..5dfbc3ad7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx @@ -184,12 +184,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ src_type src0; \ dst_type dst; \ vxc_short8 src1; \ @@ -235,7 +236,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ @@ -285,12 +286,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ src_type src0; \ dst_type dst; \ float scale_vari, bias_val; \ @@ -331,7 +333,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx index 3562ae557..8b45e178f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx @@ -17,12 +17,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx * 
rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ src_type src0; \ vxc_short8 src1, outval; \ vxc_half8 scale_h, dst; \ @@ -75,7 +76,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ @@ -132,12 +133,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ src_type src0; \ vxc_short8 outval; \ vxc_half8 dst; \ @@ -186,7 +188,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx index b62b67faf..33edef844 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx @@ -138,12 +138,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx* rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ vxc_short8 src0; \ vxc_short8 src1; \ vxc_half8 scale_h; \ @@ -195,7 +196,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ @@ -250,12 +251,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) 
\ { \ + int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + int4 coord = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \ vxc_short8 src0; \ src_type in_h; \ float scale_vari, bias_val; \ @@ -302,7 +304,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \ __read_only image2d_t meanVari, \ __write_only image2d_array_t output, \ - float eps, int is2D, float rSpaceOrg, int pStride) \ + float eps, int is2D, float rSpaceOrg, float pStride) \ { \ int gidz = get_global_id(1); \ int2 coord = (int2)(get_global_id(0), gidz); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx index 77fdcc99a..8086f28c9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx @@ -29,8 +29,8 @@ _viv_uniform VXC_512Bits uniConvertF16_0_4x4; _viv_uniform VXC_512Bits uniConvertF16_1_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; -#define GRUCELL_F16_F16TOF16(act_name, act_func) \ -__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \ +#define GRUCELL_F16_F16TOF16(act_name, act_func, rec_act_name, rec_act_func) \ +__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name##_##rec_act_name( \ __read_only image2d_t hstate_in, \ __read_only image2d_t input_z_conv, \ __read_only image2d_t input_r_conv, \ @@ -62,15 +62,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \ \ float4 r; \ VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ - r = act_func(r); \ + r = rec_act_func(r); \ float4 h0, h1; \ VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ float4 h = h0 + r * h1; \ float4 z; \ VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ - z = act_func(z); \ - h = tanh_func(h); \ + z = rec_act_func(z); \ + h = act_func(h); \ float4 h_tm; \ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ float4 result = (1 - z) * h + z * h_tm; \ @@ -83,14 +83,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(TANH, tanh_func, SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func) _viv_uniform float hstate_in_scale; _viv_uniform float hstate_in_tail; _viv_uniform float output_scale; _viv_uniform float output_zp; -#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \ -__kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name( \ +#define GRUCELL_QNT_F16TO_QNT(name, act_func, rec_act_func, src0_type, dst_type) \ +__kernel void grucell_reset_after_activation_##name( \ __read_only image2d_t hstate_in, \ 
__read_only image2d_t input_z_conv, \ __read_only image2d_t input_r_conv, \ @@ -122,15 +123,15 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name \ float4 r; \ VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ - r = act_func(r); \ + r = rec_act_func(r); \ float4 h0, h1; \ VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ float4 h = h0 + r * h1; \ float4 z; \ VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ - z = act_func(z); \ - h = tanh_func(h); \ + z = rec_act_func(z); \ + h = act_func(h); \ float4 h_tm; \ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ h_tm = h_tm * hstate_in_scale + hstate_in_tail; \ @@ -143,6 +144,9 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) -GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) -GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8) +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_short8, vxc_short8) +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross.vx new file mode 100644 index 000000000..b4dc43c24 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross.vx @@ -0,0 +1,208 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float output_ZP; +_viv_uniform float mulKIn0In1Zp; +_viv_uniform float inOutScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4; +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4; + +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b; + +#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \ + int axis_size, int inner_size, int outer_size, int axis_size0, \ + int inner_size0, int outer_size0, int axis_size1, int inner_size1, \ + int outer_size1, int axis_size2, int inner_size2, int outer_size2) \ +{ \ + read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \ + vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \ + int gidz = get_global_id(2); \ + for(int j = 0; j < outer_size; j++) \ + { \ + for(int i = 0; i < inner_size; i++) \ + { \ + vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \ + int4 coord_a = (int4)(0, 
get_global_id(1), gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; coord_b.y += 4; \ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + sum0 += tempA0 + tempB0; \ + sum1 += tempA1 + tempB1; \ + sum2 += tempA2 + tempB2; \ + sum3 += tempA3 + tempB3; \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = get_global_id(1); \ + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, 
coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +GEMM_QINT_TO_QINT_CROSS(U8, vxc_uchar16) +GEMM_QINT_TO_QINT_CROSS(I8, vxc_char16) + +__kernel void gemm_F16F16toF16_cross(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, + int axis_size, int inner_size, int outer_size, int axis_size0, + int inner_size0, int outer_size0, int axis_size1, int inner_size1, + int outer_size1, int axis_size2, int inner_size2, int outer_size2) +{ + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + for(int j = 0; j < outer_size; j++) + { + for(int i = 0; i < inner_size; i++) + { + int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); + + half4 valC; + vxc_short8 srcA0, srcA1, srcA2, srcA3, outC; + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3; + vxc_short16 srcB; + vxc_half16 tmpB; + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; coord_b.y += 4; + _viv_asm(COPY, tmpA0, srcA0, 16); + _viv_asm(COPY, tmpA1, srcA1, 16); + _viv_asm(COPY, tmpA2, srcA2, 16); + _viv_asm(COPY, tmpA3, srcA3, 16); + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + 
uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + sum0 += (tempA0); + sum1 += (tempA1); + sum2 += (tempA2); + sum3 += (tempA3); + } + coord_b.y = gidy; + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(CONV, valC, sum0); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum1); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum2); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum3); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross_i16.vx new file mode 100644 index 000000000..241118079 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_cross_i16.vx @@ -0,0 +1,214 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int input0_ZP; +_viv_uniform int input1_ZP; +_viv_uniform float output_ZP; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +_viv_uniform int outer; + +#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + short in0_zp, in1_zp; \ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \ + for(int i = 0; i < outer; i++) \ + { \ + read_type srcA, srcB, outC; \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0); \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + coord_b.z = get_global_id(2) + i * get_global_size(2); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ + 
VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +GEMM_QINT_TO_QINT_MERGE(I16, vxc_short8) + +#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \ + int axis_size, int inner_size, int outer_size, int axis_size0, \ + int inner_size0, int outer_size0, int axis_size1, int inner_size1, \ + int outer_size1, int axis_size2, int inner_size2, int outer_size2) \ +{ \ + uint gidy = get_global_id(1); \ + uint gidz = get_global_id(2); \ + short in0_zp, in1_zp; \ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \ + for(int j = 0; j < outer_size; j++) \ + { \ + for(int i = 0; i < inner_size; i++) \ + { \ + read_type srcA, srcB, outC; \ + int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 
3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + } \ + } \ +} +GEMM_QINT_TO_QINT_CROSS(I16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_merge.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_merge.vx new file mode 100644 index 000000000..9f33be797 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_merge.vx @@ -0,0 +1,294 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float output_ZP; +_viv_uniform float mulKIn0In1Zp; +_viv_uniform float inOutScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; +_viv_uniform int outer; + +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4; +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4; + +_viv_uniform VXC_512Bits 
uniGemmFp16toFp32Row0Lo_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Hi_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Lo_4x4; +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Hi_4x4; + +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b; + +#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \ + vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \ + for(int i = 0; i < outer; i++) \ + { \ + vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; coord_b.y += 4; \ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8toFp32Block4_4x4); \ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGemmU8U8MulZptoFp32_8x4); \ + sum0 += tempA0 + tempB0; \ + sum1 += 
tempA1 + tempB1; \ + sum2 += tempA2 + tempB2; \ + sum3 += tempA3 + tempB3; \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = get_global_id(1); \ + coord_b.z = get_global_id(2) + i * get_global_size(2); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +GEMM_QINT_TO_QINT_MERGE(U8, vxc_uchar16) +GEMM_QINT_TO_QINT_MERGE(I8, vxc_char16) + +#if (VX_VERSION==2) +__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + for(int i = 0; i < outer; i++) + { + int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0); + + half4 valC; + vxc_short8 srcA0, srcA1, srcA2, srcA3, outC; + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3; + vxc_short16 srcB; + vxc_half16 tmpB; + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; coord_b.y += 4; + _viv_asm(COPY, tmpA0, srcA0, 16); + _viv_asm(COPY, tmpA1, srcA1, 16); + _viv_asm(COPY, tmpA2, srcA2, 16); + _viv_asm(COPY, tmpA3, srcA3, 16); + _viv_asm(COPY, tmpB.hi, srcB.hi, 16); + _viv_asm(COPY, tmpB.lo, srcB.lo, 16); + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmU8F16toF32Lo_4x4b); + sum0 += (tempA0); + sum1 += (tempA1); + sum2 += (tempA2); + sum3 += (tempA3); + } + coord_b.y = gidy; + coord_b.z = get_global_id(2) + i * get_global_size(2); + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(CONV, valC, sum0); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum1); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum2); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum3); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, 
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } +} +#else +__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + for(int i = 0; i < outer; i++) + { + int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0); + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); + + half4 valC; + vxc_short8 srcA0, srcB0, srcA1, srcB1, outC; + vxc_half8 tmpA0, tmpB0, tmpA1, tmpB1; + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + vxc_float4 tempB0, tempB1, tempB2, tempB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; coord_b.y += 4; + _viv_asm(COPY, tmpA0, srcA0, 16); + _viv_asm(COPY, tmpB0, srcB0, 16); + _viv_asm(COPY, tmpA1, srcA1, 16); + _viv_asm(COPY, tmpB1, srcB1, 16); + + VXC_DP4x4(tempA0, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row0Lo_4x4); + VXC_DP4x4(tempB0, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row0Hi_4x4); + VXC_DP4x4(tempA1, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row1Lo_4x4); + VXC_DP4x4(tempB1, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row1Hi_4x4); + VXC_DP4x4(tempA2, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row0Lo_4x4); + VXC_DP4x4(tempB2, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row0Hi_4x4); + VXC_DP4x4(tempA3, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row1Lo_4x4); + VXC_DP4x4(tempB3, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniGemmFp16toFp32Row1Hi_4x4); + sum0 += (tempA0 + tempB0); + sum1 += (tempA1 + tempB1); + sum2 += (tempA2 + tempB2); + sum3 += (tempA3 + tempB3); + } + coord_b.y = gidy; + coord_b.z = get_global_id(2) + i * get_global_size(2); + _viv_asm(COPY, output_desc, output, 
sizeof(output_desc)); + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr); + _viv_asm(CONV, valC, sum0); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum1); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum2); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_b.y++; + _viv_asm(CONV, valC, sum3); + _viv_asm(COPY, outC, valC, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } +} +#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_BF16_to_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_BF16_to_BF16.vx new file mode 100644 index 000000000..03b2c33d4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_BF16_to_BF16.vx @@ -0,0 +1,99 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + +_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8; +_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8; + +#define GRID_SAMPLE_BF16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 src; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = 
y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + + +__kernel void nearest_grid_sample_BF16_BF16toBF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + + float4 fxy0; + float4 fxy1; + + vxc_short8 read_src; + VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8); + _viv_asm(COPY, fxy0, read_src, 16); + VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8); + _viv_asm(COPY, fxy1, read_src, 16); + + + + GRID_SAMPLE_BF16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_F16.vx new file mode 100644 index 000000000..ec90f1daa --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_F16.vx @@ -0,0 +1,148 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4; +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4; +_viv_uniform VXC_512Bits uniExtactHalf8_2x8; + +#define GRID_SAMPLE_F16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 src; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + 
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void nearest_grid_sample_F16_F32toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + + GRID_SAMPLE_F16_PROCESS(); + +} + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + +__kernel void nearest_grid_sample_F16_U8toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + coord_in1.xz = coord_in1.xz * 2; + vxc_uchar16 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 fxy0; + float4 fxy1; + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_F16_PROCESS(); + +} + + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void nearest_grid_sample_F16_F16toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_F16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_U8.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_U8.vx new file mode 100644 index 000000000..6a43dddd0 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_F16_to_U8.vx @@ -0,0 +1,171 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +#define GRID_SAMPLE_F16_to_U8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 s0; \ + vxc_uchar16 result; \ + vxc_half8 src; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, s0, 16); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + float4 dst4; \ + int4 dst; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \ + dst4 = dst4 * uint8Scale + output_ZP; \ + dst = convert_int4_rte(dst4); \ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, s0, 16); \ + } \ + VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \ + dst4 = dst4 * 
uint8Scale + output_ZP; \ + dst = convert_int4_rte(dst4); \ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void nearest_grid_sample_F16_F32toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + + +__kernel void nearest_grid_sample_F16_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_uchar16 read_coord; + + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + + +__kernel void nearest_grid_sample_F16_F16toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I16_to_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I16_to_I16.vx new file mode 100644 index 000000000..b838b08d8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I16_to_I16.vx @@ -0,0 +1,98 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4; +_viv_uniform float input1_scale; +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8; + + +#define GRID_SAMPLE_I16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, 
fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 src, dst; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void nearest_grid_sample_I16_I16toI16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + vxc_short8 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1_scale; + fxy1 = fxy1 * input1_scale; + + GRID_SAMPLE_I16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I8_to_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I8_to_I8.vx new file mode 
100644 index 000000000..871383cbc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_I8_to_I8.vx @@ -0,0 +1,97 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + + +_viv_uniform float input1_scale; +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4; +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8; + +#define GRID_SAMPLE_I8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_char16 src, dst; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void nearest_grid_sample_I8_I8toI8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + vxc_char16 read_coord; + 
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1_scale; + fxy1 = fxy1 * input1_scale; + + GRID_SAMPLE_I8_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_U8_to_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_U8_to_U8.vx new file mode 100644 index 000000000..696c96dc9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/nearest_grid_sample_U8_to_U8.vx @@ -0,0 +1,160 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + +_viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp + +#define GRID_SAMPLE_U8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + int4 x_idx = convert_int4(in_x); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + int4 y_idx = convert_int4(in_y); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_uchar16 src, dst; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + while (coord_in.z < loop) \ + { \ + VXC_DP2x8(dst, src, multiplier, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + 
VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP2x8(dst, src, multiplier, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void nearest_grid_sample_U8_F32toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + GRID_SAMPLE_U8_PROCESS(); + +} + + +__kernel void nearest_grid_sample_U8_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_uchar16 read_coord; + + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_U8_PROCESS(); + +} + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void nearest_grid_sample_U8_F16toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_U8_PROCESS(); + +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx index 19873f170..438d7be12 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx @@ -22,8 +22,8 @@ __kernel void pow_##name \ \ src0_type src0; \ copy0_type data0; \ - src0_type src1; \ - copy0_type data1; \ + src1_type src1; \ + copy1_type data1; \ VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, data0, src0, 16); \ VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ @@ -94,8 +94,8 @@ __kernel 
void pow_##name##_2D \ \ src0_type src0; \ copy0_type data0; \ - src0_type src1; \ - copy0_type data1; \ + src1_type src1; \ + copy1_type data1; \ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, data0, src0, 16); \ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx index 28f3f0c0e..91e4213dd 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_bgra.vx @@ -28,9 +28,21 @@ _viv_uniform int zp; _viv_uniform float outputScale; __kernel void pre_process_bgra_scale_U8toU8( - __read_only image2d_array_t input, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + float gMean, + float bMean, + float r_scale, + int reverse_channel, + int trans, + float g_scale, + float b_scale + ) { int4 gidx = get_global_id(0); int gidy = get_global_id(1); @@ -86,6 +98,7 @@ __kernel void pre_process_bgra_scale_U8toU8( int4 tmp1, tmp2, result1, result2; float4 tmpDst, tmp0; float4 mean = (float4)(bMean, gMean, rMean, 0); + float4 var = (float4)(b_scale, g_scale, r_scale, 0); //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x); int tmpV = 1 << 19; vxc_short8 tmpFx; @@ -148,9 +161,21 @@ __kernel void pre_process_bgra_scale_U8toU8( } __kernel void pre_process_bgra_copy_U8toU8( - __read_only image2d_array_t input, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + float gMean, + float bMean, + float r_scale, + int reverse_channel, + int trans, + float g_scale, + float b_scale +) { int2 pos = (int2)((get_global_id(0) + (*xOffset)) << 2, get_global_id(1) + (*yOffset)); @@ -165,10 +190,10 @@ __kernel void pre_process_bgra_copy_U8toU8( VXC_DP4x4(tmpG, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGfromBgra_4x4); VXC_DP4x4(tmpR, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRfromBgra_4x4); - tmpDst = (tmpB - bMean) * var; + tmpDst = (tmpB - bMean) * b_scale; result1 = convert_int4_rte(tmpDst * outputScale + zp); - tmpDst = (tmpG - gMean) * var; + tmpDst = (tmpG - gMean) * g_scale; result2 = convert_int4_rte(tmpDst * outputScale + zp); VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); @@ -178,7 +203,7 @@ __kernel void pre_process_bgra_copy_U8toU8( dstPos.z = 1; VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); - tmpDst = (tmpR - rMean) * var; + tmpDst = (tmpR - rMean) * r_scale; result1 = convert_int4_rte(tmpDst * outputScale + zp); VXC_DP2x8(dst, result1, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx index fcc8d9c06..a20a579f6 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx @@ -3,7 +3,10 @@ _viv_uniform int bOrder; _viv_uniform int rOrder; -_viv_uniform float outputScaleVar; +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; + _viv_uniform float bMeanScaleVarZp; _viv_uniform float gMeanScaleVarZp; _viv_uniform float rMeanScaleVarZp; @@ -28,10 +31,12 @@ __kernel void pre_process_nv12_copy_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ int trans, \ - int nv_type \ + int nv_type, \ + float g_scale, \ + float b_scale \ ) \ { \ int gidx = get_global_id(0); \ @@ -65,21 +70,21 @@ __kernel void pre_process_nv12_copy_##name \ dst_type dst0; \ save_type dst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstB); \ dstPos.z = bOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstG); \ dstPos.z = 1; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstR); \ dstPos.z = rOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx index f4ac83b40..2fe9ad62f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx @@ -3,7 +3,10 @@ _viv_uniform int bOrder; _viv_uniform int rOrder; -_viv_uniform float outputScaleVar; +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; + _viv_uniform float bMeanScaleVarZp; _viv_uniform float gMeanScaleVarZp; _viv_uniform float rMeanScaleVarZp; @@ -36,10 +39,12 @@ __kernel void pre_process_nv12_scale_##name##_gq \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ int trans, \ - int nv_type \ + int nv_type, \ + float g_scale, \ + float b_scale \ ) \ { \ uint4 gidx = get_global_id(0); \ @@ -93,21 +98,21 @@ __kernel void pre_process_nv12_scale_##name##_gq \ dst_type dst0; \ save_type dst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstB); \ dstPos.z = bOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), 
uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstG); \ dstPos.z = 1; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstR); \ dstPos.z = rOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ @@ -132,10 +137,12 @@ __kernel void pre_process_nv12_scale_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ int trans, \ - int nv_type \ + int nv_type, \ + float g_scale, \ + float b_scale \ ) \ { \ uint4 gidx = get_global_id(0); \ @@ -187,21 +194,21 @@ __kernel void pre_process_nv12_scale_##name \ dst_type dst0; \ save_type dst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstB); \ dstPos.z = bOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstG); \ dstPos.z = 1; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstR); \ dstPos.z = rOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx index 536c18df0..c42f2eb6b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb.vx @@ -29,9 +29,11 @@ __write_only image2d_array_t output, \ float rMean, \ float gMean, \ float bMean, \ - float f32Var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 ratioXY = (int2)(*xRatio, *yRatio); \ @@ -80,7 +82,7 @@ __write_only image2d_array_t output, \ \ float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \ \ - bgrMean *= f32Var; \ + bgrMean *= (float4)(b_scale, g_scale, r_scale, 0); \ \ int4 test01, temp1; \ int4 test02, temp2; \ @@ -113,7 +115,7 @@ __write_only image2d_array_t output, \ \ /*convert U8 to dst*/ \ dst_type dst; \ - tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \ + tmp_dst = tmp_dst * r_scale - bgrMean.zzzz; \ tmp_dst = tmp_dst * outputScale + outputZP; \ conv_type dst0; \ _viv_asm(CONV_RTE, dst0, tmp_dst); \ @@ -140,7 +142,7 @@ __write_only 
image2d_array_t output, \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ \ - tmp_dst = tmp_dst * f32Var - bgrMean.y; \ + tmp_dst = tmp_dst * g_scale - bgrMean.y; \ tmp_dst = tmp_dst * outputScale + outputZP; \ _viv_asm(CONV_RTE, dst0, tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -165,7 +167,7 @@ __write_only image2d_array_t output, \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ \ - tmp_dst = tmp_dst * f32Var - bgrMean.x; \ + tmp_dst = tmp_dst * b_scale - bgrMean.x; \ tmp_dst = tmp_dst * outputScale + outputZP; \ _viv_asm(CONV_RTE, dst0, tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx index 5cb3ebbe7..a008b46e2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx @@ -10,8 +10,9 @@ _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform float output_scale; _viv_uniform float output_zp; +_viv_uniform int4 rgb_order; -#define RESIZE_BILINEAR_4X1(mean, output) \ +#define RESIZE_BILINEAR_4X1(scale, mean, output, _coord) \ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ @@ -49,21 +50,19 @@ _viv_uniform float output_zp; VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ \ - tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \ _viv_asm(CONV, dst0, tmp_dst); \ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst1, 8); \ - VXC_WriteImage(output, coord_out, dst, \ + VXC_WriteImage(output, _coord, dst, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); #define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ ( \ __read_only image2d_array_t input, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -71,7 +70,11 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 ratioXY = (int2)(*xRatio, *yRatio); \ @@ -133,7 +136,8 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ int4 test02, temp2; \ int4 tt; \ vxc_uchar4 val; \ - int2 coord_out = (int2)(xPos.x, yPos); \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.yzw += rgb_order.xyz; \ \ vxc_uchar8 line1, line2; \ \ @@ -158,16 +162,16 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ conv_type dst0; \ dst_type dst1; \ copy_type dst; \ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + 
output_zp; \ _viv_asm(CONV, dst0, tmp_dst); \ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst1, 8); \ - VXC_WriteImage(output0, coord_out, dst, \ + VXC_WriteImage(output, coord_out.xy, dst, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - RESIZE_BILINEAR_4X1(gMean, output1) \ - RESIZE_BILINEAR_4X1(bMean, output2) \ + RESIZE_BILINEAR_4X1(g_scale, gMean, output, coord_out.xz) \ + RESIZE_BILINEAR_4X1(b_scale, bMean, output, coord_out.xw) \ } PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8) PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8) @@ -176,9 +180,7 @@ PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8) __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ ( \ __read_only image2d_array_t input, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -186,7 +188,11 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 ratioXY = (int2)(*xRatio, *yRatio); \ @@ -241,7 +247,8 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ \ int4 test01, temp1; \ int4 test02, temp2; \ - int2 coord_out = (int2)(xPos.x, yPos); \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.yzw += rgb_order.xyz; \ \ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniVecShift10); \ @@ -265,12 +272,12 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ \ int4 dst0; \ write_type dst; \ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ coord_in.x = coord.x; \ coord_in.z = 1; \ @@ -310,12 +317,12 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ uniExtractBytes); \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ - tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xz, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ coord_in.x = coord.x; \ coord_in.z = 2; \ @@ -355,12 +362,12 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ uniExtractBytes); \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ - tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); 
\ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16) PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx index b0714e47c..724b28ad3 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx @@ -6,14 +6,13 @@ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8; _viv_uniform float output_scale; _viv_uniform float output_zp; +_viv_uniform int4 rgb_order; #define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ ( \ __read_only image2d_array_t input, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -21,7 +20,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ @@ -38,8 +41,9 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ coord.x = coord.z + 8; \ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ - rMean * output_scale - output_zp, output_scale); \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ \ half4 paramData_f16; \ copy_type tmp_dst; \ @@ -49,33 +53,38 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + int4 coord_out = coord; \ + coord_out.yw = coord_out.ww + rgb_order.xy; \ + VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ - gMean * output_scale - output_zp, output_scale); \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData1); \ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst1, src1, paramData_f16, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \ - bMean * output_scale - output_zp, output_scale); \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData2); \ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.w = coord.w + rgb_order.z; \ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8) PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8) @@ -84,9 +93,7 @@ PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8) __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ ( \ __read_only image2d_array_t input, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -94,7 +101,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ @@ -110,8 +121,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ coord_in.z ++; \ VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ - rMean * output_scale - output_zp, output_scale); \ + int4 coord_out = coord; \ + coord_out.xyw = coord.www + rgb_order.xyz; \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ \ half4 paramData_f16; \ _viv_asm(CONV, paramData_f16, paramData0); \ @@ -120,27 +134,29 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + 
VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ - gMean * output_scale - output_zp, output_scale); \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData1); \ \ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \ - bMean * output_scale - output_zp, output_scale); \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData2); \ \ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ } PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16) PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx index 1ac60fe72..ed58fa920 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx @@ -5,13 +5,12 @@ _viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8; _viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4; _viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4; _viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4; +_viv_uniform int4 rgb_order; __kernel void pre_process_rgb888_planar_4over3_U8toU8 ( __read_only image2d_array_t input, - __write_only image2d_array_t output0, - __write_only image2d_array_t output1, - __write_only image2d_array_t output2, + __write_only image2d_array_t output, global int *xRatio, global int *yRatio, global int *xOffset, @@ -19,7 +18,11 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 float rMean, float gMean, float bMean, - float f32Var + float r_scale, + int reverse, + int height, + float g_scale, + float b_scale ) { int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); @@ -49,9 +52,11 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, 
VXC_RM_TowardZero, 0)); + int4 coord_r = coord_out; + coord_r.yzw += rgb_order.xxx; + VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); @@ -72,9 +77,11 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_g = coord_out; + coord_g.yzw += rgb_order.yyy; + VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); @@ -94,17 +101,17 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_b = coord_out; + coord_b.yzw += rgb_order.zzz; + VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); } __kernel void pre_process_rgb888_planar_half_U8toU8 ( __read_only image2d_array_t input, - __write_only image2d_array_t output0, - __write_only image2d_array_t output1, - __write_only image2d_array_t output2, + __write_only image2d_array_t output, global int *xRatio, global int *yRatio, global int *xOffset, @@ -112,7 +119,11 @@ __kernel void pre_process_rgb888_planar_half_U8toU8 float rMean, float gMean, float bMean, - float f32Var + float r_scale, + int reverse, + int height, + float g_scale, + float b_scale ) { int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); @@ -130,7 +141,9 @@ __kernel void pre_process_rgb888_planar_half_U8toU8 int2 coord = coord_in.xy >> 1; - VXC_WriteImage(output0, coord, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_rgb = coord.xyyy; + coord_rgb.yzw += rgb_order.xyz; + VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, 
coord_rgb.xz, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_0.vx new file mode 100644 index 000000000..336c4e6e1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_0.vx @@ -0,0 +1,377 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define RESIZE_BILINEAR_4X1(scale, mean) \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.w; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + coord_in.x = coord.x; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); +#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, 
*yRatio); \ + \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + int4 coord_in = (int4)(0, 0, 0, 0); \ + sx = sx + *xOffset; \ + coord = sx.xyzw; \ + coord_in.y = sy + *yOffset; \ + coord_in.x = coord.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.w; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + coord_in.x = coord.x; \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int4 tt; \ + vxc_uchar4 val; \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.x = coord_out.x * 3; \ + coord_out.z = coord_out.x + 8; \ + \ + vxc_uchar8 line1, line2; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + conv_type dst0; \ + dst_type dst1, dst2; \ + copy_type data0, data1, dst; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + RESIZE_BILINEAR_4X1(g_scale, gMean) \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + RESIZE_BILINEAR_4X1(b_scale, bMean) \ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, data0, dst1, 16); \ + _viv_asm(COPY, data1, dst2, 16); \ + VXC_DP2x8(dst, data0, data1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_0_2x8); \ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8) +PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8) + +#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + sx = sx + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + int4 coord_in = (int4)(coord.xw, 0, 0); \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int2 coord_out = (int2)(xPos.x, yPos); \ + coord_out.x = coord_out.x * 3; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + int4 dst0; \ + write_type dst1, dst; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + coord_in.x = coord.x; \ + coord_in.z = 1; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + coord_in.x = coord.x; \ + coord_in.z = 2; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_0_2x8); \ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16) +PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_1.vx new file mode 100644 index 000000000..80c603bc2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_1.vx @@ -0,0 +1,153 @@ + +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8; +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8; + +#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2)(*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + dst_type dst0, dst1; \ + \ + int4 coord_in = (int4)(coord.xy, 0, 0); \ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord; \ + coord_out.z = coord_out.z * 3; \ + coord_out.x = coord_out.z + 8; \ + float4 paramData0 = (float4)(rMean * output_scale * r_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); 
\ + \ + half4 paramData_f16; \ + copy_type data0, data1, data2, dst; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + float4 paramData1 = (float4)(gMean * output_scale * g_scale - output_zp,\ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + _viv_asm(COPY, data0, dst0, 16); \ + \ + float4 paramData2 = (float4)(bMean * output_scale * b_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + _viv_asm(COPY, data1, dst1, 16); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_0_2x8); \ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8) +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8) + +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2) (*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + write_type dst0, dst1, dst2, dst3; \ + \ + int4 coord_in = (int4)(coord.xy, 0, 0); \ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord; \ + coord_out.z = coord_out.z * 3; \ + coord_out.x = coord_out.z + 16; \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ + \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + \ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + \ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + \ + float4 paramData2 = 
(float4)(bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + \ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_0_2x8); \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_1_2x8); \ + VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_2_2x8); \ + VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16) +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_2.vx new file mode 100644 index 000000000..8d686ebd6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_nhwc_2.vx @@ -0,0 +1,57 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8; + +__kernel void pre_process_rgb888_planar_half_U8toU8_nhwc + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float r_scale, + int reverse, + float g_scale, + float b_scale + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_uchar16 src0, src1, src2; + + VXC_ReadImage2DArray(src0, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; + VXC_ReadImage2DArray(src1, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; + VXC_ReadImage2DArray(src2, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int4 coord; + coord.xy = coord_in.xy >> 1; + + coord.x = coord.x * 3; + coord.z = coord.x + 16; + + vxc_uchar16 dst0, dst1; + src0.lo = src0.s02468ace; + src0.hi = src1.s02468ace; + src1.lo = src2.s02468ace; + + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_0_2x8); + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_1_2x8); + VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_2_2x8); + + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx index 107846e09..de9dbdeaf 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx @@ -10,8 +10,9 @@ _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform float output_scale; _viv_uniform float output_zp; +_viv_uniform int4 rgb_order; -#define 
RESIZE_BILINEAR_4X1(input, mean, output) \ +#define RESIZE_BILINEAR_4X1(input, scale, mean, output, _coord) \ VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ @@ -41,12 +42,12 @@ _viv_uniform float output_zp; VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ \ - tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \ + tmp_dst = tmp_dst * scale * output_scale - scale * mean * output_scale + output_zp; \ _viv_asm(CONV, dst0, tmp_dst); \ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst1, 8); \ - VXC_WriteImage(output, coord_out, dst, \ + VXC_WriteImage(output, _coord, dst, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); #define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \ @@ -55,9 +56,7 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __read_only image2d_array_t input2, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -65,7 +64,11 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 ratioXY = (int2)(*xRatio, *yRatio); \ @@ -118,7 +121,8 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ int4 test02, temp2; \ int4 tt; \ vxc_uchar4 val; \ - int2 coord_out = (int2)(xPos.x, yPos); \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.yzw += rgb_order.xyz; \ \ vxc_uchar8 line1, line2; \ \ @@ -143,16 +147,16 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ conv_type dst0; \ dst_type dst1; \ copy_type dst; \ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ _viv_asm(CONV, dst0, tmp_dst); \ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst1, 8); \ - VXC_WriteImage(output0, coord_out, dst, \ + VXC_WriteImage(output, coord_out.xy, dst, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - RESIZE_BILINEAR_4X1(input1, gMean, output1) \ - RESIZE_BILINEAR_4X1(input2, bMean, output2) \ + RESIZE_BILINEAR_4X1(input1, g_scale, gMean, output, coord_out.xz) \ + RESIZE_BILINEAR_4X1(input2, b_scale, bMean, output, coord_out.xw) \ } RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8) RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8) @@ -163,9 +167,7 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __read_only image2d_array_t input2, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -173,7 +175,11 @@ __kernel void 
pre_process_rgb888_planar_sep_scale_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 ratioXY = (int2)(*xRatio, *yRatio); \ @@ -221,7 +227,8 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ \ int4 test01, temp1; \ int4 test02, temp2; \ - int2 coord_out = (int2)(xPos.x, yPos); \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.yzw += rgb_order.xyz; \ \ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniVecShift10); \ @@ -245,12 +252,13 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ \ int4 dst0; \ write_type dst; \ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xy, dst, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ @@ -282,12 +290,13 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ uniExtractBytes); \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ - tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xz, \ + dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ @@ -319,12 +328,13 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ uniExtractBytes); \ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ uniConvertIntergetoF32_4x4); \ - tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \ dst0 = convert_int4_rte(tmp_dst); \ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ uniExtract8Data_2x8); \ \ - VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, \ + dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16) -RGB888_PLANAR_SEP_8BITS(I8, vxc_char16) \ No newline at end of file +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx index ff55851a6..b308e65cc 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx @@ -5,6 
+5,7 @@ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8; _viv_uniform float output_scale; _viv_uniform float output_zp; +_viv_uniform int4 rgb_order; #define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ @@ -12,9 +13,7 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __read_only image2d_array_t input2, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -22,7 +21,11 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ @@ -36,8 +39,9 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ coord.x = coord.z + 8; \ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ - rMean * output_scale - output_zp, output_scale); \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ \ half4 paramData_f16; \ copy_type tmp_dst; \ @@ -47,33 +51,38 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + int4 coord_out = coord; \ + coord_out.yw = coord_out.ww + rgb_order.xy; \ + VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ - gMean * output_scale - output_zp, output_scale); \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData1); \ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - 
output_zp, \ - bMean * output_scale - output_zp, output_scale); \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData2); \ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ uniDataMeanStddevHi_2x8); \ _viv_asm(COPY, tmp_dst, dst0, 16); \ - VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.w = coord.w + rgb_order.z; \ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, tmp_dst, dst1, 16); \ - VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8) RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8) @@ -84,9 +93,7 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __read_only image2d_array_t input2, \ - __write_only image2d_array_t output0, \ - __write_only image2d_array_t output1, \ - __write_only image2d_array_t output2, \ + __write_only image2d_array_t output, \ global int *xRatio, \ global int *yRatio, \ global int *xOffset, \ @@ -94,7 +101,11 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var \ + float r_scale, \ + int reverse, \ + int height, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ @@ -107,8 +118,11 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ - rMean * output_scale - output_zp, output_scale); \ + int4 coord_out = coord; \ + coord_out.xyw += rgb_order.xyz; \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ \ half4 paramData_f16; \ _viv_asm(CONV, paramData_f16, paramData0); \ @@ -117,27 +131,29 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ - gMean * output_scale - output_zp, output_scale); \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData1); \ \ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \ - bMean * output_scale - output_zp, output_scale); \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ _viv_asm(CONV, paramData_f16, paramData2); \ \ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevLo_2x8); \ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ uniDataMeanStddevHi_2x8); \ - VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ } PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16) PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx index bbfed6e7e..51a97f047 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx @@ -5,15 +5,14 @@ _viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8; _viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4; _viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4; _viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4; +_viv_uniform int4 rgb_order; __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 ( __read_only image2d_array_t input0, __read_only image2d_array_t input1, __read_only image2d_array_t input2, - __write_only image2d_array_t output0, - __write_only image2d_array_t output1, - __write_only image2d_array_t output2, + __write_only image2d_array_t output, global int *xRatio, global int *yRatio, global int *xOffset, @@ -21,7 +20,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 float rMean, float gMean, float bMean, - float f32Var + float r_scale, + int reverse, + int height, + float g_scale, + float b_scale ) { int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); @@ -47,9 +50,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_r = coord_out; + coord_r.yzw += rgb_order.xxx; + VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 
15, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); @@ -65,9 +70,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_g = coord_out; + coord_g.yzw += rgb_order.yyy; + VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); @@ -83,9 +90,11 @@ __kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); - VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_b = coord_out; + coord_b.yzw += rgb_order.zzz; + VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); } __kernel void pre_process_rgb888_planar_sep_half_U8toU8 @@ -93,9 +102,7 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8 __read_only image2d_array_t input0, __read_only image2d_array_t input1, __read_only image2d_array_t input2, - __write_only image2d_array_t output0, - __write_only image2d_array_t output1, - __write_only image2d_array_t output2, + __write_only image2d_array_t output, global int *xRatio, global int *yRatio, global int *xOffset, @@ -103,7 +110,11 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8 float rMean, float gMean, float bMean, - float f32Var + float r_scale, + int reverse, + int height, + float g_scale, + float b_scale ) { int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); @@ -116,7 +127,9 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8 coord_in.zw = coord_in.xy >> 1; - VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + int4 coord_rgb = coord_in.zwww; + coord_rgb.yzw += rgb_order.xyz; + VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_rgb.xz, 
src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_0.vx new file mode 100644 index 000000000..a9b792599 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_0.vx @@ -0,0 +1,342 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define RESIZE_BILINEAR_4X1(input, scale, mean) \ + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); + +#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 
1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + sx = sx + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int4 tt; \ + vxc_uchar4 val; \ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \ + coord_out.x = coord_out.x * 3; \ + coord_out.z = coord_out.x + 8; \ + \ + vxc_uchar8 line1, line2; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + conv_type dst0; \ + dst_type dst1, dst2; \ + copy_type data0, data1, dst; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + RESIZE_BILINEAR_4X1(input1, g_scale, gMean) \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + RESIZE_BILINEAR_4X1(input2, b_scale, bMean) \ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, data0, dst1, 16); \ + _viv_asm(COPY, data1, dst2, 16); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_0_2x8); \ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, 
coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8) +RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8) + +#define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + sx = sx + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int2 coord_out = (int2)(xPos.x, yPos); \ + coord_out.x = coord_out.x * 3; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + int4 dst0; \ + write_type dst1, dst; \ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + 
VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + 
VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_0_2x8); \ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \ +} +RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16) +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_1.vx new file mode 100644 index 000000000..1ae298c22 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_1.vx @@ -0,0 +1,148 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8; + +#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2)(*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + dst_type dst0, dst1; \ + \ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord; \ + coord_out.z = coord_out.z * 3; \ + coord_out.x = coord_out.z + 8; \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ + \ + half4 paramData_f16; \ + copy_type data0, data1, data2, dst; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + _viv_asm(COPY, data0, dst0, 16); \ + \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + _viv_asm(COPY, data1, dst0, 16); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_0_2x8); \ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni16BitsDataInterleave_1_2x8); \ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8) +RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8) + +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2) (*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + write_type dst0, dst1, dst2, dst3; \ + \ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 coord_out = coord; \ + coord_out.z = coord_out.z * 3; \ + coord_out.x = coord_out.z + 16; \ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\ + rMean * r_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \ + \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + \ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + \ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\ + gMean * g_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + \ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + \ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\ + bMean * b_scale * output_scale - output_zp, \ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + \ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_0_2x8); \ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_1_2x8); \ + VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uni8BitsDataInterleave_2_2x8); \ + VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16) +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16) diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_2.vx new file mode 100644 index 000000000..d43f82587 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_nhwc_2.vx @@ -0,0 +1,54 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8; +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8; + +__kernel void pre_process_rgb888_planar_sep_half_U8toU8_nhwc + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float r_scale, + int reverse, + float g_scale, + float b_scale + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_uchar16 src0, src1, src2; + + VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int4 coord; + coord.xy = coord_in.xy >> 1; + + coord.x = coord.x * 3; + coord.z = coord.x + 16; + + vxc_uchar16 dst0, dst1; + src0.lo = src0.s02468ace; + src0.hi = src1.s02468ace; + src1.lo = src2.s02468ace; + + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_0_2x8); + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_1_2x8); + VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uni8BitsDataInterleave_2_2x8); + + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx index c200019c3..5a343e708 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx @@ -17,6 +17,8 @@ _viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4; _viv_uniform VXC_512Bits uniExtractBtoF32_part2_4x4; _viv_uniform VXC_512Bits uniExtractBtoF32_part3_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform float4 param_data; +_viv_uniform float4 rgb_scale; #define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \ __kernel void pre_process_rgb_copy_U8to##dst_name \ @@ -30,9 +32,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \ @@ -46,10 +50,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - \ - f32Var *= outputScale; \ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ - bMean * f32Var - outputZP, f32Var); \ \ int4 
coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \ float4 tmp0, tmp1; \ @@ -57,8 +57,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.x; \ - tmp1 = tmp1 * paramData.w - paramData.x; \ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \ _viv_asm(CONV_RTE, result0, tmp0); \ _viv_asm(CONV_RTE, result1, tmp1); \ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -68,8 +68,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ coord_out.z = 1; \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.y; \ - tmp1 = tmp1 * paramData.w - paramData.y; \ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \ _viv_asm(CONV_RTE, result0, tmp0); \ _viv_asm(CONV_RTE, result1, tmp1); \ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -79,8 +79,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ coord_out.z = b_order; \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.z; \ - tmp1 = tmp1 * paramData.w - paramData.z; \ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \ _viv_asm(CONV_RTE, result0, tmp0); \ _viv_asm(CONV_RTE, result1, tmp1); \ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -102,9 +102,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ float rMean, \ float gMean, \ float bMean, \ - float f32Var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \ @@ -119,10 +121,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ coord.x += 16; \ VXC_ReadImage(src2, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - \ - f32Var *= outputScale; \ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \ - bMean * f32Var - outputZP, f32Var); \ \ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \ float4 tmp0, tmp1; \ @@ -130,15 +128,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.x; \ - tmp1 = tmp1 * paramData.w - paramData.x; \ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniExtractRtoF32_part2_4x4); \ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part3_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.x; \ - tmp1 = tmp1 * paramData.w - paramData.x; \ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -147,15 +145,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ coord_out.z = 1; \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.y; \ - tmp1 = tmp1 * paramData.w - paramData.y; \ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part2_4x4); \ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part3_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.y; \ - tmp1 = tmp1 * paramData.w - paramData.y; \ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -164,15 +162,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ coord_out.z = b_order; \ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.z; \ - tmp1 = tmp1 * paramData.w - paramData.z; \ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part2_4x4); \ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part3_4x4); \ - tmp0 = tmp0 * paramData.w - paramData.z; \ - tmp1 = tmp1 * paramData.w - paramData.z; \ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx index 25f981a11..3a91a3559 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx @@ -49,9 +49,11 @@ __kernel void pre_process_yuv420_copy_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 pos = 
(int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \ @@ -110,17 +112,23 @@ __kernel void pre_process_yuv420_copy_##name \ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ \ - var *= output_scale; \ - float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \ - rMean * var - output_zp, var); \ + float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp,\ + gMean * g_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \ half4 paramData_f16; \ _viv_asm(CONV, paramData_f16, paramData); \ \ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \ + \ + paramData.w = g_scale * output_scale; \ + _viv_asm(CONV, paramData_f16, paramData); \ \ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \ + \ + paramData.w = r_scale * output_scale; \ + _viv_asm(CONV, paramData_f16, paramData); \ \ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \ @@ -150,9 +158,11 @@ __kernel void pre_process_yuv420_copy_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \ @@ -202,18 +212,22 @@ __kernel void pre_process_yuv420_copy_##name \ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ \ - var *= output_scale; \ - float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \ - rMean * var - output_zp, var); \ + float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp, \ + gMean * g_scale * output_scale - output_zp, \ + rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \ half4 paramData_f16; \ _viv_asm(CONV, paramData_f16, paramData); \ \ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \ VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \ \ + paramData.w = g_scale * output_scale; \ + _viv_asm(CONV, paramData_f16, paramData); \ VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \ VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \ \ + paramData.w = r_scale * output_scale; \ + _viv_asm(CONV, paramData_f16, paramData); \ VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \ VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \ \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx index 40db13719..99a64459e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx @@ -48,9 +48,11 @@ __kernel void pre_process_yuv420_scale_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 gidx = get_global_id(0); \ @@ -199,7 +201,7 @@ __kernel void pre_process_yuv420_scale_##name \ float4 tmpDst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - bMean) * var; \ + tmpDst = (tmpDst - bMean) * b_scale; \ dstPos.z = bOrder; \ result = convert_int4_rte(tmpDst * output_scale + output_zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -213,7 +215,7 @@ __kernel void pre_process_yuv420_scale_##name \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - gMean) * var; \ + tmpDst = (tmpDst - gMean) * g_scale; \ dstPos.z = 1; \ result = convert_int4_rte(tmpDst * output_scale + output_zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ @@ -227,7 +229,7 @@ __kernel void pre_process_yuv420_scale_##name \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - rMean) * var; \ + tmpDst = (tmpDst - rMean) * r_scale; \ dstPos.z = rOrder; \ result = convert_int4_rte(tmpDst * output_scale + output_zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx index 7bfa6d112..676a8485c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx @@ -48,9 +48,11 @@ __kernel void pre_process_yuv420_scale_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ - int trans \ + int trans, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 gidx = get_global_id(0); \ @@ -201,7 +203,7 @@ __kernel void pre_process_yuv420_scale_##name \ float4 tmpDst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - bMean) * var; \ + tmpDst = (tmpDst - bMean) * b_scale; \ dstPos.z = bOrder; \ tmpDst = tmpDst * output_scale + output_zp; \ _viv_asm(CONV_RTE, tmpVal, tmpDst); \ @@ -217,7 +219,7 @@ __kernel void pre_process_yuv420_scale_##name \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - gMean) * var; \ + tmpDst = (tmpDst - gMean) * g_scale; \ dstPos.z = 1; \ tmpDst = tmpDst * output_scale + output_zp; \ _viv_asm(CONV_RTE, tmpVal, tmpDst); \ @@ -233,7 +235,7 @@ __kernel void 
pre_process_yuv420_scale_##name \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - rMean) * var; \ + tmpDst = (tmpDst - rMean) * r_scale; \ dstPos.z = rOrder; \ tmpDst = tmpDst * output_scale + output_zp; \ _viv_asm(CONV_RTE, tmpVal, tmpDst); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx index eed071587..0006e4a71 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx @@ -3,7 +3,9 @@ _viv_uniform int bOrder; _viv_uniform int rOrder; -_viv_uniform float outputScaleVar; +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; _viv_uniform float bMeanScaleVarZp; _viv_uniform float gMeanScaleVarZp; _viv_uniform float rMeanScaleVarZp; @@ -27,10 +29,12 @@ __kernel void pre_process_yuv422_copy_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ int trans, \ - int yuv422_type \ + int yuv422_type, \ + float g_scale, \ + float b_scale \ ) \ { \ int gidx = get_global_id(0); \ @@ -60,21 +64,21 @@ __kernel void pre_process_yuv422_copy_##name \ dst_type dst0; \ save_type dst; \ int4 dstPos = (int4)(gidx, gidy, 0, 0); \ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstB); \ dstPos.z = bOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstG); \ dstPos.z = 1; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstR); \ dstPos.z = rOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx index 78546d991..9fb80e504 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx @@ -3,7 +3,10 @@ _viv_uniform int bOrder; _viv_uniform int rOrder; -_viv_uniform float outputScaleVar; +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; + _viv_uniform float bMeanScaleVarZp; _viv_uniform float gMeanScaleVarZp; _viv_uniform float rMeanScaleVarZp; @@ -33,10 +36,12 @@ __kernel void pre_process_yuv422_scale_##name \ float rMean, \ float gMean, \ float bMean, \ - float var, \ + float r_scale, \ int reverse_channel, \ int trans, \ - int yuv422_type \ + int yuv422_type, \ + float g_scale, \ + float b_scale \ ) \ { \ int4 gidx = 
get_global_id(0); \ @@ -108,21 +113,21 @@ __kernel void pre_process_yuv422_scale_##name \ dst_type dst0; \ save_type dst; \ int4 dstPos = (int4)(gidx.x, gidy, 0, 0); \ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstB); \ dstPos.z = bOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstG); \ dstPos.z = 1; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, dst, dst0, copy_bytes); \ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ _viv_asm(CONV_RTE, result, tmpDstR); \ dstPos.z = rOrder; \ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx index 05f9973c3..3a6a3c50f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx @@ -46,9 +46,11 @@ __kernel void pre_process_yuv444_copy_U8toU8( float rMean, float gMean, float bMean, - float var, + float r_scale, int reverse_channel, - int trans + int trans, + float g_scale, + float b_scale ) { int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset)); @@ -107,18 +109,23 @@ __kernel void pre_process_yuv444_copy_U8toU8( VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - var *= outputScale; - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ - rMean * var - zp, var); + float4 paramData = (float4)(bMean * b_scale * outputScale - zp, gMean * g_scale * outputScale - zp,\ + rMean * r_scale * outputScale - zp, b_scale * outputScale); half4 paramData_f16; _viv_asm(CONV, paramData_f16, paramData); VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + paramData.w = g_scale * outputScale; + _viv_asm(CONV, paramData_f16, paramData); + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + paramData.w = r_scale * outputScale; + _viv_asm(CONV, paramData_f16, paramData); + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); @@ -142,9 +149,11 @@ __kernel void pre_process_yuv444_copy_U8toF16( float rMean, float gMean, float bMean, - float var, + float r_scale, int reverse_channel, - int trans + int trans, + float g_scale, + float b_scale ) { int2 
pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset)); @@ -204,17 +213,23 @@ __kernel void pre_process_yuv444_copy_U8toF16( VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - float4 paramData = (float4)(bMean * var, gMean * var,\ - rMean * var, var); + float4 paramData = (float4)(bMean * b_scale * outputScale, gMean * g_scale * outputScale,\ + rMean * r_scale * outputScale, b_scale * outputScale); half4 paramData_f16; _viv_asm(CONV, paramData_f16, paramData); VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + paramData.w = g_scale * outputScale; + _viv_asm(CONV, paramData_f16, paramData); + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + paramData.w = r_scale * outputScale; + _viv_asm(CONV, paramData_f16, paramData); + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx index a195750c4..9b4a418e2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale.vx @@ -39,7 +39,8 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \ __read_only image2d_t y_img, __read_only image2d_t u_img, \ __read_only image2d_t v_img, __write_only image2d_array_t output, \ global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \ + float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans, \ + float g_scale, float b_scale) \ { \ int4 gidx = get_global_id(0); \ int gidy = get_global_id(1); \ @@ -151,7 +152,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \ float4 tmpDst; \ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - bMean) * var; \ + tmpDst = (tmpDst - bMean) * b_scale; \ dstPos.z = bOrder; \ result = convert_int4_rte(tmpDst * outputScale + zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ @@ -165,7 +166,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - gMean) * var; \ + tmpDst = (tmpDst - gMean) * g_scale; \ dstPos.z = 1; \ result = convert_int4_rte(tmpDst * outputScale + zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ @@ -179,7 +180,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \ temp2 = fx * tmpData0 + tmpData1; \ result = fy * temp2 + (temp1 << 10); \ VXC_DP4x4(tmpDst, result, tmpV, 
VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ - tmpDst = (tmpDst - rMean) * var; \ + tmpDst = (tmpDst - rMean) * r_scale; \ dstPos.z = rOrder; \ result = convert_int4_rte(tmpDst * outputScale + zp); \ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx index c5e706d9a..99325d87d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_scale_fp16.vx @@ -37,7 +37,8 @@ __kernel void pre_process_yuv444_scale_U8toF16( __read_only image2d_t y_img, __read_only image2d_t u_img, __read_only image2d_t v_img, __write_only image2d_array_t output, global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) + float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans, + float g_scale, float b_scale) { int4 gidx = get_global_id(0); int gidy = get_global_id(1); @@ -157,7 +158,7 @@ __kernel void pre_process_yuv444_scale_U8toF16( float4 tmpDst; int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; + tmpDst = (tmpDst - bMean) * b_scale; dstPos.z = bOrder; _viv_asm(CONV, hDst, tmpDst); VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); @@ -172,7 +173,7 @@ __kernel void pre_process_yuv444_scale_U8toF16( temp2 = fx * tmpData0 + tmpData1; result = fy * temp2 + (temp1 << 10); VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; + tmpDst = (tmpDst - gMean) * g_scale; dstPos.z = 1; _viv_asm(CONV, hDst, tmpDst); VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); @@ -187,7 +188,7 @@ __kernel void pre_process_yuv444_scale_U8toF16( temp2 = fx * tmpData0 + tmpData1; result = fy * temp2 + (temp1 << 10); VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - rMean) * var; + tmpDst = (tmpDst - rMean) * r_scale; dstPos.z = rOrder; _viv_asm(CONV, hDst, tmpDst); VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx index 80840646b..750eadaf1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx @@ -2,7 +2,6 @@ _viv_uniform VXC_512Bits uniExtact8Bit_2x8; _viv_uniform VXC_512Bits uniFp16toFp32_4x4; -_viv_uniform VXC_512Bits uniRightSubLeft_4x4; _viv_uniform VXC_512Bits uniExtactHalf8_2x8; _viv_uniform float scale_x; _viv_uniform int out_height; @@ -63,8 +62,10 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN _viv_asm(COPY, src_half, src, 16); - VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4); - VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4); + VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), + uniConvertFp2FP32_left_4x4); + VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertFp2FP32_right_4x4); right4 -= left4; float4 dst4 = right4 * x_lerp + left4; @@ -129,8 +130,10 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN _viv_asm(COPY, src_half, src, 16); - VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4); - VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4); + VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertFp2FP32_left_4x4); + VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertFp2FP32_right_4x4); right4 -= left4; float4 dst4 = right4 * x_lerp + left4; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_fp.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_fp.vx new file mode 100644 index 000000000..a60e9b8e9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_fp.vx @@ -0,0 +1,307 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int update_width; +_viv_uniform int output_width; +_viv_uniform int ref_stride; +_viv_uniform int output_stride; + +_viv_uniform int4 coord_stride; +_viv_uniform int4 coord_stride1; +_viv_uniform float inout_scale; +_viv_uniform float output_zp; + +_viv_uniform VXC_512Bits uniConvertFp16ToFp32_4x4; + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +inline void AtomicAdd_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal + operand; + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +__kernel void scatter_nd_update_update_F16( + __read_only image2d_t index, + __read_only image2d_t update, + image2d_t temp_buf_float, + image2d_t link_buffer0, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(update, 2); + Image img3 = create_image_from_image2d(temp_buf_float, 4); + __global int* index_ptr = (__global int*)img1.ptr; + __global short* update_ptr = (__global short*)img2.ptr; + __global float* output_ptr = (__global float*)img3.ptr; + half src; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + short tmpData = update_ptr[gidy * update_width + gidx]; + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + int loc = idx * output_width + gidx; + _viv_asm(COPY, src, tmpData, 4); + float data; + _viv_asm(CONV, data, src); + AtomicAdd_float(output_ptr + loc, data); +} + +__kernel void scatter_nd_update_update_F16_4X( + __read_only image2d_t index, + __read_only image2d_t update, + image2d_t temp_buf_float, + image2d_t link_buffer0, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(update, 2); + Image img3 = create_image_from_image2d(temp_buf_float, 4); + __global int* index_ptr = (__global int*)img1.ptr; + __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr; + __global float* output_ptr = (__global float*)img3.ptr; + vxc_half4 src; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + vxc_short4 tmpData = update_ptr[gidy * update_width + gidx]; + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3); + + _viv_asm(COPY, src, tmpData, 8); + float4 data; + VXC_DP4x4(data, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), + uniConvertFp16ToFp32_4x4); + AtomicAdd_float(output_ptr + loc.x, data.x); + AtomicAdd_float(output_ptr + loc.y, data.y); + AtomicAdd_float(output_ptr + loc.z, data.z); + AtomicAdd_float(output_ptr + loc.w, data.w); +} + +__kernel void scatter_nd_update_update_BF16( + __read_only image2d_t index, + __read_only image2d_t update, + image2d_t temp_buf_float, + image2d_t link_buffer0, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(update, 2); + Image img3 = create_image_from_image2d(temp_buf_float, 4); + __global int* index_ptr = (__global int*)img1.ptr; + __global short* update_ptr = (__global short*)img2.ptr; + __global float* output_ptr = (__global float*)img3.ptr; + float data; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + short tmpData = update_ptr[gidy * update_width + gidx]; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 src0, src1; + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + int loc = idx * output_width + gidx; + _viv_asm(COPY, src0, tmpData, 4); + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data, src1, 4); + AtomicAdd_float(output_ptr + loc, data); +} + +__kernel void scatter_nd_update_update_BF16_4X( + __read_only image2d_t index, + __read_only image2d_t update, + image2d_t temp_buf_float, + image2d_t link_buffer0, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(update, 2); + Image img3 = create_image_from_image2d(temp_buf_float, 4); + __global int* index_ptr = (__global int*)img1.ptr; + __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr; + __global float* output_ptr = (__global float*)img3.ptr; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + vxc_short4 tmpData = update_ptr[gidy * update_width + gidx]; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_short8 src0, src1; + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3); + + _viv_asm(COPY, src0, tmpData, 8); + float4 data; + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data, src1, 16); + AtomicAdd_float(output_ptr + loc.x, data.x); + AtomicAdd_float(output_ptr + loc.y, data.y); + AtomicAdd_float(output_ptr + loc.z, data.z); + AtomicAdd_float(output_ptr + loc.w, data.w); +} + +#define SCATTER_ND_UPDATE_REF_FP16(type0, type1, ptr_type) \ +__kernel void scatter_nd_update_ref_##type0##to##type1( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + __read_only image2d_t temp_buf_int, \ + image2d_t temp_ref, \ + image2d_t link_buffer0, \ + image2d_t link_buffer1, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \ + Image img3 = create_image_from_image2d(temp_ref, 2); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \ + __global short* ref_ptr = (__global short*)img3.ptr; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_stride + gidx; \ + float4 tmpData; \ + tmpData.x = convert_float(acc_ptr[loc]) * inout_scale + output_zp; \ + half4 data; \ + short tmpDst; \ + _viv_asm(CONV, data, tmpData); \ + _viv_asm(COPY, tmpDst, data, 4); \ + ref_ptr[loc] = tmpDst; \ +} +SCATTER_ND_UPDATE_REF_FP16(I32, F16, int) +SCATTER_ND_UPDATE_REF_FP16(F32, F16, float) + +#define SCATTER_ND_UPDATE_REF_FP16_4X(type0, type1, ptr_type) \ +__kernel void scatter_nd_update_ref_##type0##to##type1##_4X( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + __read_only image2d_t temp_buf_int, \ + image2d_t temp_ref, \ + image2d_t link_buffer0, \ + image2d_t link_buffer1, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \ + Image img3 = create_image_from_image2d(temp_ref, 2); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \ + __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \ + int loc = idx * output_stride + gidx; \ + float4 tmpVal = tmpData * inout_scale + output_zp; \ + half4 data; \ + vxc_short8 tmpDst; \ + _viv_asm(CONV, data, tmpVal); \ + _viv_asm(COPY, tmpDst, data, 16); \ + ref_ptr[loc] = tmpDst.s0246; \ +} +SCATTER_ND_UPDATE_REF_FP16_4X(I32, F16, int) +SCATTER_ND_UPDATE_REF_FP16_4X(F32, F16, float) + +__kernel void scatter_nd_update_ref_F32toBF16( + __read_only image2d_t index, + __read_only image2d_t update, + __read_only image2d_t temp_buf_int, + image2d_t temp_ref, + image2d_t link_buffer0, + image2d_t link_buffer1, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(temp_buf_int, 4); + Image img3 = create_image_from_image2d(temp_ref, 2); + __global int* index_ptr = (__global int*)img1.ptr; + __global float* acc_ptr = (__global float*)img2.ptr; + __global short* ref_ptr = (__global short*)img3.ptr; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + int loc = idx * output_stride + gidx; + float tmpData; + tmpData = acc_ptr[loc]; + vxc_ushort8 src0, src2; + _viv_asm(COPY, src0, tmpData, 4); + VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + ref_ptr[loc] = src2.x; +} + +__kernel void scatter_nd_update_ref_F32toBF16_4X( + __read_only image2d_t index, + __read_only image2d_t update, + __read_only image2d_t temp_buf_int, + image2d_t temp_ref, + image2d_t link_buffer0, + image2d_t link_buffer1, + int width, int area, int vol, int val4, + int val5, int val6, int val7, int coord_dim) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + Image img1 = create_image_from_image2d(index, 4); + Image img2 = create_image_from_image2d(temp_buf_int, 4); + Image img3 = create_image_from_image2d(temp_ref, 2); + __global int* index_ptr = (__global int*)img1.ptr; + __global float* acc_ptr = (__global float*)img2.ptr; + __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr; + + int4 indice = vload4(0, index_ptr + gidy * coord_dim); + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; + float4 tmpData = vload4(gidx, acc_ptr + idx * ref_stride); + int loc = idx * output_stride + gidx; + vxc_short8 src0, src2; + _viv_asm(COPY, src0, tmpData, 16); + VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + ref_ptr[loc] = src2.s0123; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx new file mode 100644 index 000000000..2284f49ce --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_qint.vx @@ -0,0 +1,263 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; +_viv_uniform int update_width; +_viv_uniform int output_width; +_viv_uniform int ref_stride; +_viv_uniform int output_stride; +_viv_uniform int2 multAndoutZP0; + +_viv_uniform int4 coord_stride; +_viv_uniform int4 coord_stride1; + +_viv_uniform float output_zp; +_viv_uniform int input_zp; +_viv_uniform float input_scale; +_viv_uniform float inout_scale; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +#define SCATTER_RESET(name0, name1, ptr0, ptr1, type0, type1, len0, len1, size0, size1, ptr2, ptr3, len3) \ +__kernel void scatter_nd_update_reset_##name0##to##name1( \ + __read_only image2d_t input_ref, \ + image2d_t temp_ref, \ + image2d_t temp_buf_int, \ + int length, int res) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(input_ref, size0); \ + Image img2 = create_image_from_image2d(temp_ref, size1); \ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \ + __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \ + __global ptr1* output_ptr = (__global ptr1*)img2.ptr; \ + __global int* tmp_update_ptr = (__global int*)img3.ptr; \ + ptr0 tmpData = input_ptr[gidx]; \ + int4 zeros = (int4)(0); \ + int loc2 = gidx * 8; \ + type0 src; \ + type1 tmpDst; \ + ptr1 dst; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + _viv_asm(COPY, src, tmpData, len0); \ + 
VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst, tmpDst, len1); \ + output_ptr[gidx] = dst; \ + vstore4(zeros, 0, tmp_update_ptr + loc2); \ + vstore4(zeros, 1, tmp_update_ptr + loc2); \ + if(gidx < res) \ + { \ + __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \ + __global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \ + ptr2 tmpData1 = input_ptr1[length + gidx]; \ + ptr3 dst1; \ + dst1 ^= dst1; \ + tmp_update_ptr[length + gidx] = 0; \ + _viv_asm(COPY, src, tmpData1, 4); \ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst1, tmpDst, len3); \ + output_ptr1[length + gidx] = dst1; \ + } \ +} +SCATTER_RESET(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, 8, 8, 1, 1, uchar, uchar, 1) +SCATTER_RESET(I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, 8, 8, 1, 1, char, char, 1) +SCATTER_RESET(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, 16, 16, 2, 2, short, short, 2) +SCATTER_RESET(F16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_half8, 16, 16, 2, 2, short, short, 2) +SCATTER_RESET(U8, F16, vxc_uchar8, vxc_short8, vxc_uchar8, vxc_half8, 8, 16, 1, 2, uchar, short, 2) +SCATTER_RESET(I8, F16, vxc_char8, vxc_short8, vxc_char8, vxc_half8, 8, 16, 1, 2, char, short, 2) +SCATTER_RESET(I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, 16, 8, 2, 1, short, short, 2) +SCATTER_RESET(F16, U8, vxc_short8, vxc_uchar8, vxc_half8, vxc_uchar8, 16, 8, 2, 1, short, uchar, 1) + +__kernel void scatter_nd_update_reset_BF16toBF16( + __read_only image2d_t input_ref, + image2d_t temp_ref, + image2d_t temp_buf_int) +{ + int gidx = get_global_id(0); + Image img1 = create_image_from_image2d(input_ref, 2); + Image img2 = create_image_from_image2d(temp_ref, 2); + Image img3 = create_image_from_image2d(temp_buf_int, 4); + __global vxc_short8* input_ptr = (__global vxc_short8*)img1.ptr; + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr; + __global float* tmp_update_ptr = (__global float*)img3.ptr; + vxc_short8 src = input_ptr[gidx]; + float4 zeros = (float4)(0, 0, 0, 0); + int loc2 = gidx * 8; + output_ptr[gidx] = src; + vstore4(zeros, 0, tmp_update_ptr + loc2); + vstore4(zeros, 1, tmp_update_ptr + loc2); +} + +#define SCATTER_ND_UPDATE_QINT(src0_type, data_type, ptr_type, element_size) \ +__kernel void scatter_nd_update_update_##src0_type( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_int, \ + image2d_t link_buffer0, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, element_size); \ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \ + __global int* output_ptr = (__global int*)img3.ptr; \ + data_type src; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_width + gidx; \ + _viv_asm(COPY, src, tmpData, 4); \ + vxc_int4 data; \ + short zp = input_zp; \ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + atomic_add(output_ptr + loc, data.x); \ +} +SCATTER_ND_UPDATE_QINT(U8, vxc_uchar8, uchar, 1) +SCATTER_ND_UPDATE_QINT(I8, vxc_char8, char, 1) +SCATTER_ND_UPDATE_QINT(I16, vxc_short8, short, 2) + +#define SCATTER_ND_UPDATE_QINT_4X(src0_type, data_type, ptr_type, element_size) \ +__kernel void scatter_nd_update_update_##src0_type##_4X( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_int, \ + image2d_t link_buffer0, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, element_size); \ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \ + __global int* output_ptr = (__global int*)img3.ptr; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + ptr_type src = update_ptr[gidy * update_width + gidx]; \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3); \ + vxc_int4 data; \ + short zp = input_zp; \ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + atomic_add(output_ptr + loc.x, data.x); \ + atomic_add(output_ptr + loc.y, data.y); \ + atomic_add(output_ptr + loc.z, data.z); \ + atomic_add(output_ptr + loc.w, data.w); \ +} +SCATTER_ND_UPDATE_QINT_4X(U8, vxc_uchar8, vxc_uchar4, 1) +SCATTER_ND_UPDATE_QINT_4X(I8, vxc_char8, vxc_char4, 1) +SCATTER_ND_UPDATE_QINT_4X(I16, vxc_short8, vxc_short4, 2) + +#define SCATTER_ND_UPDATE_REF(src0_type, dst_type, data_type, ptr_type, element_size) \ +__kernel void scatter_nd_update_ref_##src0_type##to##dst_type( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + __read_only image2d_t temp_buf_int, \ + image2d_t temp_ref, \ + image2d_t link_buffer0, \ + image2d_t link_buffer1, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \ + Image img3 = create_image_from_image2d(temp_ref, element_size); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global int* acc_ptr = (__global int*)img2.ptr; \ + __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \ + data_type dst; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_stride + gidx; \ + int tmpData = acc_ptr[loc]; \ + int4 data; \ + data.x = convert_int_rte(tmpData * inout_scale + output_zp); \ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + ref_ptr[loc] = dst.x; \ +} +SCATTER_ND_UPDATE_REF(I32, U8, vxc_uchar8, uchar, 1) +SCATTER_ND_UPDATE_REF(I32, I8, vxc_char8, char, 1) +SCATTER_ND_UPDATE_REF(I32, I16, vxc_short8, short, 2) + +#define SCATTER_ND_UPDATE_REF_4X(src0_type, dst_type, data_type, ptr_type, element_size) \ +__kernel void scatter_nd_update_ref_##src0_type##to##dst_type##_4X( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + __read_only image2d_t temp_buf_int, \ + image2d_t temp_ref, \ + image2d_t link_buffer0, \ + image2d_t link_buffer1, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \ + Image img3 = create_image_from_image2d(temp_ref, element_size); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global int* acc_ptr = (__global int*)img2.ptr; \ + __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \ + data_type dst; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \ + int loc = idx * output_stride + gidx; \ + int4 data = convert_int4_rte(tmpData * inout_scale + output_zp); \ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + ref_ptr[loc] = dst.xyzw; \ +} +SCATTER_ND_UPDATE_REF_4X(I32, U8, vxc_uchar8, vxc_uchar4, 1) +SCATTER_ND_UPDATE_REF_4X(I32, I8, vxc_char8, vxc_char4, 1) +SCATTER_ND_UPDATE_REF_4X(I32, I16, vxc_short8, vxc_short4, 2) + +#define SCATTER_ND_UPDATE_COPY(src0_type, ptr_type, element_size, ptr_type1) \ +__kernel void scatter_nd_update_copy_##src0_type( \ + __read_only image2d_t temp_ref, \ + __read_only image2d_t link_buffer1, \ + image2d_t output, \ + int length, int res) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(temp_ref, element_size); \ + Image img2 = create_image_from_image2d(output, element_size); \ + __global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \ + __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \ + output_ptr[gidx] = input_ptr[gidx]; \ + if(gidx < res) \ + { \ + __global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \ + __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \ + output_ptr1[length + gidx] = input_ptr1[length + gidx]; \ + } \ +} +SCATTER_ND_UPDATE_COPY(U8, vxc_uchar8, 1, uchar) +SCATTER_ND_UPDATE_COPY(I8, vxc_char8, 1, char) +SCATTER_ND_UPDATE_COPY(I16, vxc_short8, 2, short) +SCATTER_ND_UPDATE_COPY(F16, vxc_short8, 2, short) +SCATTER_ND_UPDATE_COPY(BF16, vxc_short8, 2, short) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx b/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx 
index 319348593..3c770f373 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx @@ -21,7 +21,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name##_2D( \ short zp = inputZP; \ VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvert1stUint8SubZpToFp32_4x4); \ - int index = convert_int_rte(tmpData.s0 * input_scale); \ + int index = convert_int_rtz(tmpData.s0 * input_scale); \ int4 data; \ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \ write_type dst; \ @@ -47,7 +47,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name( \ short zp = inputZP; \ VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniConvert1stUint8SubZpToFp32_4x4); \ - int index = convert_int_rte(tmpData.s0 * input_scale); \ + int index = convert_int_rtz(tmpData.s0 * input_scale); \ int4 data; \ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \ write_type dst; \ @@ -73,7 +73,7 @@ __kernel void sequence_mask_F16toF16_2D( float4 tmpData; VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); - int index = convert_int_rte(tmpData.x); + int index = convert_int_rtz(tmpData.x); float4 data; data = outIdx < index? outputVal1 : convert_float(output_ZP); vxc_short8 dst; @@ -96,7 +96,7 @@ __kernel void sequence_mask_F16toF16( float4 tmpData; VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); - int index = convert_int_rte(tmpData.x); + int index = convert_int_rtz(tmpData.x); float4 data; data = outIdx < index? outputVal1 : convert_float(output_ZP); vxc_short8 dst; @@ -119,7 +119,7 @@ __kernel void sequence_mask_F16toU8_2D( float4 tmpData; VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); - int index = convert_int_rte(tmpData.x); + int index = convert_int_rtz(tmpData.x); int4 data; data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; vxc_uchar16 dst; @@ -140,7 +140,7 @@ __kernel void sequence_mask_F16toU8( float4 tmpData; VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ UniFP16toFP32Lo4_dp4x4); - int index = convert_int_rte(tmpData.x); + int index = convert_int_rtz(tmpData.x); int4 data; data = outIdx < index? 
convert_int_rte(outputVal1) : output_ZP; vxc_uchar16 dst; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_box.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_box.vx new file mode 100644 index 000000000..6e513f126 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_box.vx @@ -0,0 +1,103 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) + +float4 sigmoid4(float4 x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} + +float4 exp4(float4 x) +{ + x *= logE; + return exp2(x); +} + +#define CONST0 (1.0499999523162842f) +#define CONST1 (0.0250000003725290f) + +_viv_uniform VXC_512Bits uniDatatoFloat32_0_4x4; +_viv_uniform VXC_512Bits uniDatatoFloat32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDataTranspose_0_2x8; +_viv_uniform VXC_512Bits uniDataTranspose_1_2x8; +_viv_uniform float input0_scale; +_viv_uniform float input0_tail; +_viv_uniform float input1_scale; +_viv_uniform float input1_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; +_viv_uniform float CONST2; +__kernel void tiny_yolov4_postprocess_box_U8_U8toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float bias_0, + float bias_1 + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0)); + + vxc_uchar16 src0, src1, src2, src3; + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(src2, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input1, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord.zw += (int2)(2, 3); + + float4 data0, data1, data2, data3, data; + VXC_DP4x4(data0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4); + data0 = data0 * input0_scale + input0_tail; + data0 = sigmoid4(data0); + data0 = data0 * CONST0 - CONST1; + + VXC_DP4x4(data, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4); + data = data * input1_scale + input1_tail; + data0 = data0 * CONST2 + data * CONST2; + + VXC_DP4x4(data1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4); + data1 = data1 * input0_scale + input0_tail; + data1 = sigmoid4(data1); + data1 = data1 * CONST0 - CONST1; + + VXC_DP4x4(data, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4); + data = data * input1_scale + input1_tail; + data1 = data1 * CONST2 + data * CONST2; + + VXC_DP4x4(data2, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4); + data2 = data2 * input0_scale + input0_tail; + data2 = exp4(data2) * bias_0; + + VXC_DP4x4(data3, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4); + data3 = data3 * input0_scale + input0_tail; + data3 = exp4(data3) * bias_1; + + data0 = data0 * output_scale + output_zp; + data1 = data1 * output_scale + output_zp; + + int4 dst0 = 
convert_int4_rte(data0); + int4 dst1 = convert_int4_rte(data1); + VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + data2 = data2 * output_scale + output_zp; + data3 = data3 * output_scale + output_zp; + dst0 = convert_int4_rte(data2); + dst1 = convert_int4_rte(data3); + VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); + + VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniDataTranspose_0_2x8); + VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniDataTranspose_1_2x8); + + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_confidence.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_confidence.vx new file mode 100644 index 000000000..0a41c0e2c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tiny_yolov4_postprocess_confidence.vx @@ -0,0 +1,54 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8TimesU8_0_4x4; +_viv_uniform VXC_512Bits uniU8PlusU8_trans_0_2x8; +_viv_uniform VXC_512Bits uniU8PlusU8_trans_1_2x8; +_viv_uniform VXC_512Bits uniU16TimesMultiplier_PostShift_2x8; +_viv_uniform int output_zp; + +__kernel void tiny_yolov4_postprocess_conf_U8toU8 +( + __read_only image2d_t input, + __write_only image2d_t output +) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, get_global_id(0)); + + vxc_uchar16 src0, src1, src2, src3, src4; + + VXC_ReadImage(src0, input, coord.wz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + vxc_ushort8 data0, data1; + + VXC_ReadImage(src1, input, coord.wy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input, coord.wy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input, coord.wy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src4, input, coord.wy, VXC_5BITOFFSET_XY(0, 4), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + + coord.zw = coord.xx + (int2)(2, 3); + + VXC_DP4x4(data0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4); + VXC_DP4x4(data0, src0, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4); + VXC_DP4x4(data1, src0, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4); + VXC_DP4x4(data1, src0, src4, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4); + + VXC_DP2x8(src1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uniU16TimesMultiplier_PostShift_2x8); + VXC_DP2x8(src1, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), + uniU16TimesMultiplier_PostShift_2x8); + + uchar zp; + _viv_asm(COPY, zp, output_zp, 2); + + VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), + uniU8PlusU8_trans_0_2x8); + VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), + uniU8PlusU8_trans_1_2x8); + + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord.x ++; + VXC_WriteImage(output, coord.yx, 
src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index f528ccb35..5421a5aba 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -4902,6 +4902,710 @@ __kernel void cumsum_BF16toBF16_axis0_2D(\n\ }\n\ "; /* end of cumsum_bf16_vx*/ +static const char cumsum_ex_rev_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4;\n\ +\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_ex_rev_F16toF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniSumHorzRevF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.x < width - 8;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + coord_out.x = coord.x + 1;\n\ + coord.x += 8;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.x = width - 8;\n\ + coord_out.x = width - 1;\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.x > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x = coord.x - 1;\n\ + coord.x -= 8;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniSumHorzRevF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_QINT_EX_REV_AXIS0(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \\\n\ + short zp = (short)input_zp; \\\n\ + \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); 
\\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + for(coord.x = -1; coord.x < width - 8;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.x = coord.x + 1; \\\n\ + coord.x += 8; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + for(coord.x = width - 7; coord.x > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.x = coord.x - 1; \\\n\ + coord.x -= 8; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_QINT_EX_REV_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_QINT_EX_REV_AXIS0(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_QINT_EX_REV_AXIS0(I16, I16, vxc_short8, vxc_short8)\n\ +"; /* end of cumsum_ex_rev_axis0_vx*/ + +static const char cumsum_ex_rev_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void 
cumsum_ex_rev_F16toF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + dst ^= dst;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.y < height - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + dst ^= dst;\n\ + coord.y = height - 1;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y--;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_EX_REV_AXIS1(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = 
convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y < height - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + coord.y = height - 1; \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(height - coord.y) * 
in_out_zp_scale + output_zp; \\\n\ + coord.y--; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_EX_REV_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_ex_rev_I16toI16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y < height - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.y = height - 1;\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ + coord.y--;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_ex_rev_axis1_vx*/ + +static const char cumsum_ex_rev_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int channel;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_ex_rev_F16toF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(rev && exclusive == 0)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(rev == 0 && exclusive)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z < channel - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(rev && exclusive)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + coord.z = channel - 
1;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z--;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_EX_REV_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + if(rev && exclusive == 0) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z < channel - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(rev && exclusive) \\\n\ + { \\\n\ + coord.z = channel - 1; \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\ + coord.z--; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_EX_REV_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_ex_rev_I16toI16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z < channel - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.z = channel - 1;\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ + coord.z--;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_ex_rev_axis2_vx*/ + static const char cumsum_f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ @@ -5080,6 +5784,138 @@ __kernel void 
cumsum_F16to##out_name##_axis0_2D( \\\n\ CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16)\n\ CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8)\n\ CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_F16to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z < channel - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + coord.z = channel - 1; \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z--; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_F16to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ 
+ int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y < height - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + coord.y = height - 1; \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y--; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ "; /* end of cumsum_f16_u8_vx*/ static const char custom_softmax_vx[] = "/*\n\ @@ -5509,15 +6345,13 @@ __kernel void custom_warp_affine_bilinear_U8toU8\n\ }\n\ "; /* end of custom_warp_affine_vx*/ -static const char custom_warp_perspective_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +static const char custom_warp_affine_rgb_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float4 matrix0;\n\ -_viv_uniform float4 matrix1;\n\ -_viv_uniform float4 matrix2;\n\ -_viv_uniform float4 matrix4;\n\ -__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D\n\ +_viv_uniform float2 matrix1;\n\ +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb_2D\n\ (\n\ __read_only image2d_array_t input,\n\ 
__write_only image2d_array_t output,\n\ @@ -5526,53 +6360,38 @@ __kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D\n\ float _m2,\n\ float _m3,\n\ float _m4,\n\ - float _m5,\n\ - float _m6,\n\ - float _m7,\n\ - float _m8\n\ + float _m5\n\ )\n\ {\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ \n\ - float4 coord_f0 = convert_float4(coord_in);\n\ -\n\ - float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ - z0.zw = z0.zw + 2 * matrix1.z;\n\ - float4 z1 = z0 + 4 * matrix1.z;\n\ -\n\ - z0 = 1.0f / z0;\n\ - z1 = 1.0f / z1;\n\ + float4 coord_f = convert_float4(coord_in);\n\ \n\ - coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ - float4 coord_f = coord_f0 * z0.xxyy;\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ \n\ - coord_in = convert_int4(coord_f);\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ \n\ vxc_uchar16 dst;\n\ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z0.zzww;\n\ - coord_in = convert_int4(coord_f);\n\ + coord_in.x = coord_in.x + 1;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = coord_in.x + 1;\n\ VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +\n\ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.xxyy;\n\ - coord_in = convert_int4(coord_f);\n\ - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z = coord_in.z + 1;\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z = coord_in.z + 1;\n\ VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.zzww;\n\ - coord_in = convert_int4(coord_f);\n\ - VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ -\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ +__kernel void custom_warp_affine_bilinear_U8toU8_rgb_2D\n\ (\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -5581,32 +6400,30 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ float _m2,\n\ float _m3,\n\ float _m4,\n\ - float _m5,\n\ - float _m6,\n\ - float _m7,\n\ - float _m8\n\ + float _m5\n\ )\n\ {\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1));\n\ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ \n\ - float4 coord_f0 = convert_float4(coord_in);\n\ 
+ float4 coord_f = convert_float4(coord_in);\n\ \n\ - float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ - z0.zw = z0.zw + 2 * matrix1.z;\n\ - float4 z1 = z0 + 4 * matrix1.z;\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ \n\ - z0 = 1.0f / z0;\n\ - z1 = 1.0f / z1;\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ \n\ - coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ - float4 coord_f = coord_f0 * z0.xxyy;\n\ + vxc_uchar16 src0, src1, src_0, src_1, dst;\n\ + VXC_ReadImage(src_0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src_1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in = convert_int4(floor(coord_f));\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ \n\ - vxc_uchar16 src0, src1, dst;\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ #if (VX_VERSION==1)\n\ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ @@ -5615,21 +6432,22 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ #if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z0.zzww;\n\ - coord_in = convert_int4(floor(coord_f));\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ #if (VX_VERSION==1)\n\ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ @@ -5638,8 +6456,13 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src_0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src_1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ #if (VX_VERSION==1)\n\ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ @@ -5648,21 +6471,22 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.xxyy;\n\ - coord_in = convert_int4(floor(coord_f));\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ #if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ #if (VX_VERSION==1)\n\ VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ @@ -5671,36 +6495,10 @@ __kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ #endif\n\ \n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.zzww;\n\ - coord_in = convert_int4(floor(coord_f));\n\ - VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ - VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -#else\n\ - VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - src1.s0 = src0.s1;\n\ - VXC_Lerp(dst, 
src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -#endif\n\ -\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \\\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \\\n\ - VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0));\n\ -__kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\ +__kernel void custom_warp_affine_nearest_neighbor_U8toU8_rgb\n\ (\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -5709,28 +6507,20 @@ __kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\ float _m2,\n\ float _m3,\n\ float _m4,\n\ - float _m5,\n\ - float _m6,\n\ - float _m7,\n\ - float _m8\n\ + float _m5\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ \n\ - float4 coord_f0 = convert_float4(coord_in);\n\ -\n\ - float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ - z0.zw = z0.zw + 2 * matrix1.z;\n\ - float4 z1 = z0 + 4 * matrix1.z;\n\ -\n\ - z0 = 1.0f / z0;\n\ - z1 = 1.0f / z1;\n\ + float4 coord_f = convert_float4(coord_in);\n\ \n\ - coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ - float4 coord_f = coord_f0 * z0.xxyy;\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ \n\ - coord_in = convert_int4(coord_f);\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ \n\ int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ int8 input_desc;\n\ @@ -5739,28 +6529,391 @@ __kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\ _viv_asm(MOV, coord_input.w, baseAddr);\n\ \n\ vxc_uchar16 dst;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 0, 0)\n\ - coord_input.xy = coord_in.zw;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 1, 1)\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z0.zzww;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 2, 2)\n\ - coord_input.xy = coord_in.zw;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 3, 3)\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.xxyy;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 4, 4)\n\ - coord_input.xy = coord_in.zw;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 5, 5)\n\ - coord_f0 = coord_f0.zwzw + matrix4;\n\ - coord_f = coord_f0 * z1.zzww;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - IMAGE_LOAD_3D(dst, 0, 0, 6, 6)\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 
0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.x = coord_input.x + 1;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_bilinear_U8toU8_rgb\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0) * 3, get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in.x = floor(coord_f.x) * 3;\n\ + coord_in.y = floor(coord_f.y);\n\ + coord_in.z = floor(coord_f.z) * 3;\n\ + coord_in.w = floor(coord_f.w);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 src0, src1, src_0, src_1, dst;\n\ + VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ +\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, src_0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src_1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + src0.x = src_0.s0;\n\ + src0.y = src_0.s3;\n\ + src1.x = src_1.s0;\n\ + src1.y = src_1.s3;\n\ +#if (VX_VERSION==1)\n\ + 
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s1;\n\ + src0.y = src_0.s4;\n\ + src1.x = src_1.s1;\n\ + src1.y = src_1.s4;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + src0.x = src_0.s2;\n\ + src0.y = src_0.s5;\n\ + src1.x = src_1.s2;\n\ + src1.y = src_1.s5;\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of custom_warp_affine_rgb_vx*/ + +static const char custom_warp_perspective_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float4 matrix0;\n\ +_viv_uniform float4 matrix1;\n\ +_viv_uniform float4 matrix2;\n\ +_viv_uniform float4 matrix4;\n\ +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_perspective_bilinear_U8toU8_2D\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(floor(coord_f));\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, 
coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(floor(coord_f));\n\ + VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ + VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#else\n\ + VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + src1.s0 = src0.s1;\n\ + VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +#endif\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \\\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \\\n\ + VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0));\n\ +__kernel void custom_warp_perspective_nearest_neighbor_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5,\n\ + float _m6,\n\ + float _m7,\n\ + float _m8\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f0 = convert_float4(coord_in);\n\ +\n\ + float4 z0 = coord_f0.xzxz * matrix1.zzzz + 
coord_f0.y * matrix1.wwww + matrix2.xxxx;\n\ + z0.zw = z0.zw + 2 * matrix1.z;\n\ + float4 z1 = z0 + 4 * matrix1.z;\n\ +\n\ + z0 = 1.0f / z0;\n\ + z1 = 1.0f / z1;\n\ +\n\ + coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + float4 coord_f = coord_f0 * z0.xxyy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 dst;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 0, 0)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 1, 1)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z0.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 2, 2)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 3, 3)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.xxyy;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 4, 4)\n\ + coord_input.xy = coord_in.zw;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 5, 5)\n\ + coord_f0 = coord_f0.zwzw + matrix4;\n\ + coord_f = coord_f0 * z1.zzww;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + IMAGE_LOAD_3D(dst, 0, 0, 6, 6)\n\ coord_input.xy = coord_in.zw;\n\ IMAGE_LOAD_3D(dst, 0, 0, 7, 7)\n\ \n\ @@ -8432,6 +9585,7 @@ __kernel void gather_I8toI8(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_char16 src;\n\ @@ -8456,6 +9610,7 @@ __kernel void gather_U8toU8(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_uchar16 src;\n\ @@ -8479,9 +9634,9 @@ __kernel void gather_I16toI16(\n\ int gidz = get_global_id(2); // block_num\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ -\n\ \n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -8506,6 +9661,7 @@ __kernel void gather_F16toF16(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -8526,6 +9682,7 @@ __kernel void gather_I8toI8_axis0(\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 indices = read_imagei(input1, coord.xx);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ \n\ vxc_char16 src, dst;\n\ @@ -8552,6 +9709,7 @@ __kernel void gather_U8toU8_axis0(\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 indices = read_imagei(input1, coord.xx);\n\ + indices = indices >= 0 ? 
indices : indices + axis_num;\n\ int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ \n\ vxc_uchar16 src, dst;\n\ @@ -8578,6 +9736,7 @@ __kernel void gather_I16toI16_axis0(\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 indices = read_imagei(input1, coord.xx);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ \n\ vxc_short8 src, dst;\n\ @@ -8604,6 +9763,7 @@ __kernel void gather_F16toF16_axis0(\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 indices = read_imagei(input1, coord.xx);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ \n\ vxc_short8 src, dst;\n\ @@ -8640,6 +9800,7 @@ __kernel void gather_I8toI8_array(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 1);\n\ @@ -8668,6 +9829,7 @@ __kernel void gather_U8toU8_array(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 1);\n\ @@ -8695,9 +9857,9 @@ __kernel void gather_I16toI16_array(\n\ int gidz = get_global_id(2); // block_num\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ -\n\ \n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 2);\n\ @@ -8727,6 +9889,7 @@ __kernel void gather_F16toF16_array(\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ \n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 2);\n\ @@ -8764,6 +9927,7 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \\\n\ uchar* output_ptr = get_image_ptr_from_coord(img2, coord.xy); \\\n\ __global data_type* data_ptr = (__global data_type*)input_ptr; \\\n\ __global write_type* out_ptr = (__global write_type*)output_ptr; \\\n\ + indices = indices >= 0 ? indices : indices + axis_num; \\\n\ src.s0 = data_ptr[indices.x]; \\\n\ src.s1 = data_ptr[indices.y]; \\\n\ src.s2 = data_ptr[indices.z]; \\\n\ @@ -8804,6 +9968,7 @@ __kernel void gather_batch_I8toI8(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ vxc_char16 src;\n\ @@ -8834,6 +9999,7 @@ __kernel void gather_batch_U8toU8(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ vxc_uchar16 src;\n\ @@ -8864,6 +10030,7 @@ __kernel void gather_batch_I16toI16(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? 
indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -8894,6 +10061,7 @@ __kernel void gather_batch_F16toF16(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -8915,6 +10083,7 @@ __kernel void gather_batch_I8toI8_axis0(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ \n\ vxc_char16 src, dst;\n\ @@ -8943,6 +10112,7 @@ __kernel void gather_batch_U8toU8_axis0(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ \n\ vxc_uchar16 src, dst;\n\ @@ -8971,6 +10141,7 @@ __kernel void gather_batch_I16toI16_axis0(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ \n\ vxc_short8 src, dst;\n\ @@ -8999,6 +10170,7 @@ __kernel void gather_batch_F16toF16_axis0(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ \n\ vxc_short8 src, dst;\n\ @@ -9020,6 +10192,12 @@ __kernel void gather_batch_F16toF16_axis0(\n\ static const char gather_elements_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int axis_size;\n\ +_viv_uniform uint width0;\n\ +_viv_uniform uint height0;\n\ +_viv_uniform uint width1;\n\ +_viv_uniform uint height1;\n\ +_viv_uniform uint width_out;\n\ +_viv_uniform uint height_out;\n\ \n\ #define GATHER_ELEMENTS_AXIS0_2D(name, data_type) \\\n\ __kernel void gather_elements_axis0_##name##_I32to##name##_2D \\\n\ @@ -9170,6 +10348,144 @@ GATHER_ELEMENTS_AXIS2(F16, vxc_short4)\n\ GATHER_ELEMENTS_AXIS2(I16, vxc_short4)\n\ GATHER_ELEMENTS_AXIS2(I8, vxc_char4)\n\ GATHER_ELEMENTS_AXIS2(U8, vxc_uchar4)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr 
output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1)\n\ +\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + 
__write_only image2d_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + Image index_img = create_image_from_image2d(input1, 4); \\\n\ + int* index_ptr = (int*)index_img.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1]; \\\n\ + \\\n\ + Image input_img = create_image_from_image2d(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\ + data_type data = input_ptr[index + coord.y * width0]; \\\n\ + \\\n\ + Image output_img = create_image_from_image2d(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int axis \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + Image index_img = create_image_from_image2d(input1, 4); \\\n\ + int* index_ptr = (int*)index_img.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1]; \\\n\ + \\\n\ + Image input_img = create_image_from_image2d(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\ + data_type data = input_ptr[coord.x + index * width0]; \\\n\ + \\\n\ + Image output_img = create_image_from_image2d(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1)\n\ +\n\ +\n\ "; /* end of gather_elements_vx*/ static const char gather_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -9198,6 +10514,7 @@ __kernel void gather_##src0_type_name##toF16( \\\n\ \\\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\ int4 indice = read_imagei(input1, coord_in.xy); \\\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\ coord_in.w = gidz * axis_num + indice.x; \\\n\ \\\n\ read_type src; \\\n\ @@ -9234,6 +10551,7 @@ __kernel void gather_F16to##src1_type_name( \\\n\ int4 coord_in = (int4)(gidy, 0, gidx, 0); \\\n\ \\\n\ int4 indice = read_imagei(input1, coord_in.xy); \\\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\ coord_in.w = gidz * axis_num + indice.x; \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -9266,6 +10584,7 @@ __kernel void gather_I16toF16(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -9296,6 +10615,7 @@ __kernel void gather_##src0_type_name##toF16_axis0( \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ int4 indices = read_imagei(input1, coord.xx); \\\n\ + indices = indices >= 0 ? 
indices : indices + axis_num; \\\n\ int2 coord_in = (int2)(indices.x, get_global_id(1)); \\\n\ \\\n\ read_type src; \\\n\ @@ -9327,6 +10647,7 @@ __kernel void gather_F16to##src1_type_name##_axis0( \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ int4 indices = read_imagei(input1, coord.xx); \\\n\ + indices = indices >= 0 ? indices : indices + axis_num; \\\n\ int2 coord_in = (int2)(indices.x, get_global_id(1)); \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -9358,6 +10679,7 @@ __kernel void gather_I16toF16_axis0(\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 indices = read_imagei(input1, coord.xx);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ int2 coord_in = (int2)(indices.x, get_global_id(1));\n\ \n\ vxc_short8 src;\n\ @@ -9414,6 +10736,7 @@ __kernel void gather_batch_##src0_type_name##toF16( \\\n\ { \\\n\ int4 indice = read_imagei(input1, coord_idx); \\\n\ coord_idx.y++; \\\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\ coord_in.y = gidz * axis_num + indice.x; \\\n\ \\\n\ read_type src; \\\n\ @@ -9459,6 +10782,7 @@ __kernel void gather_batch_F16to##src1_type_name( \\\n\ { \\\n\ int4 indice = read_imagei(input1, coord_idx); \\\n\ coord_idx.y++; \\\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num; \\\n\ coord_in.y = gidz * axis_num + indice.x; \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -9501,6 +10825,7 @@ __kernel void gather_batch_I16toF16(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -9526,6 +10851,7 @@ __kernel void gather_batch_##src0_type_name##toF16_axis0( \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ int4 indices = read_imagei(input1, coord.xz); \\\n\ + indices = indices >= 0 ? indices : indices + axis_num; \\\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \\\n\ \\\n\ read_type src; \\\n\ @@ -9560,6 +10886,7 @@ __kernel void gather_batch_F16to##src1_type_name##_axis0( \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ int4 indices = read_imagei(input1, coord.xz); \\\n\ + indices = indices >= 0 ? indices : indices + axis_num; \\\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -9594,6 +10921,7 @@ __kernel void gather_batch_I16toF16_axis0(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? 
indices : indices + axis_num;\n\ int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ \n\ vxc_short8 src, dst;\n\ @@ -10083,95 +11411,98 @@ static const char gather_nd_batch_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void gather_nd_batch_I8toI8_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ -\n\ - coord.z = indice.x * block_size + gidx;\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ vxc_char16 src;\n\ - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void gather_nd_batch_U8toU8_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.z = indice.x * block_size + gidx;\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ vxc_uchar16 src;\n\ - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void gather_nd_batch_I16toI16_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = 
get_image_ptr_from_coord(img, coord.wy);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.z = indice.x * block_size + gidx;\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ vxc_short8 src;\n\ - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void gather_nd_batch_F16toF16_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.z = indice.x * block_size + gidx;\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ vxc_short8 src;\n\ - VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of gather_nd_batch_vx*/ @@ -10179,18 +11510,19 @@ static const char gather_nd_batch_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void gather_nd_batch_I8toI8_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, 0, gidy, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ @@ -10199,23 +11531,24 @@ __kernel void gather_nd_batch_I8toI8_2D(\n\ vxc_char16 src;\n\ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ 
\n\ __kernel void gather_nd_U8toU8_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, 0, gidy, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ @@ -10223,23 +11556,24 @@ __kernel void gather_nd_U8toU8_2D(\n\ \n\ vxc_uchar16 src;\n\ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void gather_nd_I16toI16_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, 0, gidy, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ @@ -10247,23 +11581,24 @@ __kernel void gather_nd_I16toI16_2D(\n\ \n\ vxc_short8 src;\n\ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ __kernel void gather_nd_F16toF16_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch num\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ \n\ - int4 coord = (int4)(gidx, 0, gidy, 0);\n\ - Image img = create_image_from_image2d(input1, 4);\n\ - uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ @@ -10271,7 +11606,7 @@ __kernel void gather_nd_F16toF16_2D(\n\ \n\ vxc_short8 src;\n\ 
VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of gather_nd_batch_2d_vx*/ @@ -10733,12 +12068,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ src_type src0; \\\n\ dst_type dst; \\\n\ vxc_short8 src1; \\\n\ @@ -10784,7 +12120,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -10834,12 +12170,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ src_type src0; \\\n\ dst_type dst; \\\n\ float scale_vari, bias_val; \\\n\ @@ -10880,7 +12217,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -10938,12 +12275,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 
coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ src_type src0; \\\n\ vxc_short8 src1, outval; \\\n\ vxc_half8 scale_h, dst; \\\n\ @@ -10996,7 +12334,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -11053,12 +12391,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ src_type src0; \\\n\ vxc_short8 outval; \\\n\ vxc_half8 dst; \\\n\ @@ -11107,7 +12446,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -11294,12 +12633,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx* rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ vxc_short8 src0; \\\n\ vxc_short8 src1; \\\n\ vxc_half8 scale_h; \\\n\ @@ -11351,7 +12691,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_array_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -11406,12 +12746,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + 
float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ + int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(gidx * rSpaceOrg) + convert_int(gidy * pStride)), gidz, 0, 0); \\\n\ vxc_short8 src0; \\\n\ src_type in_h; \\\n\ float scale_vari, bias_val; \\\n\ @@ -11458,7 +12799,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# __read_only image2d_t scale, \\\n\ __read_only image2d_t meanVari, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ + float eps, int is2D, float rSpaceOrg, float pStride) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int2 coord = (int2)(get_global_id(0), gidz); \\\n\ @@ -12731,8 +14072,8 @@ _viv_uniform VXC_512Bits uniConvertF16_0_4x4;\n\ _viv_uniform VXC_512Bits uniConvertF16_1_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ \n\ -#define GRUCELL_F16_F16TOF16(act_name, act_func) \\\n\ -__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \\\n\ +#define GRUCELL_F16_F16TOF16(act_name, act_func, rec_act_name, rec_act_func) \\\n\ +__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name##_##rec_act_name( \\\n\ __read_only image2d_t hstate_in, \\\n\ __read_only image2d_t input_z_conv, \\\n\ __read_only image2d_t input_r_conv, \\\n\ @@ -12764,15 +14105,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \\\n\ \\\n\ float4 r; \\\n\ VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ - r = act_func(r); \\\n\ + r = rec_act_func(r); \\\n\ float4 h0, h1; \\\n\ VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ float4 h = h0 + r * h1; \\\n\ float4 z; \\\n\ VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ - z = act_func(z); \\\n\ - h = tanh_func(h); \\\n\ + z = rec_act_func(z); \\\n\ + h = act_func(h); \\\n\ float4 h_tm; \\\n\ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ float4 result = (1 - z) * h + z * h_tm; \\\n\ @@ -12785,14 +14126,15 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(TANH, tanh_func, SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func)\n\ \n\ _viv_uniform float hstate_in_scale;\n\ _viv_uniform float hstate_in_tail;\n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ -#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \\\n\ -__kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name( \\\n\ +#define GRUCELL_QNT_F16TO_QNT(name, act_func, rec_act_func, src0_type, dst_type) \\\n\ +__kernel void grucell_reset_after_activation_##name( \\\n\ __read_only image2d_t hstate_in, \\\n\ __read_only image2d_t 
input_z_conv, \\\n\ __read_only image2d_t input_r_conv, \\\n\ @@ -12824,15 +14166,15 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name \\\n\ float4 r; \\\n\ VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ - r = act_func(r); \\\n\ + r = rec_act_func(r); \\\n\ float4 h0, h1; \\\n\ VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ float4 h = h0 + r * h1; \\\n\ float4 z; \\\n\ VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ - z = act_func(z); \\\n\ - h = tanh_func(h); \\\n\ + z = rec_act_func(z); \\\n\ + h = act_func(h); \\\n\ float4 h_tm; \\\n\ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ h_tm = h_tm * hstate_in_scale + hstate_in_tail; \\\n\ @@ -12845,9 +14187,12 @@ __kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ -GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ -GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\n\ +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, vxc_short8, vxc_short8)\n\ +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8)\n\ "; /* end of grucell_reset_after_activation_vx*/ static const char hswish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -21791,6 +23136,432 @@ __kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA,\n\ }\n\ "; /* end of matrixmul_bf16_vx*/ +static const char matrixmul_cross_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float mulKIn0In1Zp;\n\ +_viv_uniform float inOutScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\ +\n\ +#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \\\n\ + int axis_size, int inner_size, int outer_size, int axis_size0, \\\n\ + int inner_size0, int outer_size0, int axis_size1, int inner_size1, \\\n\ + int outer_size1, int axis_size2, int inner_size2, int outer_size2) \\\n\ +{ \\\n\ + read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \\\n\ + vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \\\n\ + int gidz = get_global_id(2); 
\\\n\ + for(int j = 0; j < outer_size; j++) \\\n\ + { \\\n\ + for(int i = 0; i < inner_size; i++) \\\n\ + { \\\n\ + vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \\\n\ + int4 coord_a = (int4)(0, get_global_id(1), gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + sum0 += tempA0 + tempB0; \\\n\ + sum1 += tempA1 + tempB1; \\\n\ + sum2 += tempA2 + tempB2; \\\n\ + sum3 += tempA3 + tempB3; \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = get_global_id(1); \\\n\ + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, 
coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +GEMM_QINT_TO_QINT_CROSS(U8, vxc_uchar16)\n\ +GEMM_QINT_TO_QINT_CROSS(I8, vxc_char16)\n\ +\n\ +__kernel void gemm_F16F16toF16_cross(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N,\n\ + int axis_size, int inner_size, int outer_size, int axis_size0,\n\ + int inner_size0, int outer_size0, int axis_size1, int inner_size1,\n\ + int outer_size1, int axis_size2, int inner_size2, int outer_size2)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ + uint gidz = get_global_id(2);\n\ + for(int j = 0; j < outer_size; j++)\n\ + {\n\ + for(int i = 0; i < inner_size; i++)\n\ + {\n\ + int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0);\n\ +\n\ + half4 valC;\n\ + vxc_short8 srcA0, srcA1, srcA2, srcA3, outC;\n\ + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3;\n\ + vxc_short16 srcB;\n\ + vxc_half16 tmpB;\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, 
VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4; coord_b.y += 4;\n\ + _viv_asm(COPY, tmpA0, srcA0, 16);\n\ + _viv_asm(COPY, tmpA1, srcA1, 16);\n\ + _viv_asm(COPY, tmpA2, srcA2, 16);\n\ + _viv_asm(COPY, tmpA3, srcA3, 16);\n\ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16);\n\ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16);\n\ + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + sum0 += (tempA0);\n\ + sum1 += (tempA1);\n\ + sum2 += (tempA2);\n\ + sum3 += (tempA3);\n\ + }\n\ + coord_b.y = gidy;\n\ + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(CONV, valC, sum0);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum1);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum2);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum3);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +"; /* end of matrixmul_cross_vx*/ + +static const char matrixmul_cross_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input0_ZP;\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +_viv_uniform int outer;\n\ +\n\ +#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + short in0_zp, in1_zp; \\\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \\\n\ + for(int i = 0; i < outer; i++) \\\n\ + { \\\n\ + read_type srcA, srcB, outC; \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 
i : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \\\n\ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \\\n\ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \\\n\ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2) + i * 
get_global_size(2); \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +GEMM_QINT_TO_QINT_MERGE(I16, vxc_short8)\n\ +\n\ +#define GEMM_QINT_TO_QINT_CROSS(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_cross( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N, \\\n\ + int axis_size, int inner_size, int outer_size, int axis_size0, \\\n\ + int inner_size0, int outer_size0, int axis_size1, int inner_size1, \\\n\ + int outer_size1, int axis_size2, int inner_size2, int outer_size2) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + uint gidz = get_global_id(2); \\\n\ + short in0_zp, in1_zp; \\\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \\\n\ + for(int j = 0; j < outer_size; j++) \\\n\ + { \\\n\ + for(int i = 0; i < inner_size; i++) \\\n\ + { \\\n\ + read_type srcA, srcB, outC; \\\n\ + int4 coord_a = (int4)(0, gidy, gidz * axis_size0 + i * inner_size0 + j * outer_size0, 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, gidz * axis_size1 + i * inner_size1 + j * outer_size1, 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \\\n\ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \\\n\ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \\\n\ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + coord_b.z = gidz * axis_size2 + i * inner_size2 + j * outer_size2; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, 
outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +GEMM_QINT_TO_QINT_CROSS(I16, vxc_short8)\n\ +"; /* end of matrixmul_cross_i16_vx*/ + static const char matrixmul_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ @@ -23001,6 +24772,302 @@ __kernel void gemm_transb_I16I16toI16(image2d_array_t inputA,\n\ }\n\ "; /* end of matrixmul_i16_vx*/ +static const char matrixmul_merge_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float mulKIn0In1Zp;\n\ +_viv_uniform float inOutScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +_viv_uniform int outer;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row0Hi_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Lo_4x4;\n\ +_viv_uniform VXC_512Bits uniGemmFp16toFp32Row1Hi_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;\n\ +\n\ +#define GEMM_QINT_TO_QINT_MERGE(src0_type_name, read_type) \\\n\ +__kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name##_merge( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + read_type srcA0, srcA1, srcA2, srcA3, srcB, outC; \\\n\ + vxc_float4 sum = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \\\n\ + for(int i = 0; i < outer; i++) \\\n\ + { \\\n\ + vxc_float4 sum0 = sum, sum1 = sum, sum2 = sum, sum3 = sum; \\\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0); \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP4x4(tempA3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8toFp32Block4_4x4); \\\n\ + VXC_DP8x4(tempB0, srcA0, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB1, srcA1, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB2, srcA2, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + VXC_DP8x4(tempB3, srcA3, srcB, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGemmU8U8MulZptoFp32_8x4); \\\n\ + sum0 += tempA0 + tempB0; \\\n\ + sum1 += tempA1 + tempB1; \\\n\ + sum2 += tempA2 + tempB2; \\\n\ + sum3 += tempA3 + tempB3; \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = get_global_id(1); \\\n\ + coord_b.z = get_global_id(2) + i * get_global_size(2); \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * inOutScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * inOutScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 
0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * inOutScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * inOutScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +GEMM_QINT_TO_QINT_MERGE(U8, vxc_uchar16)\n\ +GEMM_QINT_TO_QINT_MERGE(I8, vxc_char16)\n\ +\n\ +#if (VX_VERSION==2)\n\ +__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ + for(int i = 0; i < outer; i++)\n\ + {\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0);\n\ +\n\ + half4 valC;\n\ + vxc_short8 srcA0, srcA1, srcA2, srcA3, outC;\n\ + vxc_half8 tmpA0, tmpA1, tmpA2, tmpA3;\n\ + vxc_short16 srcB;\n\ + vxc_half16 tmpB;\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.hi, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB.lo, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4; coord_b.y += 4;\n\ + _viv_asm(COPY, tmpA0, srcA0, 16);\n\ + _viv_asm(COPY, tmpA1, srcA1, 16);\n\ + _viv_asm(COPY, tmpA2, srcA2, 16);\n\ + _viv_asm(COPY, tmpA3, srcA3, 16);\n\ + _viv_asm(COPY, tmpB.hi, srcB.hi, 16);\n\ + _viv_asm(COPY, tmpB.lo, srcB.lo, 16);\n\ + VXC_DP4x4_b(tempA0, tmpB.hi, tmpB.lo, tmpA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + 
uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA1, tmpB.hi, tmpB.lo, tmpA1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA2, tmpB.hi, tmpB.lo, tmpA2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + VXC_DP4x4_b(tempA3, tmpB.hi, tmpB.lo, tmpA3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmU8F16toF32Lo_4x4b);\n\ + sum0 += (tempA0);\n\ + sum1 += (tempA1);\n\ + sum2 += (tempA2);\n\ + sum3 += (tempA3);\n\ + }\n\ + coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2) + i * get_global_size(2);\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(CONV, valC, sum0);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum1);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum2);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum3);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +#else\n\ +__kernel void gemm_F16F16toF16_merge(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ + for(int i = 0; i < outer; i++)\n\ + {\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? i : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0);\n\ +\n\ + half4 valC;\n\ + vxc_short8 srcA0, srcB0, srcA1, srcB1, outC;\n\ + vxc_half8 tmpA0, tmpB0, tmpA1, tmpB1;\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4; coord_b.y += 4;\n\ + _viv_asm(COPY, tmpA0, srcA0, 16);\n\ + _viv_asm(COPY, tmpB0, srcB0, 16);\n\ + _viv_asm(COPY, tmpA1, srcA1, 16);\n\ + _viv_asm(COPY, tmpB1, srcB1, 16);\n\ +\n\ + VXC_DP4x4(tempA0, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row0Lo_4x4);\n\ + VXC_DP4x4(tempB0, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row0Hi_4x4);\n\ + VXC_DP4x4(tempA1, tmpA0, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row1Lo_4x4);\n\ + VXC_DP4x4(tempB1, tmpA0, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row1Hi_4x4);\n\ + VXC_DP4x4(tempA2, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row0Lo_4x4);\n\ + VXC_DP4x4(tempB2, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row0Hi_4x4);\n\ + VXC_DP4x4(tempA3, tmpA1, tmpB0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row1Lo_4x4);\n\ + VXC_DP4x4(tempB3, tmpA1, tmpB1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniGemmFp16toFp32Row1Hi_4x4);\n\ + sum0 += (tempA0 + tempB0);\n\ + sum1 += (tempA1 + tempB1);\n\ + sum2 += (tempA2 + tempB2);\n\ + sum3 += (tempA3 + tempB3);\n\ + }\n\ + coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2) + i * get_global_size(2);\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord_b.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr);\n\ + _viv_asm(CONV, valC, sum0);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, 
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum1);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum2);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_b.y++;\n\ + _viv_asm(CONV, valC, sum3);\n\ + _viv_asm(COPY, outC, valC, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +#endif\n\ +"; /* end of matrixmul_merge_vx*/ + static const char matrixmul_transA_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int input0_ZP;\n\ @@ -27977,6 +30044,791 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis01_BF1 }\n\ }"; /* end of moments_u8_axis012_vx*/ +static const char nearest_grid_sample_BF16_to_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8;\n\ +_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8;\n\ +\n\ +#define GRID_SAMPLE_BF16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 src; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x 
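The gemm_F16F16toF16_merge kernels above tile a batched fp16 matrix multiply: each work-item walks K in steps of 4 and accumulates a 4x4 block of C in fp32 before converting back to fp16, with ac2zero/bc2zero selecting which input is broadcast across the outer batch loop. A minimal scalar sketch of that computation, assuming row-major float buffers and using a_broadcast/b_broadcast as stand-ins for the kernel's batch handling (names are illustrative, not the driver's host API):

/* Scalar reference for the batched GEMM that gemm_F16F16toF16_merge tiles in
 * 4x4 blocks along K. Hypothetical helper for illustration only; the real
 * kernel works on fp16 image tiles and folds the batch broadcast
 * (ac2zero/bc2zero) into the z coordinate of the image loads. */
#include <stddef.h>

static void gemm_ref(const float *A, const float *B, float *C,
                     size_t M, size_t K, size_t N,
                     size_t batches, int a_broadcast, int b_broadcast)
{
    for (size_t b = 0; b < batches; ++b) {
        const float *Ab = A + (a_broadcast ? 0 : b * M * K);   /* shared A when broadcast */
        const float *Bb = B + (b_broadcast ? 0 : b * K * N);   /* shared B when broadcast */
        float       *Cb = C + b * M * N;
        for (size_t m = 0; m < M; ++m)
            for (size_t n = 0; n < N; ++n) {
                float acc = 0.0f;                              /* fp32 accumulator, like sum0..sum3 */
                for (size_t k = 0; k < K; ++k)
                    acc += Ab[m * K + k] * Bb[k * N + n];
                Cb[m * N + n] = acc;
            }
    }
}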
= x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_BF16_BF16toBF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + vxc_short8 read_src;\n\ + VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8);\n\ + _viv_asm(COPY, fxy0, read_src, 16);\n\ + VXC_DP2x8(read_src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8);\n\ + _viv_asm(COPY, fxy1, read_src, 16);\n\ +\n\ +\n\ +\n\ + GRID_SAMPLE_BF16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of nearest_grid_sample_BF16_to_BF16_vx*/ + +static const char nearest_grid_sample_F16_to_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;\n\ +_viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ +\n\ +#define GRID_SAMPLE_F16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 src; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ 
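The nearest_grid_sample_* kernels above first map the normalized grid values into pixel space with fxy * half_input0_wh + add_float_value and then truncate with convert_int4; the resulting (x, y) pair is gathered once per channel in the depth loop. A scalar sketch of that index math, assuming the two uniforms are host-folded from the usual align_corners formulas (the driver may additionally fold a rounding offset into add_float_value; that detail is not inferred here):

/* Scalar sketch of the nearest-neighbour grid_sample index math.
 * Assumed host folding per axis:
 *   align_corners: half_extent = (size - 1) * 0.5f, add_value = (size - 1) * 0.5f
 *   otherwise:     half_extent =  size      * 0.5f, add_value =  size * 0.5f - 0.5f
 * so a grid value g in [-1, 1] lands in pixel space. Illustration only. */
static int nearest_index(float g, float half_extent, float add_value)
{
    float pixel = g * half_extent + add_value;  /* normalized -> pixel space */
    return (int)pixel;                          /* truncate toward zero, as convert_int4 does */
}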
+ { \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, src, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_F16_F32toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +__kernel void nearest_grid_sample_F16_U8toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_uchar16 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void nearest_grid_sample_F16_F16toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + 
_viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of nearest_grid_sample_F16_to_F16_vx*/ + +static const char nearest_grid_sample_F16_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +#define GRID_SAMPLE_F16_to_U8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 s0; \\\n\ + vxc_uchar16 result; \\\n\ + vxc_half8 src; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, s0, 16); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + float4 dst4; \\\n\ + int4 dst; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = 
x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, s0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, s0, 16); \\\n\ + } \\\n\ + VXC_DP4x4(dst4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_F16_F32toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_F16_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_uchar16 read_coord;\n\ +\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_F16_F16toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +"; /* end of 
nearest_grid_sample_F16_to_U8_vx*/ + +static const char nearest_grid_sample_I16_to_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;\n\ +\n\ +\n\ +#define GRID_SAMPLE_I16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 src, dst; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_I16_I16toI16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t 
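The F16-to-U8 grid-sample kernels above store each gathered value as dst4 * uint8Scale + output_ZP, round with convert_int4_rte and saturate to 8 bits in the final VXC_DP2x8 extract. The same step in scalar C, as a sketch rather than the driver's code:

#include <math.h>
#include <stdint.h>

/* Scalar sketch of the F16 -> asymmetric U8 store used by the *_toU8 kernels:
 * out = clamp(round(v * uint8Scale + output_ZP)). uint8Scale stands for the
 * reciprocal output scale folded on the host; the clamp models the saturating
 * extract. Illustrative only (link with -lm for lrintf). */
static uint8_t quantize_u8(float v, float uint8Scale, float output_ZP)
{
    float q = v * uint8Scale + output_ZP;
    long  r = lrintf(q);                  /* round to nearest, like convert_int4_rte */
    if (r < 0)   r = 0;
    if (r > 255) r = 255;
    return (uint8_t)r;
}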
input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_short8 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1_scale;\n\ + fxy1 = fxy1 * input1_scale;\n\ +\n\ + GRID_SAMPLE_I16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of nearest_grid_sample_I16_to_I16_vx*/ + +static const char nearest_grid_sample_I8_to_I8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertI8toI8_2x8;\n\ +\n\ +#define GRID_SAMPLE_I8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_char16 src, dst; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), 
VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertI8toI8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_I8_I8toI8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_char16 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1_scale;\n\ + fxy1 = fxy1 * input1_scale;\n\ +\n\ + GRID_SAMPLE_I8_PROCESS();\n\ +\n\ +}\n\ +"; /* end of nearest_grid_sample_I8_to_I8_vx*/ + +static const char nearest_grid_sample_U8_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +#define GRID_SAMPLE_U8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + int4 x_idx = convert_int4(in_x); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + int4 y_idx = convert_int4(in_y); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_uchar16 src, dst; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, 
output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP2x8(dst, src, multiplier, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, src, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP2x8(dst, src, multiplier, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniMultiplyAndPostShift_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_U8_F32toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_U8_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_uchar16 read_coord;\n\ +\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void nearest_grid_sample_U8_F16toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ 
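The U8-to-U8 path above rescales with uniMultiplyAndPostShift_2x8 and the packed multAndoutZP constant instead of going through float. A common scalar form of that multiply-and-post-shift requantization is sketched below; mult, shift and the zero points are hypothetical host-side parameters, since the exact packing is decided where the uniform is built:

#include <stdint.h>

/* Scalar sketch of a multiply-and-post-shift requantization: a U8 input is
 * moved to the output quantization with an integer multiplier, a rounding
 * right shift (shift >= 1 assumed) and an output zero point, then saturated.
 * Parameter names are illustrative, not the driver's packing. */
static uint8_t requantize_u8(uint8_t in, int in_zp,
                             int32_t mult, int shift, int out_zp)
{
    int32_t v = ((int32_t)in - in_zp) * mult;
    v = (v + (1 << (shift - 1))) >> shift;   /* rounding right shift */
    v += out_zp;
    if (v < 0)   v = 0;
    if (v > 255) v = 255;
    return (uint8_t)v;
}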
+ int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +"; /* end of nearest_grid_sample_U8_to_U8_vx*/ + static const char one_hot_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataConvert_0_4x4;\n\ @@ -29077,8 +31929,8 @@ __kernel void pow_##name \\\n\ \\\n\ src0_type src0; \\\n\ copy0_type data0; \\\n\ - src0_type src1; \\\n\ - copy0_type data1; \\\n\ + src1_type src1; \\\n\ + copy1_type data1; \\\n\ VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, data0, src0, 16); \\\n\ VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -29149,8 +32001,8 @@ __kernel void pow_##name##_2D \\\n\ \\\n\ src0_type src0; \\\n\ copy0_type data0; \\\n\ - src0_type src1; \\\n\ - copy0_type data1; \\\n\ + src1_type src1; \\\n\ + copy1_type data1; \\\n\ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, data0, src0, 16); \\\n\ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -29331,9 +32183,21 @@ _viv_uniform int zp;\n\ _viv_uniform float outputScale;\n\ \n\ __kernel void pre_process_bgra_scale_U8toU8(\n\ - __read_only image2d_array_t input, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float r_scale,\n\ + int reverse_channel,\n\ + int trans,\n\ + float g_scale,\n\ + float b_scale\n\ + )\n\ {\n\ int4 gidx = get_global_id(0);\n\ int gidy = get_global_id(1);\n\ @@ -29389,6 +32253,7 @@ __kernel void pre_process_bgra_scale_U8toU8(\n\ int4 tmp1, tmp2, result1, result2;\n\ float4 tmpDst, tmp0;\n\ float4 mean = (float4)(bMean, gMean, rMean, 0);\n\ + float4 var = (float4)(b_scale, g_scale, r_scale, 0);\n\ //tmpFx = (int4)(fx.x, fx.x, fx.x, fx.x);\n\ int tmpV = 1 << 19;\n\ vxc_short8 tmpFx;\n\ @@ -29451,9 +32316,21 @@ __kernel void pre_process_bgra_scale_U8toU8(\n\ }\n\ \n\ __kernel void pre_process_bgra_copy_U8toU8(\n\ - __read_only image2d_array_t input, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + 
float bMean,\n\ + float r_scale,\n\ + int reverse_channel,\n\ + int trans,\n\ + float g_scale,\n\ + float b_scale\n\ +)\n\ {\n\ int2 pos = (int2)((get_global_id(0) + (*xOffset)) << 2, get_global_id(1) + (*yOffset));\n\ \n\ @@ -29468,10 +32345,10 @@ __kernel void pre_process_bgra_copy_U8toU8(\n\ VXC_DP4x4(tmpG, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGfromBgra_4x4);\n\ VXC_DP4x4(tmpR, lineBGRA0, lineBGRA0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRfromBgra_4x4);\n\ \n\ - tmpDst = (tmpB - bMean) * var;\n\ + tmpDst = (tmpB - bMean) * b_scale;\n\ result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ \n\ - tmpDst = (tmpG - gMean) * var;\n\ + tmpDst = (tmpG - gMean) * g_scale;\n\ result2 = convert_int4_rte(tmpDst * outputScale + zp);\n\ VXC_DP2x8(dst, result1, result2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ \n\ @@ -29481,7 +32358,7 @@ __kernel void pre_process_bgra_copy_U8toU8(\n\ dstPos.z = 1;\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - tmpDst = (tmpR - rMean) * var;\n\ + tmpDst = (tmpR - rMean) * r_scale;\n\ result1 = convert_int4_rte(tmpDst * outputScale + zp);\n\ VXC_DP2x8(dst, result1, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ \n\ @@ -30016,7 +32893,10 @@ static const char pre_process_nv12_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform int bOrder;\n\ _viv_uniform int rOrder;\n\ \n\ -_viv_uniform float outputScaleVar;\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ +\n\ _viv_uniform float bMeanScaleVarZp;\n\ _viv_uniform float gMeanScaleVarZp;\n\ _viv_uniform float rMeanScaleVarZp;\n\ @@ -30041,10 +32921,12 @@ __kernel void pre_process_nv12_copy_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ int trans, \\\n\ - int nv_type \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -30078,21 +32960,21 @@ __kernel void pre_process_nv12_copy_##name \\\n\ dst_type dst0; \\\n\ save_type dst; \\\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ dstPos.z = bOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ dstPos.z = 1; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ dstPos.z = rOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -30110,7 +32992,10 @@ static 
const char pre_process_nv12_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform int bOrder;\n\ _viv_uniform int rOrder;\n\ \n\ -_viv_uniform float outputScaleVar;\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ +\n\ _viv_uniform float bMeanScaleVarZp;\n\ _viv_uniform float gMeanScaleVarZp;\n\ _viv_uniform float rMeanScaleVarZp;\n\ @@ -30143,10 +33028,12 @@ __kernel void pre_process_nv12_scale_##name##_gq \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ int trans, \\\n\ - int nv_type \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ uint4 gidx = get_global_id(0); \\\n\ @@ -30200,21 +33087,21 @@ __kernel void pre_process_nv12_scale_##name##_gq \\\n\ dst_type dst0; \\\n\ save_type dst; \\\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ dstPos.z = bOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ dstPos.z = 1; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ dstPos.z = rOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ @@ -30239,10 +33126,12 @@ __kernel void pre_process_nv12_scale_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ int trans, \\\n\ - int nv_type \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ uint4 gidx = get_global_id(0); \\\n\ @@ -30268,102 +33157,445 @@ __kernel void pre_process_nv12_scale_##name \\\n\ coord.x = sx.w; \\\n\ VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_uv.x = uvX.y; \\\n\ - VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_uv.x = uvX.z; \\\n\ - VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_uv.x = uvX.w; \\\n\ - VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.y; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.z; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.w; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 
0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (nv_type == 1) \\\n\ + { \\\n\ + UV.s01234567 = UV.s10325476; \\\n\ + } \\\n\ + \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ +NV12_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +"; /* end of pre_process_nv12_scale_vx*/ + +static const char pre_process_rgb_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +_viv_uniform VXC_512Bits uniUnpackToR;\n\ +_viv_uniform VXC_512Bits uniUnpackToG;\n\ +_viv_uniform VXC_512Bits uniUnpackToB;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform int r_order;\n\ +_viv_uniform int b_order;\n\ +\n\ +#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\ +\n\ +#define IMAGE_PRE_PROCESS(dst_name, conv_type, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb_scale_U8to##dst_name \\\n\ + ( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); 
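With var split into r_scale/g_scale/b_scale, the NV12 pre-process kernels above fold each channel's normalization into two constants, outputScaleVar_c and cMeanScaleVarZp, so the per-pixel work stays a single multiply-add. The algebra behind that fold, written out as a sketch (the driver's host code is assumed to compute the same two values):

/* Per-channel fold behind outputScaleVar_{b,g,r} and {b,g,r}MeanScaleVarZp:
 *   out = (pix - mean) * chan_scale * output_scale + output_zp
 *       =  pix * fold_scale(...)    + fold_bias(...)
 * Sketch of the assumed host-side computation, not the driver's code. */
static float fold_scale(float chan_scale, float output_scale)
{
    return chan_scale * output_scale;                      /* outputScaleVar_c   */
}

static float fold_bias(float mean, float chan_scale,
                       float output_scale, float output_zp)
{
    return output_zp - mean * chan_scale * output_scale;   /* cMeanScaleVarZp    */
}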
\\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + /*x*/ \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ + /*y*/ \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0RGB1, line0RGB2; \\\n\ + vxc_uchar16 line1RGB3, line1RGB4; \\\n\ + int4 coord; \\\n\ + sx = (sx + (*xOffset)) * 3; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ + int2 coord1 = (int2)(sx.w, coord.w); \\\n\ + VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \\\n\ + \\\n\ + bgrMean *= (float4)(b_scale, g_scale, r_scale, 0); \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 tt; \\\n\ + vxc_uchar4 val; \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, r_order, 0); \\\n\ + \\\n\ + vxc_uchar8 line1, line2; \\\n\ + \\\n\ + /*R*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ + \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + /*convert U8 to dst*/ \\\n\ + dst_type dst; \\\n\ + tmp_dst = tmp_dst * r_scale - bgrMean.zzzz; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + conv_type dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), 
uniExtract8Data_2x8); \\\n\ + copy_type result; \\\n\ + _viv_asm(COPY, result, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + /*G*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ + \\\n\ + coord_out.z = 1; \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * g_scale - bgrMean.y; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, result, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + /*B*/ \\\n\ + VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ + VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ + \\\n\ + coord_out.z = b_order; \\\n\ + VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * b_scale - bgrMean.x; \\\n\ + tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ + _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, result, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +IMAGE_PRE_PROCESS(U8, uint4, vxc_uchar16, vxc_uchar16)\n\ +IMAGE_PRE_PROCESS(I8, int4, vxc_char16, vxc_char16)\n\ +IMAGE_PRE_PROCESS(I16, int4, vxc_short8, vxc_short8)\n\ +IMAGE_PRE_PROCESS(F16, half4, vxc_half8, vxc_short8)\n\ +"; /* end of pre_process_rgb_vx*/ + +static const char pre_process_rgb888_planar_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits 
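The pre_process_rgb scale kernels above resize with fixed-point bilinear filtering before the per-channel normalization: the x/y ratios behave like Q15 step sizes, the per-pixel fractions are reduced to Q10 (uniAddRShift), the horizontal blend stays in Q10, the vertical blend lands in Q20, and DESCALE rounds back with (v + (1 << 19)) >> 20. A scalar sketch of that pipeline; the Q-format reading is an assumption reconstructed from the shift constants, and edge clamping plus the RGB de-interleave are omitted:

#include <stdint.h>

/* Scalar sketch of the assumed Q15/Q10 fixed-point bilinear resample.
 * x_ratio_q15/y_ratio_q15 are taken to be (in_size << 15) / out_size as
 * prepared by the host; src points at a single-channel plane and the caller
 * guarantees the sampled 2x2 window is in-bounds. Illustration only. */
static uint8_t bilinear_fixed(const uint8_t *src, int stride,
                              int x_ratio_q15, int y_ratio_q15,
                              int dst_x, int dst_y)
{
    int fx0 = dst_x * x_ratio_q15 + (x_ratio_q15 >> 1) - (1 << 14);
    int fy0 = dst_y * y_ratio_q15 + (y_ratio_q15 >> 1) - (1 << 14);
    int sx  = (int)(fx0 & 0xffff8000);            /* integer part, still in Q15 */
    int sy  = (int)(fy0 & 0xffff8000);
    int fx  = ((fx0 - sx) + (1 << 4)) >> 5;       /* Q15 -> Q10 fraction, rounded */
    int fy  = ((fy0 - sy) + (1 << 4)) >> 5;
    sx >>= 15;
    sy >>= 15;

    const uint8_t *p0 = src + sy * stride + sx;
    const uint8_t *p1 = p0 + stride;
    int top = (p0[0] << 10) + fx * (p0[1] - p0[0]);   /* Q10 horizontal blend */
    int bot = (p1[0] << 10) + fx * (p1[1] - p1[0]);
    int v   = (top << 10) + fy * (bot - top);         /* Q20 vertical blend   */
    return (uint8_t)((v + (1 << 19)) >> 20);          /* DESCALE rounding     */
}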
uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int4 rgb_order;\n\ +\n\ +#define RESIZE_BILINEAR_4X1(scale, mean, output, _coord) \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.w; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + coord_in.x = coord.x; \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, _coord, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; 
\\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAddRShift); \\\n\ + \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ + int4 coord; \\\n\ + int4 coord_in = (int4)(0, 0, 0, 0); \\\n\ + sx = sx + *xOffset; \\\n\ + coord = sx.xyzw; \\\n\ + coord_in.y = sy + *yOffset; \\\n\ + coord_in.x = coord.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.w; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + coord_in.x = coord.x; \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 tt; \\\n\ + vxc_uchar4 val; \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\ + coord_out.yzw += rgb_order.xyz; \\\n\ \\\n\ - if (nv_type == 1) \\\n\ - { \\\n\ - UV.s01234567 = UV.s10325476; \\\n\ - } \\\n\ + vxc_uchar8 line1, line2; \\\n\ \\\n\ - vxc_char16 tmpUV; \\\n\ - short tmpVal = 128; \\\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ \\\n\ - float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ \\\n\ - conv_type result; \\\n\ - dst_type dst0; \\\n\ - save_type dst; \\\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ - _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ - dstPos.z = bOrder; \\\n\ - VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ - 
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ \\\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ - _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ - dstPos.z = 1; \\\n\ - VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + conv_type dst0; \\\n\ + dst_type dst1; \\\n\ + copy_type dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ - _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ - dstPos.z = rOrder; \\\n\ - VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + RESIZE_BILINEAR_4X1(g_scale, gMean, output, coord_out.xz) \\\n\ + RESIZE_BILINEAR_4X1(b_scale, bMean, output, coord_out.xw) \\\n\ }\n\ -NV12_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ -NV12_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ -NV12_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ -NV12_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ -"; /* end of pre_process_nv12_scale_vx*/ - -static const char pre_process_rgb_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniVecShift10;\n\ -_viv_uniform VXC_512Bits uniAddRShift;\n\ -_viv_uniform VXC_512Bits uniGetTempVal;\n\ -_viv_uniform VXC_512Bits uniExtractBytes;\n\ -_viv_uniform VXC_512Bits uniUnpackToR;\n\ -_viv_uniform VXC_512Bits uniUnpackToG;\n\ -_viv_uniform VXC_512Bits uniUnpackToB;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ -_viv_uniform float outputZP;\n\ -_viv_uniform int r_order;\n\ -_viv_uniform int b_order;\n\ -\n\ -#define DESCALE(x) (((x) + (1<<19)) >> 20)\n\ +PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ +PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ \n\ -#define IMAGE_PRE_PROCESS(dst_name, conv_type, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb_scale_U8to##dst_name \\\n\ +#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ ( \\\n\ -__read_only image2d_array_t input, \\\n\ -__write_only image2d_array_t output, \\\n\ - global int *xRatio, \\\n\ - global int *yRatio, \\\n\ - global int *xOffset, \\\n\ - global int *yOffset, \\\n\ - float rMean, \\\n\ - float gMean, \\\n\ - float bMean, \\\n\ - float f32Var, \\\n\ - int reverse_channel, \\\n\ - int trans \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t 
output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ - int4 xPos = get_global_id(0); \\\n\ - int yPos = get_global_id(1); \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ xPos += (int4)(0, 1, 2, 3); \\\n\ \\\n\ - /*x*/ \\\n\ int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ int4 sx = fx0 & 0xffff8000; \\\n\ fx0 -= sx; \\\n\ @@ -30371,137 +33603,485 @@ __write_only image2d_array_t output, \\\n\ \\\n\ vxc_short4 fx; \\\n\ VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ - /*y*/ \\\n\ + \\\n\ int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ int sy = fy & 0xffff8000; \\\n\ \\\n\ fy -= sy; \\\n\ sy = sy >> 15; \\\n\ - \\\n\ fy = (fy + (1<< 4)) >> 5; \\\n\ \\\n\ - vxc_uchar16 line0RGB1, line0RGB2; \\\n\ - vxc_uchar16 line1RGB3, line1RGB4; \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ int4 coord; \\\n\ - sx = (sx + (*xOffset)) * 3; \\\n\ - coord.xyz = sx.xyz; \\\n\ - coord.w = sy + *yOffset; \\\n\ + sx = sx + *xOffset; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ int2 coord1 = (int2)(sx.w, coord.w); \\\n\ - VXC_ReadImage(line0RGB1, input, coord.xw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB1, input, coord.yw, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB2, input, coord.zw, 0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line0RGB2, input, coord1, 0, VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - VXC_ReadImage(line1RGB3, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB3, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB4, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(line1RGB4, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ - VXC_MODIFIER(6, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - float4 bgrMean = (float4)(bMean, gMean, rMean, 0); \\\n\ - \\\n\ - bgrMean *= f32Var; \\\n\ + int4 coord_in = (int4)(coord.xw, 0, 0); \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, 
VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ int4 test01, temp1; \\\n\ int4 test02, temp2; \\\n\ - int4 tt; \\\n\ - vxc_uchar4 val; \\\n\ - int4 coord_out = (int4)(xPos.x, yPos, r_order, 0); \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\ + coord_out.yzw += rgb_order.xyz; \\\n\ \\\n\ - vxc_uchar8 line1, line2; \\\n\ - \\\n\ - /*R*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToR); \\\n\ - \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp1 = temp1 + test01; \\\n\ \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp2 = temp2 + test02; \\\n\ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ \\\n\ vxc_float4 tmp_dst; \\\n\ vxc_uchar4 u8_dst; \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ - /*convert U8 to dst*/ \\\n\ - dst_type dst; \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.zzzz; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - conv_type dst0; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - copy_type result; \\\n\ - _viv_asm(COPY, result, dst, 16); \\\n\ - VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + int4 dst0; \\\n\ + write_type dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ \\\n\ - /*G*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToG); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - coord_out.z = 1; \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + coord_in.x = coord.x; \\\n\ + coord_in.z = 1; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, 
VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp1 = temp1 + test01; \\\n\ \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp2 = temp2 + test02; \\\n\ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ - \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ - uniConvertIntergetoF32_4x4); \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.y; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, result, dst, 16); \\\n\ - VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xz, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - /*B*/ \\\n\ - VXC_DP2x8(line1, line0RGB1, line0RGB2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ - VXC_DP2x8(line2, line1RGB3, line1RGB4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniUnpackToB); \\\n\ + coord_in.x = coord.x; \\\n\ + coord_in.z = 2; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + 
VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - coord_out.z = b_order; \\\n\ - VXC_DP4x4(test01, line1, line1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp1, line1, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp1 = temp1 + test01; \\\n\ \\\n\ - VXC_DP4x4(test02, line2, line2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniVecShift10); \\\n\ - VXC_DP4x4(temp2, line2, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ temp2 = temp2 + test02; \\\n\ temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ - \\\n\ - VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtractBytes); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ \\\n\ - tmp_dst = tmp_dst * f32Var - bgrMean.x; \\\n\ - tmp_dst = tmp_dst * outputScale + outputZP; \\\n\ - _viv_asm(CONV_RTE, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, result, dst, 16); \\\n\ - VXC_WriteImage2DArray(output, coord_out, result, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_0_vx*/ + +static const char pre_process_rgb888_planar_1_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int4 rgb_order;\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, 
\\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2)(*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + dst_type dst0, dst1; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord.x = coord.z + 8; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + copy_type tmp_dst; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.yw = coord_out.ww + rgb_order.xy; \\\n\ + VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + coord_out.w = coord.w + rgb_order.z; \\\n\ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, 
vxc_short8)\n\ +\n\ +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + write_type dst; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.xyw = coord.www + rgb_order.xyz; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + \\\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + \\\n\ + VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb888_planar_1_vx*/ + +static const char pre_process_rgb888_planar_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ 
+_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ +_viv_uniform int4 rgb_order;\n\ +\n\ +__kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float r_scale,\n\ + int reverse,\n\ + int height,\n\ + float g_scale,\n\ + float b_scale\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_out;\n\ +\n\ + vxc_uchar16 src0, src1, src2, src3;\n\ + vxc_uchar16 dst0, dst1, dst2;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + coord_out.xy = (coord_in.xy >> 2) * 3;\n\ + coord_out.zw = coord_in.yy + (int2)(1, 2);\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + int4 coord_r = coord_out;\n\ + coord_r.yzw += rgb_order.xxx;\n\ + VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, 
VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + int4 coord_g = coord_out;\n\ + coord_g.yzw += rgb_order.yyy;\n\ + VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + int4 coord_b = coord_out;\n\ + coord_b.yzw += rgb_order.zzz;\n\ + VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -IMAGE_PRE_PROCESS(U8, uint4, vxc_uchar16, vxc_uchar16)\n\ -IMAGE_PRE_PROCESS(I8, int4, vxc_char16, vxc_char16)\n\ -IMAGE_PRE_PROCESS(I16, int4, vxc_short8, vxc_short8)\n\ -IMAGE_PRE_PROCESS(F16, half4, vxc_half8, vxc_short8)\n\ -"; /* end of pre_process_rgb_vx*/ +\n\ +__kernel void pre_process_rgb888_planar_half_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float r_scale,\n\ + int reverse,\n\ + int height,\n\ + float g_scale,\n\ + float b_scale\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_uchar16 src0, src1, src2;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + 
VXC_ReadImage2DArray(src1, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int2 coord = coord_in.xy >> 1;\n\ +\n\ + int4 coord_rgb = coord.xyyy;\n\ + coord_rgb.yzw += rgb_order.xyz;\n\ + VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_rgb.xz, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_rgb888_planar_2_vx*/ -static const char pre_process_rgb888_planar_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_nhwc_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniVecShift10;\n\ _viv_uniform VXC_512Bits uniAddRShift;\n\ @@ -30510,11 +34090,15 @@ _viv_uniform VXC_512Bits uniExtractBytes;\n\ \n\ _viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ \n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -#define RESIZE_BILINEAR_4X1(mean, output) \\\n\ +#define RESIZE_BILINEAR_4X1(scale, mean) \\\n\ VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ @@ -30552,21 +34136,13 @@ _viv_uniform float output_zp;\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ - tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \\\n\ - _viv_asm(CONV, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst1, 8); \\\n\ - VXC_WriteImage(output, coord_out, dst, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst);\n\ #define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -30574,7 +34150,10 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ @@ -30636,7 +34215,9 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ int4 test02, temp2; \\\n\ int4 tt; \\\n\ vxc_uchar4 val; \\\n\ - int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, 
yPos); \\\n\ + coord_out.x = coord_out.x * 3; \\\n\ + coord_out.z = coord_out.x + 8; \\\n\ \\\n\ vxc_uchar8 line1, line2; \\\n\ \\\n\ @@ -30659,29 +34240,36 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ conv_type dst0; \\\n\ - dst_type dst1; \\\n\ - copy_type dst; \\\n\ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + dst_type dst1, dst2; \\\n\ + copy_type data0, data1, dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ _viv_asm(CONV, dst0, tmp_dst); \\\n\ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst1, 8); \\\n\ - VXC_WriteImage(output0, coord_out, dst, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - RESIZE_BILINEAR_4X1(gMean, output1) \\\n\ - RESIZE_BILINEAR_4X1(bMean, output2) \\\n\ + RESIZE_BILINEAR_4X1(g_scale, gMean) \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + RESIZE_BILINEAR_4X1(b_scale, bMean) \\\n\ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, data0, dst1, 16); \\\n\ + _viv_asm(COPY, data1, dst2, 16); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_0_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ \n\ #define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -30689,7 +34277,10 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ @@ -30745,6 +34336,7 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ int4 test01, temp1; \\\n\ int4 test02, temp2; \\\n\ int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + coord_out.x = coord_out.x * 3; \\\n\ \\\n\ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniVecShift10); \\\n\ @@ -30767,13 +34359,11 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ int4 dst0; \\\n\ - write_type dst; \\\n\ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + write_type dst1, dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ - 
VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - \\\n\ - VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_in.x = coord.x; \\\n\ coord_in.z = 1; \\\n\ @@ -30813,12 +34403,10 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ - tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \\\n\ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - \\\n\ - VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_in.x = coord.x; \\\n\ coord_in.z = 2; \\\n\ @@ -30858,32 +34446,591 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ - tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \\\n\ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_0_2x8); \\\n\ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_nhwc_0_vx*/ + +static const char pre_process_rgb888_planar_nhwc_1_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2)(*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + 
dst_type dst0, dst1; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.z = coord_out.z * 3; \\\n\ + coord_out.x = coord_out.z + 8; \\\n\ + float4 paramData0 = (float4)(rMean * output_scale * r_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + copy_type data0, data1, data2, dst; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + float4 paramData1 = (float4)(gMean * output_scale * g_scale - output_zp,\\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + _viv_asm(COPY, data0, dst0, 16); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * output_scale * b_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + _viv_asm(COPY, data1, dst1, 16); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_0_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ +\n\ +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name##_nhwc \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + write_type dst0, dst1, dst2, dst3; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; 
\\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.z = coord_out.z * 3; \\\n\ + coord_out.x = coord_out.z + 16; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + \\\n\ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_0_2x8); \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_1_2x8); \\\n\ + VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_2_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb888_planar_nhwc_1_vx*/ + +static const char pre_process_rgb888_planar_nhwc_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\ +\n\ +__kernel void pre_process_rgb888_planar_half_U8toU8_nhwc\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float r_scale,\n\ + int reverse,\n\ + float g_scale,\n\ + float b_scale\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_uchar16 src0, src1, src2;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int4 coord;\n\ + coord.xy = coord_in.xy >> 1;\n\ +\n\ + coord.x = coord.x * 3;\n\ + coord.z = coord.x + 16;\n\ +\n\ + vxc_uchar16 dst0, dst1;\n\ + src0.lo = src0.s02468ace;\n\ + src0.hi = src1.s02468ace;\n\ + 
src1.lo = src2.s02468ace;\n\ +\n\ + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_0_2x8);\n\ + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_1_2x8);\n\ + VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_2_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_rgb888_planar_nhwc_2_vx*/ + +static const char pre_process_rgb888_planar_sep_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int4 rgb_order;\n\ +\n\ +#define RESIZE_BILINEAR_4X1(input, scale, mean, output, _coord) \\\n\ + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * scale * output_scale - scale * mean * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, _coord, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __read_only 
image2d_array_t input2, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAddRShift); \\\n\ + \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ + int4 coord; \\\n\ + sx = sx + *xOffset; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ + int2 coord1 = (int2)(sx.w, coord.w); \\\n\ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 tt; \\\n\ + vxc_uchar4 val; \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\ + coord_out.yzw += rgb_order.xyz; \\\n\ + \\\n\ + vxc_uchar8 line1, line2; \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + conv_type dst0; \\\n\ + dst_type dst1; \\\n\ + copy_type dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + RESIZE_BILINEAR_4X1(input1, g_scale, gMean, output, coord_out.xz) \\\n\ + RESIZE_BILINEAR_4X1(input2, b_scale, bMean, output, coord_out.xw) \\\n\ +}\n\ +RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ +RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ +\n\ +#define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __read_only image2d_array_t input2, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ + \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ + int4 coord; \\\n\ + sx = sx + *xOffset; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ + int2 coord1 = (int2)(sx.w, coord.w); \\\n\ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\ + coord_out.yzw += rgb_order.xyz; \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + 
temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + int4 dst0; \\\n\ + write_type dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord_out.xz, \\\n\ + dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + 
VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ \\\n\ - VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, \\\n\ + dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\ -PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_0_vx*/ +RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16)\n\ +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb888_planar_sep_0_vx*/ -static const char pre_process_rgb888_planar_1_vx[] = "\n\ -#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ \n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ +_viv_uniform int4 rgb_order;\n\ \n\ -#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ +#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ ( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __read_only image2d_array_t input2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -30891,7 +35038,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ @@ -30900,16 +35051,14 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ vxc_uchar16 src0, src1, src2; \\\n\ dst_type dst0, dst1; \\\n\ \\\n\ - int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ - 
VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_in.z ++; \\\n\ - VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_in.z ++; \\\n\ - VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord.x = coord.z + 8; \\\n\ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\ - rMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ \\\n\ half4 paramData_f16; \\\n\ copy_type tmp_dst; \\\n\ @@ -30919,44 +35068,49 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.yw = coord_out.ww + rgb_order.xy; \\\n\ + VXC_WriteImage(output, coord_out.zy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xy, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ - gMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData1); \\\n\ VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ - bMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData2); \\\n\ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ 
uniDataMeanStddevLo_2x8); \\\n\ VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.w = coord.w + rgb_order.z; \\\n\ + VXC_WriteImage(output, coord_out.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ -PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ +RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ +RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ \n\ #define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ ( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __read_only image2d_array_t input2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -30964,7 +35118,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + int height, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ @@ -30973,15 +35131,15 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ vxc_uchar16 src0, src1, src2; \\\n\ write_type dst; \\\n\ \\\n\ - int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ - VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_in.z ++; \\\n\ - VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_in.z ++; \\\n\ - VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\ - rMean * output_scale - output_zp, output_scale); \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.xyw += rgb_order.xyz; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ \\\n\ half4 paramData_f16; \\\n\ _viv_asm(CONV, paramData_f16, paramData0); \\\n\ @@ -30990,46 +35148,49 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ uniDataMeanStddevLo_2x8); \\\n\ VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, 
VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.zx, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ - gMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData1); \\\n\ \\\n\ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ - bMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData2); \\\n\ \\\n\ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ -"; /* end of pre_process_rgb888_planar_1_vx*/ +"; /* end of pre_process_rgb888_planar_sep_1_vx*/ -static const char pre_process_rgb888_planar_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ +_viv_uniform int4 rgb_order;\n\ \n\ -__kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ +__kernel void pre_process_rgb888_planar_sep_4over3_U8toU8\n\ (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output0,\n\ - __write_only image2d_array_t output1,\n\ - __write_only image2d_array_t output2,\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output,\n\ global int *xRatio,\n\ global int *yRatio,\n\ global int *xOffset,\n\ @@ -31037,24 +35198,24 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ float rMean,\n\ float gMean,\n\ float bMean,\n\ - float f32Var\n\ + float r_scale,\n\ + int reverse,\n\ + int height,\n\ + float g_scale,\n\ + float b_scale\n\ )\n\ {\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 
coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ int4 coord_out;\n\ \n\ vxc_uchar16 src0, src1, src2, src3;\n\ vxc_uchar16 dst0, dst1, dst2;\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ + VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ coord_out.xy = (coord_in.xy >> 2) * 3;\n\ coord_out.zw = coord_in.yy + (int2)(1, 2);\n\ \n\ @@ -31067,19 +35228,16 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ \n\ - VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord_r = coord_out;\n\ + coord_r.yzw += rgb_order.xxx;\n\ + VXC_WriteImage(output, coord_r.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_r.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_r.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ + VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ @@ -31090,18 +35248,16 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, 
VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ \n\ - VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord_g = coord_out;\n\ + coord_g.yzw += rgb_order.yyy;\n\ + VXC_WriteImage(output, coord_g.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_g.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_g.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ @@ -31112,17 +35268,19 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ \n\ - VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord_b = coord_out;\n\ + coord_b.yzw += rgb_order.zzz;\n\ + VXC_WriteImage(output, coord_b.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_b.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_b.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void pre_process_rgb888_planar_half_U8toU8\n\ +__kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\ (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output0,\n\ - __write_only image2d_array_t output1,\n\ - __write_only image2d_array_t output2,\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __read_only image2d_array_t input2,\n\ + __write_only image2d_array_t output,\n\ global int *xRatio,\n\ global int *yRatio,\n\ global int *xOffset,\n\ @@ -31130,31 +35288,32 @@ __kernel void pre_process_rgb888_planar_half_U8toU8\n\ float 
rMean,\n\ float gMean,\n\ float bMean,\n\ - float f32Var\n\ + float r_scale,\n\ + int reverse,\n\ + int height,\n\ + float g_scale,\n\ + float b_scale\n\ )\n\ {\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ \n\ vxc_uchar16 src0, src1, src2;\n\ \n\ - VXC_ReadImage2DArray(src0, input, coord_in, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ - VXC_ReadImage2DArray(src1, input, coord_in, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.z ++;\n\ - VXC_ReadImage2DArray(src2, input, coord_in, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ - int2 coord = coord_in.xy >> 1;\n\ + coord_in.zw = coord_in.xy >> 1;\n\ \n\ - VXC_WriteImage(output0, coord, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord_rgb = coord_in.zwww;\n\ + coord_rgb.yzw += rgb_order.xyz;\n\ + VXC_WriteImage(output, coord_rgb.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_rgb.xz, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_rgb.xw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of pre_process_rgb888_planar_2_vx*/ +"; /* end of pre_process_rgb888_planar_sep_2_vx*/ -static const char pre_process_rgb888_planar_sep_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_nhwc_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniVecShift10;\n\ _viv_uniform VXC_512Bits uniAddRShift;\n\ @@ -31163,11 +35322,15 @@ _viv_uniform VXC_512Bits uniExtractBytes;\n\ \n\ _viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ \n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -#define RESIZE_BILINEAR_4X1(input, mean, output) \\\n\ +#define RESIZE_BILINEAR_4X1(input, scale, mean) \\\n\ VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -31197,23 +35360,16 @@ _viv_uniform float output_zp;\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ - tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \\\n\ - _viv_asm(CONV, dst0, tmp_dst); \\\n\ - VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst1, 8); \\\n\ - VXC_WriteImage(output, coord_out, dst, \\\n\ - 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + tmp_dst = tmp_dst * scale * output_scale - mean * scale * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst);\n\ \n\ #define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __read_only image2d_array_t input2, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -31221,7 +35377,10 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ @@ -31274,7 +35433,9 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ int4 test02, temp2; \\\n\ int4 tt; \\\n\ vxc_uchar4 val; \\\n\ - int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + int4 coord_out = (int4)(xPos.x, yPos, yPos, yPos); \\\n\ + coord_out.x = coord_out.x * 3; \\\n\ + coord_out.z = coord_out.x + 8; \\\n\ \\\n\ vxc_uchar8 line1, line2; \\\n\ \\\n\ @@ -31297,31 +35458,38 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ conv_type dst0; \\\n\ - dst_type dst1; \\\n\ - copy_type dst; \\\n\ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + dst_type dst1, dst2; \\\n\ + copy_type data0, data1, dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ _viv_asm(CONV, dst0, tmp_dst); \\\n\ VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - _viv_asm(COPY, dst, dst1, 8); \\\n\ - VXC_WriteImage(output0, coord_out, dst, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + RESIZE_BILINEAR_4X1(input1, g_scale, gMean) \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ \\\n\ - RESIZE_BILINEAR_4X1(input1, gMean, output1) \\\n\ - RESIZE_BILINEAR_4X1(input2, bMean, output2) \\\n\ + RESIZE_BILINEAR_4X1(input2, b_scale, bMean) \\\n\ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, data0, dst1, 16); \\\n\ + _viv_asm(COPY, data1, dst2, 16); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_0_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ \n\ #define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ +__kernel void 
pre_process_rgb888_planar_sep_scale_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __read_only image2d_array_t input2, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -31329,7 +35497,10 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ @@ -31378,6 +35549,7 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ int4 test01, temp1; \\\n\ int4 test02, temp2; \\\n\ int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + coord_out.x = coord_out.x * 3; \\\n\ \\\n\ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniVecShift10); \\\n\ @@ -31400,13 +35572,11 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ uniConvertIntergetoF32_4x4); \\\n\ \\\n\ int4 dst0; \\\n\ - write_type dst; \\\n\ - tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + write_type dst1, dst; \\\n\ + tmp_dst = tmp_dst * r_scale * output_scale - rMean * r_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - \\\n\ - VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -31438,12 +35608,10 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ - tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \\\n\ + tmp_dst = tmp_dst * g_scale * output_scale - gMean * g_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - \\\n\ - VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -31475,33 +35643,39 @@ __kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ uniExtractBytes); \\\n\ VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertIntergetoF32_4x4); \\\n\ - tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \\\n\ + tmp_dst = tmp_dst * b_scale * output_scale - bMean * b_scale * output_scale + output_zp; \\\n\ dst0 = convert_int4_rte(tmp_dst); \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, 
dst0, dst0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniExtract8Data_2x8); \\\n\ - \\\n\ - VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_0_2x8); \\\n\ + VXC_DP2x8(dst, dst1, dst1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xy, dst, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16)\n\ -RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_sep_0_vx*/ +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb888_planar_sep_nhwc_0_vx*/ -static const char pre_process_rgb888_planar_sep_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_nhwc_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ -_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ \n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni16BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\ \n\ #define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __read_only image2d_array_t input2, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ - __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -31509,7 +35683,10 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ @@ -31522,58 +35699,50 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - coord.x = coord.z + 8; \\\n\ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\ - rMean * output_scale - output_zp, output_scale); \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.z = coord_out.z * 3; \\\n\ + coord_out.x = coord_out.z + 8; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ \\\n\ half4 paramData_f16; \\\n\ - copy_type tmp_dst; \\\n\ + copy_type data0, data1, data2, dst; \\\n\ _viv_asm(CONV, paramData_f16, paramData0); \\\n\ - VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ - gMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData1); \\\n\ - VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data0, dst0, 16); \\\n\ \\\n\ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ - bMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData2); \\\n\ VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ - VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ - VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data1, dst0, 16); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_0_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni16BitsDataInterleave_1_2x8); \\\n\ + VXC_WriteImage(output, coord_out.xw, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ \n\ #define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name##_nhwc \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __read_only image2d_array_t input2, \\\n\ - __write_only image2d_array_t output0, \\\n\ - __write_only image2d_array_t output1, \\\n\ 
- __write_only image2d_array_t output2, \\\n\ + __write_only image2d_array_t output, \\\n\ global int *xRatio, \\\n\ global int *yRatio, \\\n\ global int *xOffset, \\\n\ @@ -31581,153 +35750,75 @@ __kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var \\\n\ + float r_scale, \\\n\ + int reverse, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ coord.xy += (int2) (*xOffset, *yOffset); \\\n\ vxc_uchar16 src0, src1, src2; \\\n\ - write_type dst; \\\n\ + write_type dst0, dst1, dst2, dst3; \\\n\ \\\n\ VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\ - rMean * output_scale - output_zp, output_scale); \\\n\ + int4 coord_out = coord; \\\n\ + coord_out.z = coord_out.z * 3; \\\n\ + coord_out.x = coord_out.z + 16; \\\n\ + float4 paramData0 = (float4)(rMean * r_scale * output_scale - output_zp,\\\n\ + rMean * r_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, r_scale * output_scale); \\\n\ \\\n\ half4 paramData_f16; \\\n\ _viv_asm(CONV, paramData_f16, paramData0); \\\n\ \\\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ - gMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData1 = (float4)(gMean * g_scale * output_scale - output_zp,\\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, g_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData1); \\\n\ \\\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ - bMean * output_scale - output_zp, output_scale); \\\n\ + float4 paramData2 = (float4)(bMean * b_scale * output_scale - output_zp,\\\n\ + bMean * b_scale * output_scale - output_zp, \\\n\ + bMean * b_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ _viv_asm(CONV, paramData_f16, paramData2); \\\n\ \\\n\ - VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniDataMeanStddevLo_2x8); \\\n\ - VXC_DP2x8(dst, src2, 
paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniDataMeanStddevHi_2x8); \\\n\ - VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_0_2x8); \\\n\ + VXC_DP2x8(dst2, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_1_2x8); \\\n\ + VXC_DP2x8(dst3, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uni8BitsDataInterleave_2_2x8); \\\n\ + VXC_WriteImage(output, coord_out.zw, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord_out.xw, dst3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ -"; /* end of pre_process_rgb888_planar_sep_1_vx*/ +"; /* end of pre_process_rgb888_planar_sep_nhwc_1_vx*/ -static const char pre_process_rgb888_planar_sep_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_nhwc_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ -_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ -_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_0_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_1_2x8;\n\ +_viv_uniform VXC_512Bits uni8BitsDataInterleave_2_2x8;\n\ \n\ -__kernel void pre_process_rgb888_planar_sep_4over3_U8toU8\n\ +__kernel void pre_process_rgb888_planar_sep_half_U8toU8_nhwc\n\ (\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __read_only image2d_array_t input2,\n\ - __write_only image2d_array_t output0,\n\ - __write_only image2d_array_t output1,\n\ - __write_only image2d_array_t output2,\n\ - global int *xRatio,\n\ - global int *yRatio,\n\ - global int *xOffset,\n\ - global int *yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float f32Var\n\ - )\n\ -{\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ - int4 coord_out;\n\ -\n\ - vxc_uchar16 src0, src1, src2, src3;\n\ - vxc_uchar16 dst0, dst1, dst2;\n\ -\n\ - VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - coord_out.xy = (coord_in.xy >> 2) * 3;\n\ - coord_out.zw = coord_in.yy + (int2)(1, 2);\n\ -\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, 
VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ -\n\ - VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ -\n\ - VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ - VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ - VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, 
VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ -\n\ - VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\ - (\n\ - __read_only image2d_array_t input0,\n\ - __read_only image2d_array_t input1,\n\ - __read_only image2d_array_t input2,\n\ - __write_only image2d_array_t output0,\n\ - __write_only image2d_array_t output1,\n\ - __write_only image2d_array_t output2,\n\ + __write_only image2d_array_t output,\n\ global int *xRatio,\n\ global int *yRatio,\n\ global int *xOffset,\n\ @@ -31735,7 +35826,10 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\ float rMean,\n\ float gMean,\n\ float bMean,\n\ - float f32Var\n\ + float r_scale,\n\ + int reverse,\n\ + float g_scale,\n\ + float b_scale\n\ )\n\ {\n\ int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ @@ -31746,13 +35840,28 @@ __kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\ VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ - coord_in.zw = coord_in.xy >> 1;\n\ + int4 coord;\n\ + coord.xy = coord_in.xy >> 1;\n\ +\n\ + coord.x = coord.x * 3;\n\ + coord.z = coord.x + 16;\n\ +\n\ + vxc_uchar16 dst0, dst1;\n\ + src0.lo = src0.s02468ace;\n\ + src0.hi = src1.s02468ace;\n\ + src1.lo = src2.s02468ace;\n\ \n\ - VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_0_2x8);\n\ + VXC_DP2x8(dst0, src0, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_1_2x8);\n\ + VXC_DP2x8(dst1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uni8BitsDataInterleave_2_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of pre_process_rgb888_planar_sep_2_vx*/ +"; /* end of pre_process_rgb888_planar_sep_nhwc_2_vx*/ static const char pre_process_rgb_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -31773,6 +35882,8 @@ _viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4;\n\ _viv_uniform VXC_512Bits uniExtractBtoF32_part2_4x4;\n\ _viv_uniform VXC_512Bits uniExtractBtoF32_part3_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float4 param_data;\n\ +_viv_uniform float4 rgb_scale;\n\ \n\ #define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \\\n\ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ @@ -31786,9 +35897,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0) * 3, 
get_global_id(1)); \\\n\ @@ -31802,10 +35915,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - f32Var *= outputScale; \\\n\ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ - bMean * f32Var - outputZP, f32Var); \\\n\ \\\n\ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \\\n\ float4 tmp0, tmp1; \\\n\ @@ -31813,8 +35922,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \\\n\ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \\\n\ _viv_asm(CONV_RTE, result0, tmp0); \\\n\ _viv_asm(CONV_RTE, result1, tmp1); \\\n\ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31824,8 +35933,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ coord_out.z = 1; \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.y; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \\\n\ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \\\n\ _viv_asm(CONV_RTE, result0, tmp0); \\\n\ _viv_asm(CONV_RTE, result1, tmp1); \\\n\ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31835,8 +35944,8 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ coord_out.z = b_order; \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \\\n\ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \\\n\ _viv_asm(CONV_RTE, result0, tmp0); \\\n\ _viv_asm(CONV_RTE, result1, tmp1); \\\n\ VXC_DP2x8(dst0, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31858,9 +35967,11 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float f32Var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \\\n\ @@ -31875,10 +35986,6 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ coord.x += 16; \\\n\ VXC_ReadImage(src2, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ - f32Var *= outputScale; \\\n\ - float4 paramData = (float4)(rMean * f32Var - outputZP, gMean * f32Var - outputZP, \\\n\ - bMean * f32Var - outputZP, f32Var); \\\n\ \\\n\ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), r_order, 0); \\\n\ float4 tmp0, tmp1; 
\\\n\ @@ -31886,15 +35993,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \\\n\ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part2_4x4); \\\n\ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part3_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ + tmp0 = tmp0 * rgb_scale.x - param_data.x; \\\n\ + tmp1 = tmp1 * rgb_scale.x - param_data.x; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31903,15 +36010,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ coord_out.z = 1; \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.y; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \\\n\ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part2_4x4); \\\n\ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part3_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.y; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ + tmp0 = tmp0 * rgb_scale.y - param_data.y; \\\n\ + tmp1 = tmp1 * rgb_scale.y - param_data.y; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31920,15 +36027,15 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ coord_out.z = b_order; \\\n\ VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\ - tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \\\n\ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part2_4x4); \\\n\ VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part3_4x4); \\\n\ - tmp0 = 
tmp0 * paramData.w - paramData.z; \\\n\ - tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ + tmp0 = tmp0 * rgb_scale.z - param_data.z; \\\n\ + tmp1 = tmp1 * rgb_scale.z - param_data.z; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -31989,9 +36096,11 @@ __kernel void pre_process_yuv420_copy_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \\\n\ @@ -32050,17 +36159,23 @@ __kernel void pre_process_yuv420_copy_##name \\\n\ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ \\\n\ - var *= output_scale; \\\n\ - float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \\\n\ - rMean * var - output_zp, var); \\\n\ + float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp,\\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ half4 paramData_f16; \\\n\ _viv_asm(CONV, paramData_f16, paramData); \\\n\ \\\n\ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \\\n\ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \\\n\ + \\\n\ + paramData.w = g_scale * output_scale; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ \\\n\ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \\\n\ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \\\n\ + \\\n\ + paramData.w = r_scale * output_scale; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ \\\n\ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \\\n\ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \\\n\ @@ -32090,9 +36205,11 @@ __kernel void pre_process_yuv420_copy_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \\\n\ @@ -32142,18 +36259,22 @@ __kernel void pre_process_yuv420_copy_##name \\\n\ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ \\\n\ - var *= output_scale; \\\n\ - float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \\\n\ - rMean * var - output_zp, var); \\\n\ + float4 paramData = (float4)(bMean * b_scale * output_scale - output_zp, \\\n\ + gMean * g_scale * output_scale - output_zp, \\\n\ + rMean * r_scale * output_scale - output_zp, b_scale * output_scale); \\\n\ half4 paramData_f16; \\\n\ _viv_asm(CONV, paramData_f16, paramData); \\\n\ \\\n\ VXC_DP2x8(dst0, B, paramData_f16, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \\\n\ VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \\\n\ \\\n\ + paramData.w = g_scale * output_scale; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \\\n\ VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \\\n\ \\\n\ + paramData.w = r_scale * output_scale; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \\\n\ VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \\\n\ \\\n\ @@ -32228,9 +36349,11 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 gidx = get_global_id(0); \\\n\ @@ -32379,7 +36502,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ float4 tmpDst; \\\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - bMean) * var; \\\n\ + tmpDst = (tmpDst - bMean) * b_scale; \\\n\ dstPos.z = bOrder; \\\n\ result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -32393,7 +36516,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - gMean) * var; \\\n\ + tmpDst = (tmpDst - gMean) * g_scale; \\\n\ dstPos.z = 1; \\\n\ result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -32407,7 +36530,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - rMean) * var; \\\n\ + tmpDst = (tmpDst - rMean) * r_scale; \\\n\ dstPos.z = rOrder; \\\n\ result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ @@ -32467,9 +36590,11 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 gidx = get_global_id(0); \\\n\ @@ -32620,7 +36745,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ float4 tmpDst; \\\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - bMean) * var; \\\n\ + tmpDst = (tmpDst - bMean) * b_scale; \\\n\ dstPos.z = bOrder; \\\n\ tmpDst = tmpDst * 
output_scale + output_zp; \\\n\ _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ @@ -32636,7 +36761,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - gMean) * var; \\\n\ + tmpDst = (tmpDst - gMean) * g_scale; \\\n\ dstPos.z = 1; \\\n\ tmpDst = tmpDst * output_scale + output_zp; \\\n\ _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ @@ -32652,7 +36777,7 @@ __kernel void pre_process_yuv420_scale_##name \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - rMean) * var; \\\n\ + tmpDst = (tmpDst - rMean) * r_scale; \\\n\ dstPos.z = rOrder; \\\n\ tmpDst = tmpDst * output_scale + output_zp; \\\n\ _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ @@ -32669,7 +36794,9 @@ static const char pre_process_yuv422_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n _viv_uniform int bOrder;\n\ _viv_uniform int rOrder;\n\ \n\ -_viv_uniform float outputScaleVar;\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ _viv_uniform float bMeanScaleVarZp;\n\ _viv_uniform float gMeanScaleVarZp;\n\ _viv_uniform float rMeanScaleVarZp;\n\ @@ -32693,10 +36820,12 @@ __kernel void pre_process_yuv422_copy_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ int trans, \\\n\ - int yuv422_type \\\n\ + int yuv422_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -32726,21 +36855,21 @@ __kernel void pre_process_yuv422_copy_##name \\\n\ dst_type dst0; \\\n\ save_type dst; \\\n\ int4 dstPos = (int4)(gidx, gidy, 0, 0); \\\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ dstPos.z = bOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ dstPos.z = 1; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ dstPos.z = rOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ @@ -32758,7 +36887,10 @@ static const char pre_process_yuv422_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\ _viv_uniform int bOrder;\n\ _viv_uniform int rOrder;\n\ \n\ -_viv_uniform float outputScaleVar;\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ +\n\ _viv_uniform float bMeanScaleVarZp;\n\ _viv_uniform 
float gMeanScaleVarZp;\n\ _viv_uniform float rMeanScaleVarZp;\n\ @@ -32788,10 +36920,12 @@ __kernel void pre_process_yuv422_scale_##name \\\n\ float rMean, \\\n\ float gMean, \\\n\ float bMean, \\\n\ - float var, \\\n\ + float r_scale, \\\n\ int reverse_channel, \\\n\ int trans, \\\n\ - int yuv422_type \\\n\ + int yuv422_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ ) \\\n\ { \\\n\ int4 gidx = get_global_id(0); \\\n\ @@ -32863,21 +36997,21 @@ __kernel void pre_process_yuv422_scale_##name \\\n\ dst_type dst0; \\\n\ save_type dst; \\\n\ int4 dstPos = (int4)(gidx.x, gidy, 0, 0); \\\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ dstPos.z = bOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ dstPos.z = 1; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ dstPos.z = rOrder; \\\n\ VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ @@ -32939,9 +37073,11 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\ float rMean,\n\ float gMean,\n\ float bMean,\n\ - float var,\n\ + float r_scale,\n\ int reverse_channel,\n\ - int trans\n\ + int trans,\n\ + float g_scale,\n\ + float b_scale\n\ )\n\ {\n\ int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));\n\ @@ -33000,17 +37136,22 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ \n\ - var *= outputScale;\n\ - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ - rMean * var - zp, var);\n\ + float4 paramData = (float4)(bMean * b_scale * outputScale - zp, gMean * g_scale * outputScale - zp,\\\n\ + rMean * r_scale * outputScale - zp, b_scale * outputScale);\n\ half4 paramData_f16;\n\ _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + paramData.w = g_scale * outputScale;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + paramData.w = r_scale * outputScale;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ VXC_DP2x8(dst2, R, 
paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ @@ -33035,9 +37176,11 @@ __kernel void pre_process_yuv444_copy_U8toF16(\n\ float rMean,\n\ float gMean,\n\ float bMean,\n\ - float var,\n\ + float r_scale,\n\ int reverse_channel,\n\ - int trans\n\ + int trans,\n\ + float g_scale,\n\ + float b_scale\n\ )\n\ {\n\ int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));\n\ @@ -33097,16 +37240,22 @@ __kernel void pre_process_yuv444_copy_U8toF16(\n\ VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ \n\ - float4 paramData = (float4)(bMean * var, gMean * var,\\\n\ - rMean * var, var);\n\ + float4 paramData = (float4)(bMean * b_scale * outputScale, gMean * g_scale * outputScale,\\\n\ + rMean * r_scale * outputScale, b_scale * outputScale);\n\ half4 paramData_f16;\n\ _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + paramData.w = g_scale * outputScale;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + paramData.w = r_scale * outputScale;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ \n\ VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ @@ -33171,7 +37320,8 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\ __read_only image2d_t y_img, __read_only image2d_t u_img, \\\n\ __read_only image2d_t v_img, __write_only image2d_array_t output, \\\n\ global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, \\\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) \\\n\ + float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans, \\\n\ + float g_scale, float b_scale) \\\n\ { \\\n\ int4 gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ @@ -33283,7 +37433,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\ float4 tmpDst; \\\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - bMean) * var; \\\n\ + tmpDst = (tmpDst - bMean) * b_scale; \\\n\ dstPos.z = bOrder; \\\n\ result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ @@ -33297,7 +37447,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - gMean) * var; \\\n\ + tmpDst = (tmpDst - gMean) * g_scale; \\\n\ dstPos.z = 1; \\\n\ result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ @@ -33311,7 +37461,7 @@ __kernel void pre_process_yuv444_scale_U8to##dst_name( \\\n\ temp2 = fx * tmpData0 + tmpData1; \\\n\ result = fy * temp2 + (temp1 << 10); \\\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ - tmpDst = (tmpDst - rMean) * var; \\\n\ + tmpDst = (tmpDst - rMean) * r_scale; \\\n\ dstPos.z = rOrder; \\\n\ result = convert_int4_rte(tmpDst * outputScale + zp); \\\n\ VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); \\\n\ @@ -33360,7 +37510,8 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\ __read_only image2d_t y_img, __read_only image2d_t u_img,\n\ __read_only image2d_t v_img, __write_only image2d_array_t output,\n\ global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ + float rMean, float gMean, float bMean, float r_scale, int reverse_channel, int trans,\n\ + float g_scale, float b_scale)\n\ {\n\ int4 gidx = get_global_id(0);\n\ int gidy = get_global_id(1);\n\ @@ -33480,7 +37631,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\ float4 tmpDst;\n\ int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ + tmpDst = (tmpDst - bMean) * b_scale;\n\ dstPos.z = bOrder;\n\ _viv_asm(CONV, hDst, tmpDst);\n\ VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ @@ -33495,7 +37646,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\ temp2 = fx * tmpData0 + tmpData1;\n\ result = fy * temp2 + (temp1 << 10);\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ + tmpDst = (tmpDst - gMean) * g_scale;\n\ dstPos.z = 1;\n\ _viv_asm(CONV, hDst, tmpDst);\n\ VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ @@ -33510,7 +37661,7 @@ __kernel void pre_process_yuv444_scale_U8toF16(\n\ temp2 = fx * tmpData0 + tmpData1;\n\ result = fy * temp2 + (temp1 << 10);\n\ VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ + tmpDst = (tmpDst - rMean) * r_scale;\n\ dstPos.z = rOrder;\n\ _viv_asm(CONV, hDst, tmpDst);\n\ VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ @@ -37154,7 +41305,6 @@ static const char resize_1d_bilinear_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ _viv_uniform VXC_512Bits uniFp16toFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ _viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ _viv_uniform float scale_x;\n\ _viv_uniform int out_height;\n\ @@ -37215,8 +41365,10 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN\n\ \n\ _viv_asm(COPY, src_half, src, 16);\n\ \n\ - VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);\n\ - VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);\n\ + VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertFp2FP32_left_4x4);\n\ + VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0),\n\ + uniConvertFp2FP32_right_4x4);\n\ right4 -= left4;\n\ float4 dst4 = right4 * x_lerp + left4;\n\ \n\ @@ -37281,8 +41433,10 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN\n\ \n\ _viv_asm(COPY, src_half, src, 16);\n\ \n\ - VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_left_4x4);\n\ - VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFp2FP32_right_4x4);\n\ + VXC_DP4x4(left4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertFp2FP32_left_4x4);\n\ + VXC_DP4x4(right4, src_half, src_half, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertFp2FP32_right_4x4);\n\ right4 -= left4;\n\ float4 dst4 = right4 * x_lerp + left4;\n\ \n\ @@ -41782,6 +45936,580 @@ __kernel void scatter_nd_update_F16F16toU8_big(\n\ }\n\ "; /* end of scatter_nd_update_big_vx*/ +static const char scatter_nd_update_fp_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int update_width;\n\ +_viv_uniform int output_width;\n\ +_viv_uniform int ref_stride;\n\ +_viv_uniform int output_stride;\n\ +\n\ +_viv_uniform int4 coord_stride;\n\ +_viv_uniform int4 coord_stride1;\n\ +_viv_uniform float inout_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertFp16ToFp32_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +inline void AtomicAdd_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = prevVal.floatVal + operand;\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_update_F16(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + image2d_t temp_buf_float,\n\ + image2d_t link_buffer0,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(update, 2);\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global short* update_ptr = (__global short*)img2.ptr;\n\ + __global float* output_ptr = (__global float*)img3.ptr;\n\ + half src;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + short tmpData = update_ptr[gidy * update_width + gidx];\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + int loc = idx * output_width + gidx;\n\ + _viv_asm(COPY, src, tmpData, 4);\n\ + float data;\n\ + _viv_asm(CONV, data, src);\n\ + AtomicAdd_float(output_ptr + loc, data);\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_update_F16_4X(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + image2d_t temp_buf_float,\n\ + image2d_t link_buffer0,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(update, 2);\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr;\n\ + __global float* output_ptr = (__global float*)img3.ptr;\n\ + vxc_half4 src;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + vxc_short4 tmpData = update_ptr[gidy * update_width + gidx];\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3);\n\ +\n\ + _viv_asm(COPY, src, tmpData, 8);\n\ + float4 data;\n\ + VXC_DP4x4(data, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertFp16ToFp32_4x4);\n\ + AtomicAdd_float(output_ptr + loc.x, data.x);\n\ + AtomicAdd_float(output_ptr + loc.y, data.y);\n\ + AtomicAdd_float(output_ptr + loc.z, data.z);\n\ + AtomicAdd_float(output_ptr + loc.w, data.w);\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_update_BF16(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + image2d_t temp_buf_float,\n\ + image2d_t link_buffer0,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(update, 2);\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global short* update_ptr = (__global short*)img2.ptr;\n\ + __global float* output_ptr = (__global float*)img3.ptr;\n\ + float data;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + short tmpData = update_ptr[gidy * update_width + gidx];\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 src0, src1;\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + int loc = idx * output_width + gidx;\n\ + _viv_asm(COPY, src0, tmpData, 4);\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data, src1, 4);\n\ + AtomicAdd_float(output_ptr + loc, data);\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_update_BF16_4X(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + image2d_t temp_buf_float,\n\ + image2d_t link_buffer0,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(update, 2);\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global vxc_short4* update_ptr = (__global vxc_short4*)img2.ptr;\n\ + __global float* output_ptr = (__global float*)img3.ptr;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + vxc_short4 tmpData = update_ptr[gidy * update_width + gidx];\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_short8 src0, src1;\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3);\n\ +\n\ + _viv_asm(COPY, src0, tmpData, 8);\n\ + float4 data;\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data, src1, 16);\n\ + AtomicAdd_float(output_ptr + loc.x, data.x);\n\ + AtomicAdd_float(output_ptr + loc.y, data.y);\n\ + AtomicAdd_float(output_ptr + loc.z, data.z);\n\ + AtomicAdd_float(output_ptr + loc.w, data.w);\n\ +}\n\ +\n\ +#define SCATTER_ND_UPDATE_REF_FP16(type0, type1, ptr_type) \\\n\ +__kernel void scatter_nd_update_ref_##type0##to##type1( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + __read_only image2d_t temp_buf_int, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t link_buffer0, \\\n\ + image2d_t link_buffer1, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + Image img3 = create_image_from_image2d(temp_ref, 2); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global short* ref_ptr = (__global short*)img3.ptr; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_stride + gidx; \\\n\ + float4 tmpData; \\\n\ + tmpData.x = convert_float(acc_ptr[loc]) * inout_scale + output_zp; \\\n\ + half4 data; \\\n\ + short tmpDst; \\\n\ + _viv_asm(CONV, data, tmpData); \\\n\ + _viv_asm(COPY, tmpDst, data, 4); \\\n\ + ref_ptr[loc] = tmpDst; \\\n\ +}\n\ +SCATTER_ND_UPDATE_REF_FP16(I32, F16, int)\n\ +SCATTER_ND_UPDATE_REF_FP16(F32, F16, float)\n\ +\n\ +#define SCATTER_ND_UPDATE_REF_FP16_4X(type0, type1, ptr_type) \\\n\ +__kernel void scatter_nd_update_ref_##type0##to##type1##_4X( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + __read_only image2d_t temp_buf_int, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t link_buffer0, \\\n\ + image2d_t link_buffer1, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + Image img3 = create_image_from_image2d(temp_ref, 2); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* acc_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \\\n\ + int loc = idx * output_stride + gidx; \\\n\ + float4 tmpVal = tmpData * inout_scale + output_zp; \\\n\ + half4 data; \\\n\ + vxc_short8 tmpDst; \\\n\ + _viv_asm(CONV, data, tmpVal); \\\n\ + _viv_asm(COPY, tmpDst, data, 16); \\\n\ + ref_ptr[loc] = tmpDst.s0246; \\\n\ +}\n\ +SCATTER_ND_UPDATE_REF_FP16_4X(I32, F16, int)\n\ +SCATTER_ND_UPDATE_REF_FP16_4X(F32, F16, float)\n\ +\n\ +__kernel void scatter_nd_update_ref_F32toBF16(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + __read_only image2d_t temp_buf_int,\n\ + image2d_t temp_ref,\n\ + image2d_t link_buffer0,\n\ + image2d_t link_buffer1,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4);\n\ + Image img3 = create_image_from_image2d(temp_ref, 2);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global float* acc_ptr = (__global float*)img2.ptr;\n\ + __global short* ref_ptr = (__global short*)img3.ptr;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + int loc = idx * output_stride + gidx;\n\ + float tmpData;\n\ + tmpData = acc_ptr[loc];\n\ + vxc_ushort8 src0, src2;\n\ + _viv_asm(COPY, src0, tmpData, 4);\n\ + VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + ref_ptr[loc] = src2.x;\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_ref_F32toBF16_4X(\n\ + __read_only image2d_t index,\n\ + __read_only image2d_t update,\n\ + __read_only image2d_t temp_buf_int,\n\ + image2d_t temp_ref,\n\ + image2d_t link_buffer0,\n\ + image2d_t link_buffer1,\n\ + int width, int area, int vol, int val4,\n\ + int val5, int val6, int val7, int coord_dim)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + Image img1 = create_image_from_image2d(index, 4);\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4);\n\ + Image img3 = create_image_from_image2d(temp_ref, 2);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global float* acc_ptr = (__global float*)img2.ptr;\n\ + __global vxc_short4* ref_ptr = (__global vxc_short4*)img3.ptr;\n\ +\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim);\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim);\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1;\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w;\n\ + float4 tmpData = vload4(gidx, acc_ptr + idx * ref_stride);\n\ + int loc = idx * output_stride + gidx;\n\ + vxc_short8 src0, src2;\n\ + _viv_asm(COPY, src0, tmpData, 16);\n\ + VXC_DP2x8(src2, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + ref_ptr[loc] = src2.s0123;\n\ +}\n\ +"; /* end of scatter_nd_update_fp_vx*/ + +static const char scatter_nd_update_qint_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +_viv_uniform int update_width;\n\ +_viv_uniform int output_width;\n\ +_viv_uniform int ref_stride;\n\ +_viv_uniform int output_stride;\n\ +_viv_uniform int2 multAndoutZP0;\n\ +\n\ +_viv_uniform int4 coord_stride;\n\ +_viv_uniform int4 coord_stride1;\n\ +\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float inout_scale;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +#define SCATTER_RESET(name0, name1, ptr0, ptr1, type0, type1, len0, len1, size0, size1, ptr2, ptr3, len3) \\\n\ +__kernel void scatter_nd_update_reset_##name0##to##name1( \\\n\ + __read_only image2d_t input_ref, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t temp_buf_int, \\\n\ + int length, int res) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(input_ref, size0); \\\n\ + Image img2 = create_image_from_image2d(temp_ref, size1); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \\\n\ + __global ptr1* output_ptr = (__global ptr1*)img2.ptr; \\\n\ + __global int* tmp_update_ptr = (__global int*)img3.ptr; \\\n\ + ptr0 tmpData = input_ptr[gidx]; \\\n\ + int4 zeros = (int4)(0); \\\n\ + int loc2 = gidx * 8; \\\n\ + type0 src; \\\n\ + type1 tmpDst; \\\n\ + ptr1 dst; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, 
multAndoutZP0, 16); \\\n\ + _viv_asm(COPY, src, tmpData, len0); \\\n\ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst, tmpDst, len1); \\\n\ + output_ptr[gidx] = dst; \\\n\ + vstore4(zeros, 0, tmp_update_ptr + loc2); \\\n\ + vstore4(zeros, 1, tmp_update_ptr + loc2); \\\n\ + if(gidx < res) \\\n\ + { \\\n\ + __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \\\n\ + __global ptr3* output_ptr1 = (__global ptr3*)img2.ptr; \\\n\ + ptr2 tmpData1 = input_ptr1[length + gidx]; \\\n\ + ptr3 dst1; \\\n\ + dst1 ^= dst1; \\\n\ + tmp_update_ptr[length + gidx] = 0; \\\n\ + _viv_asm(COPY, src, tmpData1, 4); \\\n\ + VXC_DP2x8(tmpDst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst1, tmpDst, len3); \\\n\ + output_ptr1[length + gidx] = dst1; \\\n\ + } \\\n\ +}\n\ +SCATTER_RESET(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, 8, 8, 1, 1, uchar, uchar, 1)\n\ +SCATTER_RESET(I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, 8, 8, 1, 1, char, char, 1)\n\ +SCATTER_RESET(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, 16, 16, 2, 2, short, short, 2)\n\ +SCATTER_RESET(F16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_half8, 16, 16, 2, 2, short, short, 2)\n\ +SCATTER_RESET(U8, F16, vxc_uchar8, vxc_short8, vxc_uchar8, vxc_half8, 8, 16, 1, 2, uchar, short, 2)\n\ +SCATTER_RESET(I8, F16, vxc_char8, vxc_short8, vxc_char8, vxc_half8, 8, 16, 1, 2, char, short, 2)\n\ +SCATTER_RESET(I16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, 16, 8, 2, 1, short, short, 2)\n\ +SCATTER_RESET(F16, U8, vxc_short8, vxc_uchar8, vxc_half8, vxc_uchar8, 16, 8, 2, 1, short, uchar, 1)\n\ +\n\ +__kernel void scatter_nd_update_reset_BF16toBF16(\n\ + __read_only image2d_t input_ref,\n\ + image2d_t temp_ref,\n\ + image2d_t temp_buf_int)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + Image img1 = create_image_from_image2d(input_ref, 2);\n\ + Image img2 = create_image_from_image2d(temp_ref, 2);\n\ + Image img3 = create_image_from_image2d(temp_buf_int, 4);\n\ + __global vxc_short8* input_ptr = (__global vxc_short8*)img1.ptr;\n\ + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr;\n\ + __global float* tmp_update_ptr = (__global float*)img3.ptr;\n\ + vxc_short8 src = input_ptr[gidx];\n\ + float4 zeros = (float4)(0, 0, 0, 0);\n\ + int loc2 = gidx * 8;\n\ + output_ptr[gidx] = src;\n\ + vstore4(zeros, 0, tmp_update_ptr + loc2);\n\ + vstore4(zeros, 1, tmp_update_ptr + loc2);\n\ +}\n\ +\n\ +#define SCATTER_ND_UPDATE_QINT(src0_type, data_type, ptr_type, element_size) \\\n\ +__kernel void scatter_nd_update_update_##src0_type( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_int, \\\n\ + image2d_t link_buffer0, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, element_size); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global int* output_ptr = (__global int*)img3.ptr; \\\n\ + data_type src; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + vxc_int4 data; \\\n\ + short zp = input_zp; \\\n\ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + atomic_add(output_ptr + loc, data.x); \\\n\ +}\n\ +SCATTER_ND_UPDATE_QINT(U8, vxc_uchar8, uchar, 1)\n\ +SCATTER_ND_UPDATE_QINT(I8, vxc_char8, char, 1)\n\ +SCATTER_ND_UPDATE_QINT(I16, vxc_short8, short, 2)\n\ +\n\ +#define SCATTER_ND_UPDATE_QINT_4X(src0_type, data_type, ptr_type, element_size) \\\n\ +__kernel void scatter_nd_update_update_##src0_type##_4X( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_int, \\\n\ + image2d_t link_buffer0, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, element_size); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global int* output_ptr = (__global int*)img3.ptr; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + ptr_type src = update_ptr[gidy * update_width + gidx]; \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int4 loc = idx * output_width + gidx * 4 + (int4)(0, 1, 2, 3); \\\n\ + vxc_int4 data; \\\n\ + short zp = input_zp; \\\n\ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + atomic_add(output_ptr + loc.x, data.x); \\\n\ + atomic_add(output_ptr + loc.y, data.y); \\\n\ + atomic_add(output_ptr + loc.z, data.z); \\\n\ + atomic_add(output_ptr + loc.w, data.w); \\\n\ +}\n\ +SCATTER_ND_UPDATE_QINT_4X(U8, vxc_uchar8, vxc_uchar4, 1)\n\ +SCATTER_ND_UPDATE_QINT_4X(I8, vxc_char8, vxc_char4, 1)\n\ +SCATTER_ND_UPDATE_QINT_4X(I16, vxc_short8, vxc_short4, 2)\n\ +\n\ +#define SCATTER_ND_UPDATE_REF(src0_type, dst_type, data_type, ptr_type, element_size) \\\n\ +__kernel void scatter_nd_update_ref_##src0_type##to##dst_type( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + __read_only image2d_t temp_buf_int, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t link_buffer0, \\\n\ + image2d_t link_buffer1, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + Image img3 = create_image_from_image2d(temp_ref, element_size); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global int* acc_ptr = (__global int*)img2.ptr; \\\n\ + __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \\\n\ + data_type 
dst; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_stride + gidx; \\\n\ + int tmpData = acc_ptr[loc]; \\\n\ + int4 data; \\\n\ + data.x = convert_int_rte(tmpData * inout_scale + output_zp); \\\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + ref_ptr[loc] = dst.x; \\\n\ +}\n\ +SCATTER_ND_UPDATE_REF(I32, U8, vxc_uchar8, uchar, 1)\n\ +SCATTER_ND_UPDATE_REF(I32, I8, vxc_char8, char, 1)\n\ +SCATTER_ND_UPDATE_REF(I32, I16, vxc_short8, short, 2)\n\ +\n\ +#define SCATTER_ND_UPDATE_REF_4X(src0_type, dst_type, data_type, ptr_type, element_size) \\\n\ +__kernel void scatter_nd_update_ref_##src0_type##to##dst_type##_4X( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + __read_only image2d_t temp_buf_int, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t link_buffer0, \\\n\ + image2d_t link_buffer1, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_int, 4); \\\n\ + Image img3 = create_image_from_image2d(temp_ref, element_size); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global int* acc_ptr = (__global int*)img2.ptr; \\\n\ + __global ptr_type* ref_ptr = (__global ptr_type*)img3.ptr; \\\n\ + data_type dst; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + float4 tmpData = convert_float4(vload4(gidx, acc_ptr + idx * ref_stride)); \\\n\ + int loc = idx * output_stride + gidx; \\\n\ + int4 data = convert_int4_rte(tmpData * inout_scale + output_zp); \\\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + ref_ptr[loc] = dst.xyzw; \\\n\ +}\n\ +SCATTER_ND_UPDATE_REF_4X(I32, U8, vxc_uchar8, vxc_uchar4, 1)\n\ +SCATTER_ND_UPDATE_REF_4X(I32, I8, vxc_char8, vxc_char4, 1)\n\ +SCATTER_ND_UPDATE_REF_4X(I32, I16, vxc_short8, vxc_short4, 2)\n\ +\n\ +#define SCATTER_ND_UPDATE_COPY(src0_type, ptr_type, element_size, ptr_type1) \\\n\ +__kernel void scatter_nd_update_copy_##src0_type( \\\n\ + __read_only image2d_t temp_ref, \\\n\ + __read_only image2d_t link_buffer1, \\\n\ + image2d_t output, \\\n\ + int length, int res) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(temp_ref, element_size); \\\n\ + Image img2 = create_image_from_image2d(output, element_size); \\\n\ + __global ptr_type* input_ptr = (__global ptr_type*)img1.ptr; \\\n\ + __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \\\n\ + output_ptr[gidx] = input_ptr[gidx]; \\\n\ + if(gidx < res) \\\n\ + { \\\n\ + __global ptr_type1* input_ptr1 = (__global ptr_type1*)img1.ptr; \\\n\ + __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \\\n\ + output_ptr1[length + gidx] = input_ptr1[length + gidx]; \\\n\ + } \\\n\ +}\n\ +SCATTER_ND_UPDATE_COPY(U8, vxc_uchar8, 1, uchar)\n\ +SCATTER_ND_UPDATE_COPY(I8, vxc_char8, 1, char)\n\ +SCATTER_ND_UPDATE_COPY(I16, vxc_short8, 2, short)\n\ +SCATTER_ND_UPDATE_COPY(F16, vxc_short8, 2, short)\n\ +SCATTER_ND_UPDATE_COPY(BF16, vxc_short8, 2, short)\n\ +"; /* end of scatter_nd_update_qint_vx*/ + static const char scatter_nd_update_special_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ @@ -42120,7 +46848,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name##_2D( \\\n\ short zp = inputZP; \\\n\ VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvert1stUint8SubZpToFp32_4x4); \\\n\ - int index = convert_int_rte(tmpData.s0 * input_scale); \\\n\ + int index = convert_int_rtz(tmpData.s0 * input_scale); \\\n\ int4 data; \\\n\ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \\\n\ write_type dst; \\\n\ @@ -42146,7 +46874,7 @@ __kernel void sequence_mask_##src0_type_name##to##src1_type_name( \\\n\ short zp = inputZP; \\\n\ VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ uniConvert1stUint8SubZpToFp32_4x4); \\\n\ - int index = convert_int_rte(tmpData.s0 * input_scale); \\\n\ + int index = convert_int_rtz(tmpData.s0 * input_scale); \\\n\ int4 data; \\\n\ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \\\n\ write_type dst; \\\n\ @@ -42172,7 +46900,7 @@ __kernel void sequence_mask_F16toF16_2D(\n\ float4 tmpData;\n\ VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ UniFP16toFP32Lo4_dp4x4);\n\ - int index = convert_int_rte(tmpData.x);\n\ + int index = convert_int_rtz(tmpData.x);\n\ float4 data;\n\ data = outIdx < index? 
outputVal1 : convert_float(output_ZP);\n\ vxc_short8 dst;\n\ @@ -42195,7 +46923,7 @@ __kernel void sequence_mask_F16toF16(\n\ float4 tmpData;\n\ VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ UniFP16toFP32Lo4_dp4x4);\n\ - int index = convert_int_rte(tmpData.x);\n\ + int index = convert_int_rtz(tmpData.x);\n\ float4 data;\n\ data = outIdx < index? outputVal1 : convert_float(output_ZP);\n\ vxc_short8 dst;\n\ @@ -42218,7 +46946,7 @@ __kernel void sequence_mask_F16toU8_2D(\n\ float4 tmpData;\n\ VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ UniFP16toFP32Lo4_dp4x4);\n\ - int index = convert_int_rte(tmpData.x);\n\ + int index = convert_int_rtz(tmpData.x);\n\ int4 data;\n\ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP;\n\ vxc_uchar16 dst;\n\ @@ -42239,7 +46967,7 @@ __kernel void sequence_mask_F16toU8(\n\ float4 tmpData;\n\ VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ UniFP16toFP32Lo4_dp4x4);\n\ - int index = convert_int_rte(tmpData.x);\n\ + int index = convert_int_rtz(tmpData.x);\n\ int4 data;\n\ data = outIdx < index? convert_int_rte(outputVal1) : output_ZP;\n\ vxc_uchar16 dst;\n\ @@ -43211,6 +47939,167 @@ TILE_2D_MIX(U8, F16, 7, 6, vxc_uchar8, vxc_short8)\n\ TILE_2D_MIX(U8, F16, 0, 7, vxc_uchar8, vxc_short8)\n\ "; /* end of tile_mix_vx*/ +static const char tiny_yolov4_postprocess_box_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +\n\ +float4 sigmoid4(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +\n\ +float4 exp4(float4 x)\n\ +{\n\ + x *= logE;\n\ + return exp2(x);\n\ +}\n\ +\n\ +#define CONST0 (1.0499999523162842f)\n\ +#define CONST1 (0.0250000003725290f)\n\ +\n\ +_viv_uniform VXC_512Bits uniDatatoFloat32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDatatoFloat32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDataTranspose_0_2x8;\n\ +_viv_uniform VXC_512Bits uniDataTranspose_1_2x8;\n\ +_viv_uniform float input0_scale;\n\ +_viv_uniform float input0_tail;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float input1_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float CONST2;\n\ +__kernel void tiny_yolov4_postprocess_box_U8_U8toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float bias_0,\n\ + float bias_1\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(0));\n\ +\n\ + vxc_uchar16 src0, src1, src2, src3;\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input0, coord.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage(src2, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input1, coord.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.zw += (int2)(2, 3);\n\ +\n\ + float4 data0, data1, data2, data3, data;\n\ + VXC_DP4x4(data0, src0, src0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\ + data0 = data0 * input0_scale + input0_tail;\n\ + data0 = sigmoid4(data0);\n\ + data0 = data0 * CONST0 - CONST1;\n\ +\n\ + VXC_DP4x4(data, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\ + data = data * input1_scale + input1_tail;\n\ + data0 = data0 * CONST2 + data * CONST2;\n\ +\n\ + VXC_DP4x4(data1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4);\n\ + data1 = data1 * input0_scale + input0_tail;\n\ + data1 = sigmoid4(data1);\n\ + data1 = data1 * CONST0 - CONST1;\n\ +\n\ + VXC_DP4x4(data, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\ + data = data * input1_scale + input1_tail;\n\ + data1 = data1 * CONST2 + data * CONST2;\n\ +\n\ + VXC_DP4x4(data2, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_0_4x4);\n\ + data2 = data2 * input0_scale + input0_tail;\n\ + data2 = exp4(data2) * bias_0;\n\ +\n\ + VXC_DP4x4(data3, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFloat32_1_4x4);\n\ + data3 = data3 * input0_scale + input0_tail;\n\ + data3 = exp4(data3) * bias_1;\n\ +\n\ + data0 = data0 * output_scale + output_zp;\n\ + data1 = data1 * output_scale + output_zp;\n\ +\n\ + int4 dst0 = convert_int4_rte(data0);\n\ + int4 dst1 = convert_int4_rte(data1);\n\ + VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ + data2 = data2 * output_scale + output_zp;\n\ + data3 = data3 * output_scale + output_zp;\n\ + dst0 = convert_int4_rte(data2);\n\ + dst1 = convert_int4_rte(data3);\n\ + VXC_DP2x8(src1, dst0, dst1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8);\n\ +\n\ + VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniDataTranspose_0_2x8);\n\ + VXC_DP2x8(src0, src1, src1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniDataTranspose_1_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of tiny_yolov4_postprocess_box_vx*/ + +static const char tiny_yolov4_postprocess_confidence_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8TimesU8_0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8PlusU8_trans_0_2x8;\n\ +_viv_uniform VXC_512Bits uniU8PlusU8_trans_1_2x8;\n\ +_viv_uniform VXC_512Bits uniU16TimesMultiplier_PostShift_2x8;\n\ +_viv_uniform int output_zp;\n\ +\n\ +__kernel void tiny_yolov4_postprocess_conf_U8toU8\n\ +(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, get_global_id(0));\n\ +\n\ + vxc_uchar16 src0, src1, src2, src3, src4;\n\ +\n\ + VXC_ReadImage(src0, input, coord.wz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_ushort8 data0, data1;\n\ +\n\ + VXC_ReadImage(src1, input, coord.wy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input, coord.wy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input, coord.wy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src4, input, coord.wy, VXC_5BITOFFSET_XY(0, 4), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord.zw = coord.xx + (int2)(2, 3);\n\ +\n\ + VXC_DP4x4(data0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\ + VXC_DP4x4(data0, src0, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\ + VXC_DP4x4(data1, src0, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\ + VXC_DP4x4(data1, src0, src4, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniU8TimesU8_0_4x4);\n\ +\n\ + VXC_DP2x8(src1, data0, data0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uniU16TimesMultiplier_PostShift_2x8);\n\ + VXC_DP2x8(src1, data1, data1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\ + uniU16TimesMultiplier_PostShift_2x8);\n\ +\n\ + uchar zp;\n\ + _viv_asm(COPY, zp, output_zp, 2);\n\ +\n\ + VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\n\ + uniU8PlusU8_trans_0_2x8);\n\ + VXC_DP2x8(src0, src1, zp, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1),\n\ + uniU8PlusU8_trans_1_2x8);\n\ +\n\ + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord.x ++;\n\ + VXC_WriteImage(output, coord.yx, src0, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yz, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.yw, src0, VXC_MODIFIER(12, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of tiny_yolov4_postprocess_confidence_vx*/ + static const char upsample_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniF16MulMultipiler_PostShft_2x8;\n\ @@ -49204,6 +54093,8 @@ static const char gather_cl[] = "__kernel void gather_U8toU8(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ +\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ uint4 data = read_imageui(input0, coord_in.zw);\n\ @@ -49229,6 +54120,8 @@ __kernel void gather_F16toF16(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ +\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ float4 data = read_imagef(input0, coord_in.zw);\n\ @@ -49254,6 +54147,8 @@ __kernel void gather_I32toI32(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ +\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ int4 data = read_imagei(input0, coord_in.zw);\n\ @@ -49279,6 +54174,8 @@ __kernel void gather_F32toF32(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ +\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ float4 data = read_imagef(input0, coord_in.zw);\n\ @@ -49305,6 +54202,7 @@ static const char gather_array_cl[] = "__kernel void gather_array_U8toU8(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? 
indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 1);\n\ @@ -49333,6 +54231,7 @@ __kernel void gather_array_F16toF16(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 2);\n\ @@ -49361,6 +54260,7 @@ __kernel void gather_array_I32toI32(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 4);\n\ @@ -49389,6 +54289,7 @@ __kernel void gather_array_F32toF32(\n\ \n\ int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ int4 indice = read_imagei(input1, coord_in.xy);\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.w = gidz * axis_num + indice.x;\n\ \n\ Image img1 = create_image_from_image2d(input0, 4);\n\ @@ -49423,6 +54324,7 @@ static const char gather_batch_cl[] = "__kernel void gather_batch_U8toU8(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ uint4 data = read_imageui(input0, coord_in);\n\ @@ -49454,6 +54356,7 @@ __kernel void gather_batch_F16toF16(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ float4 data = read_imagef(input0, coord_in);\n\ @@ -49485,6 +54388,7 @@ __kernel void gather_batch_I32toI32(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ int4 data = read_imagei(input0, coord_in);\n\ @@ -49516,6 +54420,7 @@ __kernel void gather_batch_F32toF32(\n\ {\n\ int4 indice = read_imagei(input1, coord_idx);\n\ coord_idx.y++;\n\ + indice.x = indice.x >= 0 ? 
indice.x : indice.x + axis_num;\n\ coord_in.y = gidz * axis_num + indice.x;\n\ \n\ float4 data = read_imagef(input0, coord_in);\n\ @@ -49526,7 +54431,15 @@ __kernel void gather_batch_F32toF32(\n\ }\n\ "; /* end of gather_batch_cl*/ -static const char gather_elements_cl[] = "\n\ +static const char gather_elements_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +_viv_uniform uint width0;\n\ +_viv_uniform uint height0;\n\ +_viv_uniform uint width1;\n\ +_viv_uniform uint height1;\n\ +_viv_uniform uint width_out;\n\ +_viv_uniform uint height_out;\n\ +\n\ #define GATHER_ELEMENTS_AXIS0_2D(name, data_type, read_func, write_func, conv_func) \\\n\ __kernel void gather_elements_axis0_##name##_I32to##name##_2D \\\n\ ( \\\n\ @@ -49661,6 +54574,162 @@ __kernel void gather_elements_axis2_##name##_I32to##name \\\n\ GATHER_ELEMENTS_AXIS2(F32, float4, read_imagef, write_imagef, convert_float4)\n\ GATHER_ELEMENTS_AXIS2(I32, int4, read_imagei, write_imagei, convert_int4_rte)\n\ GATHER_ELEMENTS_AXIS2(U32, uint4, read_imageui, write_imageui, convert_uint4_rte)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + int axis_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[index + coord.y * width0 + coord.z * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F32, float, float*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I32, int, int*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + int axis_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = 
(data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[coord.x + index * width0 + coord.z * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F32, float, float*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I32, int, int*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis2_##name##_I32to##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + int axis_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + Tensor index_tensor = create_tensor_from_image2d_array(input1, 4); \\\n\ + int* index_ptr = (int*)index_tensor.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1 + coord.z * width1 * height1]; \\\n\ + \\\n\ + Tensor input_tensor = create_tensor_from_image2d_array(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_tensor.ptr; \\\n\ + data_type data = input_ptr[coord.x + coord.y * width0 + index * width0 * height0]; \\\n\ + \\\n\ + Tensor output_tensor = create_tensor_from_image2d_array(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_tensor.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out + coord.z * width_out * height_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F32, float, float*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I32, int, int*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS2(U8, uchar, uchar*, 1)\n\ +\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis0_##name##_I32to##name##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + int axis_size \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + Image index_img = create_image_from_image2d(input1, 4); \\\n\ + int* index_ptr = (int*)index_img.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1]; \\\n\ + \\\n\ + Image input_img = create_image_from_image2d(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\ + data_type data = input_ptr[index + coord.y * width0]; \\\n\ + \\\n\ + Image output_img = create_image_from_image2d(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F32, float, float*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I32, 
int, int*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS0_2D(U8, uchar, uchar*, 1)\n\ +\n\ +#define GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(name, data_type, data_type_ptr, stride) \\\n\ +__kernel void gather_elements_beyond_maxwidth_axis1_##name##_I32to##name##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + int axis_size \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + Image index_img = create_image_from_image2d(input1, 4); \\\n\ + int* index_ptr = (int*)index_img.ptr; \\\n\ + int index = index_ptr[coord.x + coord.y * width1]; \\\n\ + \\\n\ + Image input_img = create_image_from_image2d(input0, stride); \\\n\ + data_type_ptr input_ptr = (data_type_ptr)input_img.ptr; \\\n\ + data_type data = input_ptr[coord.x + index * width0]; \\\n\ + \\\n\ + Image output_img = create_image_from_image2d(output, stride); \\\n\ + data_type_ptr output_ptr = (data_type_ptr)output_img.ptr; \\\n\ + output_ptr[coord.x + coord.y * width_out] = data; \\\n\ +}\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F32, float, float*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I32, int, int*, 4)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(F16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I16, short, short*, 2)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(I8, char, char*, 1)\n\ +GATHER_ELEMENTS_BEYOND_MAXWIDTH_AXIS1_2D(U8, uchar, uchar*, 1)\n\ "; /* end of gather_elements_cl*/ static const char gather_nd_cl[] = "__kernel void gather_nd_U8toU8_1D(\n\ @@ -49919,127 +54988,136 @@ __kernel void gather_nd_F32toF32_3D(\n\ static const char gather_nd_batch_cl[] = "__kernel void gather_nd_batch_U8toU8_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - int4 indice = read_imagei(input1, coord.wy);\n\ - coord.z = indice.x * block_size + gidx;\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ - uint4 data = read_imageui(input0, coord.zy);\n\ - write_imageui(output, coord.xy, data);\n\ + uint4 data = read_imageui(input0, coord0);\n\ + write_imageui(output, coord, data);\n\ }\n\ \n\ __kernel void gather_nd_batch_F16toF16_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - int4 indice = read_imagei(input1, coord.wy);\n\ - coord.z = indice.x * block_size + gidx;\n\ + int4 coord = (int4)(gidx, 
gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ - float4 data = read_imagef(input0, coord.zy);\n\ - write_imagef(output, coord.xy, data);\n\ + float4 data = read_imagef(input0, coord0);\n\ + write_imagef(output, coord, data);\n\ }\n\ \n\ __kernel void gather_nd_batch_I8toI8_1D(\n\ __read_only image2d_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(gidx, gidy, 0, 0);\n\ - int4 indice = read_imagei(input1, coord.wy);\n\ - coord.z = indice.x * block_size + gidx;\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ \n\ - int4 data = read_imagei(input0, coord.zy);\n\ - write_imagei(output, coord.xy, data);\n\ + int4 data = read_imagei(input0, coord0);\n\ + write_imagei(output, coord, data);\n\ }\n\ \n\ //2D\n\ __kernel void gather_nd_batch_U8toU8_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(0, gidy, gidx, 1);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ - int4 indice1 = read_imagei(input1, coord.wy);\n\ + int4 coord = (int4)(1, gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int4 indice1 = read_imagei(input1, coord.xyzw);\n\ indice.x = indice.x * block_size + gidx;\n\ indice.y = indice1.x;\n\ - indice.zw = coord.yx;\n\ + indice.zw = coord.zw;\n\ \n\ uint4 data = read_imageui(input0, indice);\n\ - write_imageui(output, coord.zy, data);\n\ + coord.x = gidx;\n\ + write_imageui(output, coord, data);\n\ }\n\ \n\ __kernel void gather_nd_batch_F16toF16_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(0, gidy, gidx, 1);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ - int4 indice1 = read_imagei(input1, coord.wy);\n\ + int4 coord = (int4)(1, gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int4 indice1 = read_imagei(input1, coord.xyzw);\n\ indice.x = indice.x * block_size + gidx;\n\ indice.y = indice1.x;\n\ - indice.zw = coord.yx;\n\ + indice.zw = coord.zw;\n\ \n\ float4 data = read_imagef(input0, indice);\n\ - write_imagef(output, coord.zy, data);\n\ + coord.x = gidx;\n\ + write_imagef(output, coord, data);\n\ }\n\ \n\ __kernel void gather_nd_batch_I8toI8_2D(\n\ __read_only image2d_array_t input0,\n\ - __read_only image2d_t input1,\n\ - __write_only image2d_t 
output,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ int block_size,\n\ int coord_dim\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ - int gidy = get_global_id(1); // batch_num\n\ + int gidy = get_global_id(1); // index_num\n\ + int gidz = get_global_id(2); // batch_num\n\ \n\ - int4 coord = (int4)(0, gidy, gidx, 1);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ - int4 indice1 = read_imagei(input1, coord.wy);\n\ + int4 coord = (int4)(1, gidy, gidz, 0);\n\ + int4 indice = read_imagei(input1, coord.wyzw);\n\ + int4 indice1 = read_imagei(input1, coord.xyzw);\n\ indice.x = indice.x * block_size + gidx;\n\ indice.y = indice1.x;\n\ indice.y = indice1.x;\n\ - indice.zw = coord.yx;\n\ + indice.zw = coord.zw;\n\ \n\ int4 data = read_imagei(input0, indice);\n\ - write_imagei(output, coord.zy, data);\n\ + coord.x = gidx;\n\ + write_imagei(output, coord, data);\n\ }\n\ "; /* end of gather_nd_batch_cl*/ @@ -57045,6 +62123,103 @@ GEMM_TRANSB_3D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\ \n\ "; /* end of matrixmul_cl*/ +static const char matrixmul_cross_cl[] = "__kernel void gemm_F32F32toF32_merge(\n\ + __read_only image2d_array_t inputA,\n\ + __read_only image2d_array_t inputB,\n\ + __write_only image2d_array_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out,\n\ + int outer)\n\ +{\n\ + for(int i = 0; i < outer; i++)\n\ + {\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? i : get_global_id(2)), 0);\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + for(; coord_a.x < K;)\n\ + {\n\ + float4 tempA0;\n\ + float4 tempB0;\n\ +\n\ + tempA0 = read_imagef(inputA, coord_a);\n\ + tempB0 = read_imagef(inputB, coord_b);\n\ + coord_a.x++;\n\ + coord_b.y++;\n\ +\n\ + sum = sum + tempA0 * tempB0;\n\ + }\n\ +\n\ + coord_b.y = get_global_id(1);\n\ + coord_b.z = get_global_id(2) + i * get_global_size(2);\n\ + write_imagef(output, coord_b, sum);\n\ + }\n\ +}\n\ +\n\ +#define GEMM_MERGE(name, dst_type, read_image_type, convert_type, write_image_type) \\\n\ +__kernel void gemm_##name##_merge( \\\n\ + __read_only image2d_array_t inputA, \\\n\ + __read_only image2d_array_t inputB, \\\n\ + __write_only image2d_array_t output, \\\n\ + int M, \\\n\ + int K, \\\n\ + int N, \\\n\ + int ac2zero, \\\n\ + int bc2zero, \\\n\ + float scale_a, \\\n\ + float zp_a, \\\n\ + float scale_b, \\\n\ + float zp_b, \\\n\ + float scale_out, \\\n\ + float zp_out, \\\n\ + int outer) \\\n\ +{ \\\n\ + for(int i = 0; i < outer; i++) \\\n\ + { \\\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? i : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
i : get_global_id(2)), 0); \\\n\ + float4 sum = (float4)(0); \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(; coord_a.x < K;) \\\n\ + { \\\n\ + float4 tempA0; \\\n\ + float4 tempB0; \\\n\ + \\\n\ + tempA0 = convert_float4(read_image_type(inputA, coord_a)); \\\n\ + tempB0 = convert_float4(read_image_type(inputB, coord_b)); \\\n\ + tempA0.x = (tempA0.x - zp_a) * scale_a; \\\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b; \\\n\ + \\\n\ + coord_a.x++; \\\n\ + coord_b.y++; \\\n\ + \\\n\ + sum = sum + tempA0 * tempB0; \\\n\ + } \\\n\ + sum.x = sum.x * scale_out + zp_out; \\\n\ + dst = convert_type(sum); \\\n\ + \\\n\ + coord_b.y = get_global_id(1); \\\n\ + coord_b.z = get_global_id(2) + i * get_global_size(2); \\\n\ + write_image_type(output, coord_b, dst); \\\n\ + } \\\n\ +}\n\ +GEMM_MERGE(I8I8toI8,int4,read_imagei,convert_int4,write_imagei);\n\ +GEMM_MERGE(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui);\n\ +GEMM_MERGE(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\ +\n\ +"; /* end of matrixmul_cross_cl*/ + static const char matrixmul_transA_cl[] = "__kernel void gemm_transa_F32F32toF32_2D(\n\ __read_only image2d_t inputA,\n\ __read_only image2d_t inputB,\n\ @@ -59324,6 +64499,85 @@ __kernel void moments_axis2_BF16toF32(\n\ }\n\ "; /* end of moments_axis2_cl*/ +static const char nearest_grid_sample_cl[] = "__kernel void nearest_grid_sample_F32_F32toF32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float half_input0_w,\n\ + float half_input0_h,\n\ + float add_float_value_w,\n\ + float add_float_value_h,\n\ + int depth\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\ +\n\ + float fx = read_imagef(input1, coord_in1).x;\n\ + coord_in1.x = coord_in1.x + 1;\n\ + float fy = read_imagef(input1, coord_in1).x;\n\ +\n\ + fx = fx * half_input0_w + add_float_value_w;\n\ + fy = fy * half_input0_h + add_float_value_h;\n\ + int x_index = convert_int(fx);\n\ + int y_index = convert_int(fy);\n\ + int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\ +\n\ + float4 dst;\n\ +\n\ + while (coord_in.z < depth){\n\ + dst = read_imagef(input0, coord_in);\n\ + write_imagef(output, coord_out, dst);\n\ + coord_in.z++;\n\ + coord_out.z++;\n\ + }\n\ +}\n\ +\n\ +\n\ +__kernel void nearest_grid_sample_U8_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float half_input0_w,\n\ + float half_input0_h,\n\ + float add_float_value_w,\n\ + float add_float_value_h,\n\ + int depth,\n\ + float in0_scale,\n\ + float in0_tail,\n\ + float in1_scale,\n\ + float in1_tail,\n\ + float out_scale,\n\ + float out_tail\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\ +\n\ + float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\ + coord_in1.x = coord_in1.x + 1;\n\ + float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\ +\n\ + fx = fx * half_input0_w + add_float_value_w;\n\ + fy = fy * half_input0_h + add_float_value_h;\n\ + int x_index = convert_int(fx);\n\ + int y_index = convert_int(fy);\n\ + int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\ +\n\ + float4 val;\n\ + uint4 dst;\n\ +\n\ + while (coord_in.z < depth){\n\ + val = convert_float4(read_imageui(input0, coord_in)) * in0_scale + 
in0_tail;\n\ + dst = convert_uint4_rte(val * out_scale + out_tail);\n\ + write_imageui(output, coord_out, dst);\n\ + coord_in.z++;\n\ + coord_out.z++;\n\ + }\n\ +\n\ +}\n\ +"; /* end of nearest_grid_sample_cl*/ + static const char one_hot_cl[] = "__kernel void one_hot_F32toF32\n\ (\n\ __read_only image2d_t input,\n\ @@ -62168,6 +67422,290 @@ __kernel void resize_1d_nearest_U8toU8(\n\ }\n\ "; /* end of resize_1d_nearest_cl*/ +static const char resize_3d_bilinear_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ +\n\ +#define RESIZE_3D(in_name, out_name, read_image_type, dst_type, convert_type, write_image_type) \\\n\ +__kernel void resize_3d_bilinear_##in_name##to##out_name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float scale_x, \\\n\ + float scale_y, \\\n\ + float scale_z, \\\n\ + float half_pixel_value, \\\n\ + uint in_width, \\\n\ + uint in_height, \\\n\ + uint in_depth, \\\n\ + float in_scale, \\\n\ + float in_tail, \\\n\ + float out_scale, \\\n\ + float out_tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; \\\n\ + float left_x_f = fmax(floor(in_x), 0); \\\n\ + float x_lerp = in_x - left_x_f; \\\n\ + int left_x_idx = convert_int(left_x_f); \\\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; \\\n\ + float top_y_f = fmax(floor(in_y), 0); \\\n\ + float y_lerp = in_y - top_y_f; \\\n\ + int top_y_idx = convert_int(top_y_f); \\\n\ + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value; \\\n\ + float front_z_f = fmax(floor(in_z), 0); \\\n\ + float z_lerp = in_z - front_z_f; \\\n\ + int front_z_idx = convert_int(front_z_f); \\\n\ + int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0); \\\n\ + float4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111; \\\n\ + dst_type dst; \\\n\ + \\\n\ + int dx, dy, dz; \\\n\ + dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0); \\\n\ + dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0); \\\n\ + dz = in_z < 0 ? 0 : (front_z_idx < in_depth - 1 ? 
1 : 0); \\\n\ + \\\n\ + data_000 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.y = coord_in.y + dy; \\\n\ + data_010 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.x = coord_in.x + dx; \\\n\ + data_110 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.y = coord_in.y - dy; \\\n\ + data_100 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.z = coord_in.z + dz; \\\n\ + data_101 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.y = coord_in.y + dy; \\\n\ + data_111 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.x = coord_in.x - dx; \\\n\ + data_011 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + coord_in.y = coord_in.y - dy; \\\n\ + data_001 = convert_float4(read_image_type(input, coord_in)) * in_scale + in_tail; \\\n\ + \\\n\ + data_000 = data_000 + (data_100 - data_000) * x_lerp; \\\n\ + data_010 = data_010 + (data_110 - data_010) * x_lerp; \\\n\ + data_000 = data_000 + (data_010 - data_000) * y_lerp; \\\n\ + \\\n\ + data_001 = data_001 + (data_101 - data_001) * x_lerp; \\\n\ + data_011 = data_011 + (data_111 - data_011) * x_lerp; \\\n\ + data_001 = data_001 + (data_011 - data_001) * y_lerp; \\\n\ + data_000 = data_000 + (data_001 - data_000) * z_lerp; \\\n\ + \\\n\ + dst = convert_type(data_000 * out_scale + out_tail); \\\n\ + \\\n\ + write_image_type(output, coord_out, dst); \\\n\ +}\n\ +RESIZE_3D(F32, F32, read_imagef, float4, convert_float4, write_imagef)\n\ +RESIZE_3D(F32, U8, read_imagef, uint4, convert_uint4, write_imageui)\n\ +RESIZE_3D(U8, F32, read_imageui, float4, convert_float4, write_imagef)\n\ +RESIZE_3D(U8, U8, read_imageui, uint4, convert_uint4, write_imageui)\n\ +RESIZE_3D(I8, I8, read_imagei, int4, convert_int4, write_imagei)\n\ +\n\ +__kernel void resize_3d_bilinear_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + uint in_width,\n\ + uint in_height,\n\ + uint in_depth,\n\ + float in_scale,\n\ + float in_tail,\n\ + float out_scale,\n\ + float out_tail\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float left_x_f = fmax(floor(in_x), 0);\n\ + float x_lerp = in_x - left_x_f;\n\ + int left_x_idx = convert_int(left_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value;\n\ + float top_y_f = fmax(floor(in_y), 0);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z - half_pixel_value;\n\ + float front_z_f = fmax(floor(in_z), 0);\n\ + float z_lerp = in_z - front_z_f;\n\ + int front_z_idx = convert_int(front_z_f);\n\ + int4 coord_in = (int4)(left_x_idx, top_y_idx, front_z_idx, 0);\n\ + uint4 data_000, data_100, data_010, data_110, data_001, data_011, data_101, data_111;\n\ + float4 data_000_f, data_100_f, data_010_f, data_110_f, data_001_f, data_011_f, data_101_f, data_111_f;\n\ + uint4 dst;\n\ +\n\ + int dx, dy, dz;\n\ + dx = in_x < 0 ? 0 : (left_x_f < in_width - 1 ? 1 : 0);\n\ + dy = in_y < 0 ? 0 : (top_y_f < in_height - 1 ? 1 : 0);\n\ + dz = in_z < 0 ? 
0 : (front_z_idx < in_depth - 1 ? 1 : 0);\n\ +\n\ + data_000 = read_imageui(input, coord_in);\n\ + data_000 = data_000 << 16;\n\ + coord_in.y = coord_in.y + dy;\n\ + data_010 = read_imageui(input, coord_in);\n\ + data_010 = data_010 << 16;\n\ + coord_in.x = coord_in.x + dx;\n\ + data_110 = read_imageui(input, coord_in);\n\ + data_110 = data_110 << 16;\n\ + coord_in.y = coord_in.y - dy;\n\ + data_100 = read_imageui(input, coord_in);\n\ + data_100 = data_100 << 16;\n\ + coord_in.z = coord_in.z + dz;\n\ + data_101 = read_imageui(input, coord_in);\n\ + data_101 = data_101 << 16;\n\ + coord_in.y = coord_in.y + dy;\n\ + data_111 = read_imageui(input, coord_in);\n\ + data_111 = data_111 << 16;\n\ + coord_in.x = coord_in.x - dx;\n\ + data_011 = read_imageui(input, coord_in);\n\ + data_011 = data_011 << 16;\n\ + coord_in.y = coord_in.y - dy;\n\ + data_001 = read_imageui(input, coord_in);\n\ + data_001 = data_001 << 16;\n\ +\n\ + _viv_asm(COPY, data_000_f, data_000, 16);\n\ + _viv_asm(COPY, data_010_f, data_010, 16);\n\ + _viv_asm(COPY, data_110_f, data_110, 16);\n\ + _viv_asm(COPY, data_100_f, data_100, 16);\n\ + _viv_asm(COPY, data_101_f, data_101, 16);\n\ + _viv_asm(COPY, data_111_f, data_111, 16);\n\ + _viv_asm(COPY, data_011_f, data_011, 16);\n\ + _viv_asm(COPY, data_001_f, data_001, 16);\n\ +\n\ + data_000_f = data_000_f + (data_100_f - data_000_f) * x_lerp;\n\ + data_010_f = data_010_f + (data_110_f - data_010_f) * x_lerp;\n\ + data_000_f = data_000_f + (data_010_f - data_000_f) * y_lerp;\n\ +\n\ + data_001_f = data_001_f + (data_101_f - data_001_f) * x_lerp;\n\ + data_011_f = data_011_f + (data_111_f - data_011_f) * x_lerp;\n\ + data_001_f = data_001_f + (data_011_f - data_001_f) * y_lerp;\n\ + data_000_f = data_000_f + (data_001_f - data_000_f) * z_lerp;\n\ +\n\ + _viv_asm(COPY, dst, data_000_f, 16);\n\ + dst = dst >> 16;\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +"; /* end of resize_3d_bilinear_cl*/ + +static const char resize_3d_nearest_cl[] = "\n\ +#define NEAREST_INDEX_PROCESS() \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x + round_value; \\\n\ + int in_x_idx = convert_int(in_x); \\\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y + round_value; \\\n\ + int in_y_idx = convert_int(in_y); \\\n\ + float in_z = (convert_float(coord_out.z) + half_pixel_value) * scale_z + round_value; \\\n\ + int in_z_idx = convert_int(in_z); \\\n\ +\n\ +__kernel void resize_3d_nearest_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + float4 dst;\n\ + dst = read_imagef(input, coord_in);\n\ + write_imagef(output, coord_out, dst);\n\ +}\n\ +\n\ +\n\ +__kernel void resize_3d_nearest_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + uint4 dst;\n\ + dst = convert_uint4(convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail);\n\ + 
write_imageui(output, coord_out, dst);\n\ +}\n\ +\n\ +__kernel void resize_3d_nearest_U8toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + float4 dst;\n\ + dst = convert_float4(read_imageui(input, coord_in)) * output_scale + output_tail;\n\ + write_imagef(output, coord_out, dst);\n\ +}\n\ +\n\ +__kernel void resize_3d_nearest_F32toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + uint4 dst;\n\ + dst = convert_uint4(read_imagef(input, coord_in) * output_scale + output_tail);\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +\n\ +__kernel void resize_3d_nearest_I8toI8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + int4 dst;\n\ + dst = convert_int4(convert_float4(read_imagei(input, coord_in)) * output_scale);\n\ + write_imagei(output, coord_out, dst);\n\ +}\n\ +\n\ +__kernel void resize_3d_nearest_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float scale_z,\n\ + float half_pixel_value,\n\ + float round_value,\n\ + float output_scale,\n\ + float output_tail)\n\ +{\n\ + NEAREST_INDEX_PROCESS()\n\ + int4 coord_in = (int4)(in_x_idx, in_y_idx, in_z_idx, 0);\n\ + uint4 dst;\n\ + dst = read_imageui(input, coord_in);\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +\n\ +"; /* end of resize_3d_nearest_cl*/ + static const char resize_bilinear_cl[] = "__kernel void resize_bilinear_F32toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -64556,7 +70094,7 @@ __kernel void swish_F32toU8_2D(\n\ }"; /* end of swish_cl*/ static const char tile_cl[] = "\n\ -#define TILE_3D(name0, name1, data_type, read_image_func, write_image_func) \\\n\ +#define TILE_3D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \\\n\ __kernel void tile_##name0##to##name1 \\\n\ ( \\\n\ __read_only image2d_array_t input, \\\n\ @@ -64567,7 +70105,9 @@ __kernel void tile_##name0##to##name1 \\\n\ int multiples_0, \\\n\ int multiples_1, \\\n\ int multiples_2, \\\n\ - int multiples_3 \\\n\ + int multiples_3, \\\n\ + float inoutscale, \\\n\ + float inouttail \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -64575,7 +70115,9 @@ __kernel void tile_##name0##to##name1 \\\n\ int width = get_image_width(input); \\\n\ int height = get_image_height(input); \\\n\ \\\n\ - data_type src; \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + \\\n\ read_image_func(src, input, coord); \\\n\ \\\n\ int batch_id = (short)coord.z / (short)depthIn; \\\n\ @@ -64597,17 +70139,19 @@ __kernel void tile_##name0##to##name1 \\\n\ for (int x = 0; x < 
multiples_0; x++) \\\n\ { \\\n\ coord_out.x = coord.x + x * width; \\\n\ - write_image_func(output, coord_out.xyzw, src); \\\n\ + dst = conv_type(convert_float4(src) * inoutscale + inouttail); \\\n\ + write_image_func(output, coord_out.xyzw, dst); \\\n\ } \\\n\ } \\\n\ } \\\n\ } \\\n\ }\n\ -TILE_3D(I32, I32, int4, READ_IMAGEI_2DARRAY, write_imagei)\n\ -TILE_3D(U32, U32, uint4, READ_IMAGEUI_2DARRAY, write_imageui)\n\ -TILE_3D(F32, F32, float4, READ_IMAGEF_2DARRAY, write_imagef)\n\ +TILE_3D(I32, I32, int4, int4, convert_int4_rte, READ_IMAGEI_2DARRAY, write_imagei)\n\ +TILE_3D(U32, U32, uint4, uint4, convert_uint4_rte, READ_IMAGEUI_2DARRAY, write_imageui)\n\ +TILE_3D(F32, F32, float4, float4,convert_float4_rte,READ_IMAGEF_2DARRAY, write_imagef)\n\ +TILE_3D(F32, U32, float4, uint4, convert_uint4_rte, READ_IMAGEF_2DARRAY, write_imageui)\n\ \n\ -#define TILE_2D(name0, name1, data_type, read_image_func, write_image_func) \\\n\ +#define TILE_2D(name0, name1, src_type, dst_type, conv_type, read_image_func, write_image_func) \\\n\ __kernel void tile_##name0##to##name1##_2D \\\n\ ( \\\n\ __read_only image2d_t input, \\\n\ @@ -64618,7 +70162,9 @@ __kernel void tile_##name0##to##name1##_2D \\\n\ int multiples_0, \\\n\ int multiples_1, \\\n\ int multiples_2, \\\n\ - int multiples_3 \\\n\ + int multiples_3, \\\n\ + float inoutscale, \\\n\ + float inouttail \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -64627,22 +70173,25 @@ __kernel void tile_##name0##to##name1##_2D \\\n\ int output_width = get_image_width(output); \\\n\ int output_height = get_image_height(output); \\\n\ \\\n\ - data_type src = read_image_func(input, coord); \\\n\ + src_type src = read_image_func(input, coord); \\\n\ + dst_type dst; \\\n\ \\\n\ do \\\n\ { \\\n\ do \\\n\ { \\\n\ - write_image_func(output, coord, src); \\\n\ + dst = conv_type(convert_float4(src) * inoutscale + inouttail); \\\n\ + write_image_func(output, coord, dst); \\\n\ coord.x += width; \\\n\ } while (coord.x < output_width); \\\n\ coord.x = get_global_id(0); \\\n\ coord.y += height; \\\n\ } while (coord.y < output_height); \\\n\ }\n\ -TILE_2D(I32, I32, int4, read_imagei, write_imagei)\n\ -TILE_2D(U32, U32, uint4, read_imageui, write_imageui)\n\ -TILE_2D(F32, F32, float4, read_imagef, write_imagef)\n\ +TILE_2D(I32, I32, int4, int4, convert_int4_rte, read_imagei, write_imagei)\n\ +TILE_2D(U32, U32, uint4, uint4, convert_uint4_rte, read_imageui, write_imageui)\n\ +TILE_2D(F32, F32, float4, float4,convert_float4_rte,read_imagef, write_imagef)\n\ +TILE_2D(F32, U32, float4, uint4, convert_uint4_rte, read_imagef, write_imageui)\n\ \n\ \n\ \n\ @@ -65903,9 +71452,13 @@ static const source_map_t evis_resource[] = {"cumsum_vx", cumsum_vx}, {"cumsum_2d_vx", cumsum_2d_vx}, {"cumsum_bf16_vx", cumsum_bf16_vx}, + {"cumsum_ex_rev_axis0_vx", cumsum_ex_rev_axis0_vx}, + {"cumsum_ex_rev_axis1_vx", cumsum_ex_rev_axis1_vx}, + {"cumsum_ex_rev_axis2_vx", cumsum_ex_rev_axis2_vx}, {"cumsum_f16_u8_vx", cumsum_f16_u8_vx}, {"custom_softmax_vx", custom_softmax_vx}, {"custom_warp_affine_vx", custom_warp_affine_vx}, + {"custom_warp_affine_rgb_vx", custom_warp_affine_rgb_vx}, {"custom_warp_perspective_vx", custom_warp_perspective_vx}, {"depth2space_crd_vx", depth2space_crd_vx}, {"depthwise_conv1d_src0_vx", depthwise_conv1d_src0_vx}, @@ -65988,12 +71541,15 @@ static const source_map_t evis_resource[] = {"lstmunit_activation_S_F16_vx", lstmunit_activation_S_F16_vx}, {"lstmunit_activation_S_U8_vx", lstmunit_activation_S_U8_vx}, {"matrixmul_bf16_vx", 
matrixmul_bf16_vx}, + {"matrixmul_cross_vx", matrixmul_cross_vx}, + {"matrixmul_cross_i16_vx", matrixmul_cross_i16_vx}, {"matrixmul_f16_vx", matrixmul_f16_vx}, {"matrixmul_f16f16_u8_vx", matrixmul_f16f16_u8_vx}, {"matrixmul_f16i16_i16_vx", matrixmul_f16i16_i16_vx}, {"matrixmul_f16u8_f16_vx", matrixmul_f16u8_f16_vx}, {"matrixmul_f16u8_u8_vx", matrixmul_f16u8_u8_vx}, {"matrixmul_i16_vx", matrixmul_i16_vx}, + {"matrixmul_merge_vx", matrixmul_merge_vx}, {"matrixmul_transA_vx", matrixmul_transA_vx}, {"matrixmul_transB_f16_vx", matrixmul_transB_f16_vx}, {"matrixmul_transB_f16_mix_vx", matrixmul_transB_f16_mix_vx}, @@ -66015,6 +71571,12 @@ static const source_map_t evis_resource[] = {"moments_axis2_vx", moments_axis2_vx}, {"moments_u8_vx", moments_u8_vx}, {"moments_u8_axis012_vx", moments_u8_axis012_vx}, + {"nearest_grid_sample_BF16_to_BF16_vx", nearest_grid_sample_BF16_to_BF16_vx}, + {"nearest_grid_sample_F16_to_F16_vx", nearest_grid_sample_F16_to_F16_vx}, + {"nearest_grid_sample_F16_to_U8_vx", nearest_grid_sample_F16_to_U8_vx}, + {"nearest_grid_sample_I16_to_I16_vx", nearest_grid_sample_I16_to_I16_vx}, + {"nearest_grid_sample_I8_to_I8_vx", nearest_grid_sample_I8_to_I8_vx}, + {"nearest_grid_sample_U8_to_U8_vx", nearest_grid_sample_U8_to_U8_vx}, {"one_hot_vx", one_hot_vx}, {"poolwithargmax_F16_vx", poolwithargmax_F16_vx}, {"poolwithargmax_I16_vx", poolwithargmax_I16_vx}, @@ -66031,9 +71593,15 @@ static const source_map_t evis_resource[] = {"pre_process_rgb888_planar_0_vx", pre_process_rgb888_planar_0_vx}, {"pre_process_rgb888_planar_1_vx", pre_process_rgb888_planar_1_vx}, {"pre_process_rgb888_planar_2_vx", pre_process_rgb888_planar_2_vx}, + {"pre_process_rgb888_planar_nhwc_0_vx", pre_process_rgb888_planar_nhwc_0_vx}, + {"pre_process_rgb888_planar_nhwc_1_vx", pre_process_rgb888_planar_nhwc_1_vx}, + {"pre_process_rgb888_planar_nhwc_2_vx", pre_process_rgb888_planar_nhwc_2_vx}, {"pre_process_rgb888_planar_sep_0_vx", pre_process_rgb888_planar_sep_0_vx}, {"pre_process_rgb888_planar_sep_1_vx", pre_process_rgb888_planar_sep_1_vx}, {"pre_process_rgb888_planar_sep_2_vx", pre_process_rgb888_planar_sep_2_vx}, + {"pre_process_rgb888_planar_sep_nhwc_0_vx", pre_process_rgb888_planar_sep_nhwc_0_vx}, + {"pre_process_rgb888_planar_sep_nhwc_1_vx", pre_process_rgb888_planar_sep_nhwc_1_vx}, + {"pre_process_rgb888_planar_sep_nhwc_2_vx", pre_process_rgb888_planar_sep_nhwc_2_vx}, {"pre_process_rgb_copy_vx", pre_process_rgb_copy_vx}, {"pre_process_yuv420_copy_vx", pre_process_yuv420_copy_vx}, {"pre_process_yuv420_scale_0_vx", pre_process_yuv420_scale_0_vx}, @@ -66092,6 +71660,8 @@ static const source_map_t evis_resource[] = {"scatter_nd_update_vx", scatter_nd_update_vx}, {"scatter_nd_update_atom_vx", scatter_nd_update_atom_vx}, {"scatter_nd_update_big_vx", scatter_nd_update_big_vx}, + {"scatter_nd_update_fp_vx", scatter_nd_update_fp_vx}, + {"scatter_nd_update_qint_vx", scatter_nd_update_qint_vx}, {"scatter_nd_update_special_vx", scatter_nd_update_special_vx}, {"select_vx", select_vx}, {"sequence_mask_vx", sequence_mask_vx}, @@ -66102,6 +71672,8 @@ static const source_map_t evis_resource[] = {"tensorstackconcat_vx", tensorstackconcat_vx}, {"tile_vx", tile_vx}, {"tile_mix_vx", tile_mix_vx}, + {"tiny_yolov4_postprocess_box_vx", tiny_yolov4_postprocess_box_vx}, + {"tiny_yolov4_postprocess_confidence_vx", tiny_yolov4_postprocess_confidence_vx}, {"upsample_F16_vx", upsample_F16_vx}, {"upsample_I16_vx", upsample_I16_vx}, {"upsample_I8_vx", upsample_I8_vx}, @@ -66192,6 +71764,7 @@ static const source_map_t cl_resource[] = 
{"lstmunit_activation_S_F32_cl", lstmunit_activation_S_F32_cl}, {"lstmunit_activation_S_U8_cl", lstmunit_activation_S_U8_cl}, {"matrixmul_cl", matrixmul_cl}, + {"matrixmul_cross_cl", matrixmul_cross_cl}, {"matrixmul_transA_cl", matrixmul_transA_cl}, {"maximum_cl", maximum_cl}, {"maxpoolwithargmax_cl", maxpoolwithargmax_cl}, @@ -66204,6 +71777,7 @@ static const source_map_t cl_resource[] = {"moments_axis012_cl", moments_axis012_cl}, {"moments_axis1_cl", moments_axis1_cl}, {"moments_axis2_cl", moments_axis2_cl}, + {"nearest_grid_sample_cl", nearest_grid_sample_cl}, {"one_hot_cl", one_hot_cl}, {"poolwithargmax_cl", poolwithargmax_cl}, {"pow_cl", pow_cl}, @@ -66229,6 +71803,8 @@ static const source_map_t cl_resource[] = {"repeat_cl", repeat_cl}, {"resize_1d_bilinear_cl", resize_1d_bilinear_cl}, {"resize_1d_nearest_cl", resize_1d_nearest_cl}, + {"resize_3d_bilinear_cl", resize_3d_bilinear_cl}, + {"resize_3d_nearest_cl", resize_3d_nearest_cl}, {"resize_bilinear_cl", resize_bilinear_cl}, {"resize_nearest_cl", resize_nearest_cl}, {"reversesequence_cl", reversesequence_cl}, diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c index 8462aad82..2c63c1e5e 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c @@ -33,6 +33,7 @@ #include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vsi_nn_libnnext_resource.h" +#include "vsi_nn_error.h" static char s_vx_resource_path[VSI_NN_MAX_PATH] = "VX"; @@ -63,6 +64,11 @@ uint8_t * vsi_nn_LoadBinarySource fseek( fp, 0, SEEK_SET ); buf = (uint8_t *)malloc( len + 1 ); + if (buf == NULL) + { + fclose( fp ); + return NULL; + } n = (int32_t)fread( buf, 1, len, fp ); fclose( fp ); @@ -208,7 +214,10 @@ static vsi_status vsi_nn_RegisterVXKernel evis = context->config.evis.ver; program_src = (const char**)malloc(kernel_info->resource_num * sizeof(char *)); + CHECK_PTR_FAIL_GOTO( program_src, "Create buffer fail.", final ); program_len = (vx_size*)malloc(kernel_info->resource_num * sizeof(vx_size)); + CHECK_PTR_FAIL_GOTO( program_len, "Create buffer fail.", final ); + for (i = 0; i < kernel_info->resource_num; i++) { program_src[i] = vsi_nn_resource_load_source_code( @@ -228,7 +237,7 @@ static vsi_status vsi_nn_RegisterVXKernel { VSILOGE("[%s : %d] vxCreateProgramWithSource() Error!\n", __FILE__, __LINE__); status = VSI_FAILURE; - goto OnError; + goto final; } if(evis == VSI_NN_HW_EVIS_NONE) @@ -267,16 +276,17 @@ static vsi_status vsi_nn_RegisterVXKernel { VSILOGE( "Add kernel %s fail.", kernel->name ); } -OnError: +final: for (i = 0; i < kernel_info->resource_num; i++) { - if (program_src[i] && load_from_file) + if (load_from_file && program_src[i]) { free((char *)program_src[i]); } } if(program_src) free((char**)program_src); if(program_len) free(program_len); + return status; } @@ -286,7 +296,7 @@ static vsi_status vsi_nn_RegisterBinKernel vsi_nn_kernel_info_t * kernel_info ) { - vsi_status status; + vsi_status status = VSI_FAILURE; vx_kernel obj; vx_program program = NULL; vx_size program_len = 0; @@ -308,6 +318,11 @@ static vsi_status vsi_nn_RegisterBinKernel program_ptr = vsi_nn_VxBinResourceGetResource( kernel_info->resource_name[kernel_info->resource_num - 1], &program_len); + if (program_ptr == NULL) + { + VSILOGE("[%s : %d] vsi_nn_VxBinResourceGetResource() Error!\n", __FILE__, __LINE__); + return status; + } program = vxCreateProgramWithBinary(ctx, (const vx_uint8 *)program_ptr, program_len); status = 
vxGetStatus((vx_reference)program); @@ -396,10 +411,19 @@ vx_node vsi_nn_RegisterClientKernelAndNewNode ) { vsi_status status; - vx_context ctx; - vx_kernel obj; - vx_node node; - vx_kernel_description_t * kernel = kernel_info->kernel[kernel_info->kernel_index]; + vx_context ctx = NULL; + vx_kernel obj = NULL; + vx_node node = NULL; + vx_kernel_description_t * kernel = NULL; + + if (kernel_info->kernel) + { + kernel = kernel_info->kernel[kernel_info->kernel_index]; + } + else + { + goto final; + } ctx = vxGetContext( (vx_reference)graph->g ); @@ -444,6 +468,8 @@ vx_node vsi_nn_RegisterClientKernelAndNewNode kernel->name, status ); return NULL; } + +final: return node; } /* vsi_nn_RegisterClientKernelAndNewNode() */ @@ -501,6 +527,10 @@ vsi_status VX_CALLBACK vsi_nn_KernelValidator vx_meta_format metas[] ) { + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(parameters); + VSI_UNREFERENCED(num); + VSI_UNREFERENCED(metas); return VSI_SUCCESS; } /* vsi_nn_KernelValidator() */ @@ -511,6 +541,9 @@ vsi_status VX_CALLBACK vsi_nn_KernelInitializer uint32_t paraNum ) { + VSI_UNREFERENCED(nodObj); + VSI_UNREFERENCED(paramObj); + VSI_UNREFERENCED(paraNum); return VSI_SUCCESS; } /* vsi_nn_KernelInitializer() */ @@ -521,6 +554,9 @@ vsi_status VX_CALLBACK vsi_nn_KernelDeinitializer uint32_t paraNum ) { + VSI_UNREFERENCED(nodObj); + VSI_UNREFERENCED(paraObj); + VSI_UNREFERENCED(paraNum); return VSI_SUCCESS; } /* vsi_nn_KernelDeinitializer() */ @@ -543,6 +579,8 @@ const uint8_t * vsi_nn_VxBinResourceGetResource vx_size *len ) { + VSI_UNREFERENCED(name); + VSI_UNREFERENCED(len); return NULL; } /* vsi_nn_VxResourceGetBinResource() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c b/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c index 1f371d471..97da8bd51 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c @@ -39,6 +39,7 @@ #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -78,6 +79,7 @@ static vsi_bool op_check attr.vtl = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; a_times_b[0] = vsi_nn_CreateTensor(self->graph, &attr); + CHECK_PTR_FAIL_GOTO(a_times_b[0], "Create tensor failed", final); ret = vsi_nn_OpCheck(VSI_NN_OP_MULTIPLY, self, inputs, a_times_b); if (!ret) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c index 078d708a7..b248d9054 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c @@ -34,6 +34,7 @@ #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" #include "vsi_nn_kernel_prv.h" +#include "vsi_nn_error.h" static int32_t _get_input_num ( @@ -91,6 +92,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -101,6 +104,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -112,6 +118,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -122,7 +130,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool 
ret = TRUE; + vsi_bool ret = FALSE; uint32_t i; vsi_nn_tensor_attr_t attr; vsi_nn_internal_node_t* curr = NULL; @@ -134,6 +142,12 @@ static vsi_bool op_setup input_num = _get_input_num(self, inputs); + if (input_num < 2) + { + VSILOGE( "Wrong input tensor number = %u.", input_num ); + return FALSE; + } + is_sp_supported = vsi_nn_is_sp_supported_broadcast(self->graph, inputs, input_num, outputs[0]); for(i = 0; i < input_num -1; i++) @@ -142,6 +156,7 @@ static vsi_bool op_setup /* setup input for each add */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if(i == 0) { curr->inputs[0] = inputs[i]; @@ -174,6 +189,7 @@ static vsi_bool op_setup } temp_output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(temp_output_tensor, curr, "Create internal tensor failed", final); curr->outputs[0] = temp_output_tensor->t; } @@ -182,8 +198,10 @@ static vsi_bool op_setup curr->outputs[0] = outputs[0]; } - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } + +final: return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c index 23248759e..6252e4d52 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c @@ -70,6 +70,9 @@ static vsi_bool op_check ) { /*TODO: Check tensor shapes. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -80,6 +83,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.size[0] = inputs[1]->attr.size[0]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c index 56889cbed..0e6fa13e5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c @@ -248,6 +248,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); /* TODO: Add code to comput outputs' shape. 
*/ if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c index a969fa6b5..7afa231b4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c @@ -78,6 +78,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); inputs[BI_LSTM_FW_INPUT_H_STATE] = output_tensor->t; } @@ -91,6 +92,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); inputs[BI_LSTM_BW_INPUT_H_STATE] = output_tensor->t; } @@ -119,6 +121,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -128,6 +132,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -139,6 +145,9 @@ static vsi_bool op_check ) { /*TODO: Check tensor shapes. */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -150,6 +159,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -183,6 +194,9 @@ static vsi_bool op_setup vsi_size_t batch_size = 0; uint32_t time_step = 0; vsi_size_t i = 0; + vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t** merge_tensors = NULL; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -207,6 +221,7 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[BI_LSTM_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); input_tensor = output_tensor->t; } @@ -219,6 +234,7 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[BI_LSTM_AUX_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); aux_input_tensor = output_tensor->t; } } @@ -231,10 +247,12 @@ static vsi_bool op_setup CHECK_PTR_FAIL_GOTO( reshape_output_tensors, "Create buffer fail.", final ); memset( reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); /* split aux input tensor */ if(has_aux_input) @@ -246,10 +264,12 @@ static vsi_bool op_setup CHECK_PTR_FAIL_GOTO( aux_reshape_output_tensors, "Create buffer fail.", final ); memset( aux_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, aux_input_tensor, + status = 
vsi_nn_rnn_split_input_tensor(self, aux_input_tensor, aux_split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); } /* prepare output tensor */ @@ -267,6 +287,7 @@ static vsi_bool op_setup /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); reshape_output_tensors[i] = output_tensor->t; if (has_aux_input) @@ -274,6 +295,7 @@ static vsi_bool op_setup /* reshape for aux split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, aux_split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); aux_reshape_output_tensors[i] = output_tensor->t; } } @@ -291,21 +313,25 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out0 = output_tensor->t; /* lstmcell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out1 = output_tensor->t; /* lstmcell output c_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out2 = output_tensor->t; curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation; curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip; curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias; @@ -373,6 +399,7 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, lstmcell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_reshape_output_tensors_fw[i] = output_tensor->t; } @@ -391,21 +418,25 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out0 = output_tensor->t; /* lstmcell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out1 = output_tensor->t; /* lstmcell output c_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); 
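/* Editor's sketch, not part of the patch: nearly every hunk in this change applies the same
 * two defensive patterns -- VSI_UNREFERENCED(x) to silence unused-parameter warnings in stub
 * callbacks, and CHECK_PTR_FAIL_GOTO(ptr, msg, label) to jump to a shared `final:` cleanup
 * label when an internal node/tensor allocation returns NULL, with `ret` defaulting to FALSE
 * and only set to TRUE on the success path. The standalone example below illustrates that
 * control flow only; the macro definitions, op_setup_sketch() and the malloc() calls are
 * stand-ins assumed for illustration and only approximate the real helpers declared in
 * vsi_nn_error.h -- they are not the driver's actual definitions. */
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>

#define VSI_UNREFERENCED(x) ((void)(x))
#define CHECK_PTR_FAIL_GOTO(ptr, msg, lbl) \
    do { if ((ptr) == NULL) { fprintf(stderr, "%s\n", (msg)); goto lbl; } } while (0)

static bool op_setup_sketch(void *self)
{
    bool  ret    = false;   /* pessimistic default, set to true only on success */
    void *node   = NULL;
    void *tensor = NULL;

    VSI_UNREFERENCED(self); /* silences -Wunused-parameter for stub arguments */

    node = malloc(16);      /* stands in for vsi_nn_internal_new_node() */
    CHECK_PTR_FAIL_GOTO(node, "Create internal node failed", final);

    tensor = malloc(16);    /* stands in for vsi_nn_internal_new_tensor() */
    CHECK_PTR_FAIL_GOTO(tensor, "Create internal tensor fail.", final);

    ret = true;
final:
    free(tensor);           /* shared cleanup runs on both success and failure */
    free(node);
    return ret;
}

int main(void) { return op_setup_sketch(NULL) ? 0 : 1; }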
output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_out2 = output_tensor->t; curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation; curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip; curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias; @@ -473,12 +504,12 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, lstmcell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); lstmcell_reshape_output_tensors_bw[i] = output_tensor->t; } if(curr_param->merge_outputs) { - vsi_nn_tensor_t** merge_tensors = NULL; merge_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); CHECK_PTR_FAIL_GOTO( merge_tensors, "Create buffer fail.", final ); memset( merge_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); @@ -489,6 +520,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } @@ -499,8 +531,10 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 2, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 0; curr->inputs[0] = lstmcell_reshape_output_tensors_fw[i]; curr->inputs[1] = lstmcell_reshape_output_tensors_bw[i]; @@ -512,6 +546,7 @@ static vsi_bool op_setup /* concat lstmcell output, the lstm's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -526,7 +561,6 @@ static vsi_bool op_setup vsi_nn_rnn_transpose_time_major(self, tensor, outputs[BI_LSTM_FW_OUTPUT_OUTPUT], use_virtual_tensor); } - vsi_nn_safe_free( merge_tensors ); } else { @@ -537,12 +571,14 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } /* concat lstmcell output, the lstm's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -565,12 +601,14 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } /* concat lstmcell output, the lstm's 
output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -587,7 +625,10 @@ static vsi_bool op_setup } } + ret = TRUE; + final: + vsi_nn_safe_free( merge_tensors ); vsi_nn_safe_free( split_output_tensors ); vsi_nn_safe_free( aux_split_output_tensors ) vsi_nn_safe_free( reshape_output_tensors ); @@ -595,7 +636,7 @@ static vsi_bool op_setup vsi_nn_safe_free( lstmcell_reshape_output_tensors_fw ); vsi_nn_safe_free( lstmcell_reshape_output_tensors_bw ); - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c index c122de7f5..8b3844de0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c @@ -79,6 +79,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); inputs[BI_RNN_FW_INPUT_H_STATE] = output_tensor->t; } @@ -92,6 +93,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); inputs[BI_RNN_BW_INPUT_H_STATE] = output_tensor->t; } @@ -103,6 +105,7 @@ static vsi_bool setup_op_shapes attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); outputs[BI_RNN_FW_OUTPUT_H_STATE] = output_tensor->t; } @@ -114,6 +117,7 @@ static vsi_bool setup_op_shapes attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); outputs[BI_RNN_BW_OUTPUT_H_STATE] = output_tensor->t; } @@ -162,6 +166,8 @@ static vsi_bool setup_op_shapes } } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -171,6 +177,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -181,6 +189,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -193,6 +204,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -225,6 +238,9 @@ static vsi_bool op_setup vsi_size_t batch_size = 0; vsi_size_t time_step = 0; vsi_size_t i = 0; + vsi_bool ret = FALSE; + vsi_nn_tensor_t** merge_tensors = NULL; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -249,6 +265,7 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[BI_RNN_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); input_tensor = output_tensor->t; } @@ -261,6 +278,7 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[BI_RNN_AUX_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); aux_input_tensor = output_tensor->t; } } @@ -273,10 +291,12 @@ static vsi_bool op_setup CHECK_PTR_FAIL_GOTO( reshape_output_tensors, "Create buffer fail.", final ); memset( reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); /* split aux input tensor */ if(has_aux_input) @@ -288,10 +308,13 @@ static vsi_bool op_setup CHECK_PTR_FAIL_GOTO( aux_reshape_output_tensors, "Create buffer fail.", final ); memset( aux_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, aux_input_tensor, + status = vsi_nn_rnn_split_input_tensor(self, aux_input_tensor, aux_split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, + (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); } /* prepare output tensor */ @@ -309,6 +332,7 @@ static vsi_bool op_setup /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); reshape_output_tensors[i] = output_tensor->t; if (has_aux_input) @@ -316,6 +340,7 @@ static vsi_bool op_setup /* reshape for aux split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, aux_split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); aux_reshape_output_tensors[i] = output_tensor->t; } } @@ -331,12 +356,14 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( 
output_tensor, "Create internal tensor fail.", final ); rnncell_out0 = output_tensor->t; /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_out1 = output_tensor->t; if (reshape_output_tensors[i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && @@ -366,6 +393,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, curr_param->internal_dtype, @@ -399,6 +427,7 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, rnncell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_reshape_output_tensors_fw[i] = output_tensor->t; } @@ -421,12 +450,14 @@ static vsi_bool op_setup &outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); } output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_out0 = output_tensor->t; /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_BW_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_out1 = output_tensor->t; if (reshape_output_tensors[time_step - 1 - i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && @@ -456,6 +487,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, curr_param->internal_dtype, @@ -489,12 +521,12 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, rnncell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_reshape_output_tensors_bw[time_step - 1 - i] = output_tensor->t; } if(curr_param->merge_outputs) { - vsi_nn_tensor_t** merge_tensors = NULL; merge_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); CHECK_PTR_FAIL_GOTO( merge_tensors, "Create buffer fail.", final ); memset( merge_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); @@ -505,6 +537,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } @@ -515,8 +548,10 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 2, 1 ); + 
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 0; curr->inputs[0] = rnncell_reshape_output_tensors_fw[i]; curr->inputs[1] = rnncell_reshape_output_tensors_bw[i]; @@ -528,6 +563,7 @@ static vsi_bool op_setup /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -542,7 +578,6 @@ static vsi_bool op_setup vsi_nn_rnn_transpose_time_major(self, tensor, outputs[BI_RNN_FW_OUTPUT_OUTPUT], use_virtual_tensor); } - vsi_nn_safe_free( merge_tensors ); } else { @@ -553,6 +588,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } @@ -561,6 +597,7 @@ static vsi_bool op_setup if (outputs[BI_RNN_FW_OUTPUT_H_STATE] != NULL) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = last_step_h_state_fw; curr->outputs[0] = outputs[BI_RNN_FW_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); @@ -568,6 +605,7 @@ static vsi_bool op_setup /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -590,6 +628,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } @@ -598,6 +637,7 @@ static vsi_bool op_setup if (outputs[BI_RNN_BW_OUTPUT_H_STATE] != NULL) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = last_step_h_state_bw; curr->outputs[0] = outputs[BI_RNN_BW_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); @@ -605,6 +645,7 @@ static vsi_bool op_setup /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -621,6 +662,7 @@ static vsi_bool op_setup } } + ret = TRUE; final: vsi_nn_safe_free( split_output_tensors ); vsi_nn_safe_free( aux_split_output_tensors ) @@ -628,8 +670,9 @@ static vsi_bool op_setup vsi_nn_safe_free( aux_reshape_output_tensors ); vsi_nn_safe_free( rnncell_reshape_output_tensors_fw ); vsi_nn_safe_free( rnncell_reshape_output_tensors_bw ); + vsi_nn_safe_free( merge_tensors ); - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c index 878c60692..9f7e6ace9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c @@ -81,6 +81,9 @@ 
static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -92,6 +95,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = 1; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c index cac99d089..f53aeb548 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c @@ -149,6 +149,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + out_rank = inputs[0]->attr.dim_num; for (i = 0; i < out_rank; i++) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c index 1eaa7839a..e3de22fff 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c @@ -37,6 +37,7 @@ #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" /* Declare number of input and output. @@ -290,7 +291,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; if ( NULL == self ) { @@ -298,7 +299,7 @@ static vsi_bool op_setup } ret = vsi_nn_op_common_setup(self, inputs, outputs); - if ( _is_dataconvert_op(self, inputs, outputs) ) + if ( _is_dataconvert_op(self, inputs, outputs) ) { vsi_nn_internal_node_t* curr = NULL; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); @@ -309,7 +310,7 @@ static vsi_bool op_setup curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret &= vsi_nn_internal_setup_node(self, curr); } return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index 3e1db0e6d..bade3f959 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -39,6 +39,7 @@ #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" #include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -194,7 +195,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_node_t* curr = NULL; float min = self->nn_param.clip.min; float max = self->nn_param.clip.max; @@ -224,11 +225,12 @@ static vsi_bool op_setup { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); } + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); self->nn_param.clip.local2->is_internal_node = TRUE; } @@ -236,6 +238,8 @@ static vsi_bool op_setup { ret = vsi_nn_op_common_setup(self, inputs, outputs); } + +final: return ret; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_common.c b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c index 354b6ce61..f4e70c55f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_common.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c @@ -38,6 +38,9 @@ vsi_status vsi_nn_op_common_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + 
VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); //TODO: assert_always() return VSI_FAILURE; } /* op_common_init() */ @@ -64,6 +67,7 @@ vsi_bool vsi_nn_op_common_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; @@ -81,5 +85,8 @@ vsi_status vsi_nn_op_common_optimize vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return VSI_SUCCESS; } /* op_common_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c index bb1be6e1a..47b5889df 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c @@ -159,6 +159,8 @@ static vsi_status copy_tensor_to_view vsi_status ret; vsi_nn_concat_lcl_data * data; + VSI_UNREFERENCED(axis); + ret = VSI_SUCCESS; /* Malloc ptr */ data = (vsi_nn_concat_lcl_data *)malloc( sizeof(vsi_nn_concat_lcl_data) ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c index f07a690eb..f802f44e9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c @@ -32,6 +32,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" typedef struct _conv1d_local_data_t { vsi_bool use_ext_pad; @@ -324,12 +325,16 @@ static vsi_bool op_setup memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tensor, "Create tensor fail.", final ); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_PAD, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); front_data = (uint32_t*)\ vsi_nn_internal_new_node_param(curr, sizeof(uint32_t) * inputs[0]->attr.dim_num); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(front_data, curr, "Create internal buffer failed", final); back_data = (uint32_t*)\ vsi_nn_internal_new_node_param(curr, sizeof(uint32_t) * inputs[0]->attr.dim_num); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(back_data, curr, "Create internal buffer failed", final); front_data[0] = p->pad[0]; front_data[1] = 0; @@ -353,6 +358,8 @@ static vsi_bool op_setup } return TRUE; +final: + return FALSE; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c index 03118aaa2..2e1ae75f5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c @@ -38,6 +38,7 @@ #include "vsi_nn_error.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" +#include "vsi_nn_error.h" static vsi_nn_internal_tensor_t * reshape_cell_out ( @@ -54,11 +55,14 @@ static vsi_nn_internal_tensor_t * reshape_cell_out vsi_nn_internal_init_tensor_attr(&attr, &cell_out->attr.dtype, TRUE); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); /* reshape cell_out [w,h,c,n] to [w,h,c,1,n] */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_cell_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + 
CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( reshape_cell_size, curr, "Create internal buffer fail.", final ); reshape_cell_size[0] = cell_out->attr.size[0]; reshape_cell_size[1] = cell_out->attr.size[1]; reshape_cell_size[2] = cell_out->attr.size[2]; @@ -71,6 +75,8 @@ static vsi_nn_internal_tensor_t * reshape_cell_out curr->outputs[0] = output_tensor->t; vsi_nn_internal_setup_node( self, curr ); + +final: return output_tensor; } /* reshape_cell_out() */ @@ -88,11 +94,14 @@ static vsi_nn_internal_tensor_t * reshape_split_out memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &split_out->attr.dtype, TRUE); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); /* reshape [w,h,c,t,n] to [w,h,c,n] */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_split_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( reshape_split_size, curr, "Create internal buffer fail.", final ); reshape_split_size[0] = split_out->attr.size[0]; reshape_split_size[1] = split_out->attr.size[1]; reshape_split_size[2] = split_out->attr.size[2]; @@ -104,10 +113,11 @@ static vsi_nn_internal_tensor_t * reshape_split_out curr->outputs[0] = output_tensor->t; vsi_nn_internal_setup_node( self, curr ); +final: return output_tensor; } /* reshape_split_out() */ -static void split_input_tensor +static vsi_status split_input_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t * input, @@ -115,6 +125,7 @@ static void split_input_tensor uint32_t time_step ) { + vsi_status status = VSI_FAILURE; uint32_t i; vsi_nn_tensor_attr_t attr; vsi_nn_internal_node_t* curr = NULL; @@ -124,7 +135,9 @@ static void split_input_tensor i = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, time_step ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, time_step * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(slices, curr, "Create internal buffer failed", final); curr->node->nn_param.split.axis = 3; /* input_shape [w,h,c,t,n] */ curr->node->nn_param.split.slices_num = time_step; curr->inputs[0] = input; @@ -135,10 +148,15 @@ static void split_input_tensor slices[i] = 1; vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, TRUE); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( output_tensor, curr, "Create internal tensor fail.", final ); curr->outputs[i] = output_tensor->t; output[i] = output_tensor->t; } vsi_nn_internal_setup_node( self, curr ); + + status = VSI_SUCCESS; +final: + return status; } /* split_input_tensor() */ static void trans_output_tensor @@ -182,13 +200,14 @@ static void trans_output_tensor } } /* trans_output_tensor() */ -static void trans_input_tensor +static vsi_status trans_input_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, vsi_nn_tensor_t ** trans_inputs ) { + vsi_status status = VSI_FAILURE; vsi_size_t perm[VSI_NN_MAX_DIM_NUM]; vsi_nn_internal_tensor_t * tmp_tensor = NULL; vsi_nn_conv2d_lstm_param * p = &self->nn_param.conv2d_lstm; @@ -203,6 +222,7 @@ static void trans_input_tensor perm[3] = 3; perm[4] = 4; tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_INPUT], NULL, perm, 5, TRUE); + CHECK_PTR_FAIL_GOTO( 
tmp_tensor, "Create internal tensor fail.", final ); trans_inputs[CONV2D_LSTM_IN_INPUT] = tmp_tensor->t; // [c,w,h,n] --> [w,h,c,n] @@ -211,9 +231,11 @@ static void trans_input_tensor perm[2] = 0; perm[3] = 3; tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_H_STATE], NULL, perm, 4, TRUE); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); trans_inputs[CONV2D_LSTM_IN_H_STATE] = tmp_tensor->t; tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_C_STATE], NULL, perm, 4, TRUE); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); trans_inputs[CONV2D_LSTM_IN_C_STATE] = tmp_tensor->t; } else @@ -222,9 +244,13 @@ static void trans_input_tensor trans_inputs[CONV2D_LSTM_IN_H_STATE] = inputs[CONV2D_LSTM_IN_H_STATE]; trans_inputs[CONV2D_LSTM_IN_C_STATE] = inputs[CONV2D_LSTM_IN_C_STATE]; } + + status = VSI_SUCCESS; +final: + return status; } /* trans_input_tensor() */ -static void create_state_tensor +static vsi_status create_state_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -234,6 +260,7 @@ static void create_state_tensor vsi_size_t out_channel ) { + vsi_status status = VSI_FAILURE; vsi_size_t samples, state_shape[4]; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t * tensor = NULL; @@ -267,6 +294,7 @@ static void create_state_tensor attr.is_const = TRUE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); inputs[CONV2D_LSTM_IN_H_STATE] = tensor->t; } @@ -280,6 +308,7 @@ static void create_state_tensor attr.is_const = TRUE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); inputs[CONV2D_LSTM_IN_C_STATE] = tensor->t; } @@ -291,6 +320,7 @@ static void create_state_tensor attr.vtl = TRUE; attr.is_const = FALSE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); outputs[CONV2D_LSTM_OUT_H_STATE] = tensor->t; } @@ -303,8 +333,12 @@ static void create_state_tensor attr.vtl = TRUE; attr.is_const = FALSE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final ); outputs[CONV2D_LSTM_OUT_C_STATE] = tensor->t; } + status = VSI_SUCCESS; +final: + return status; } /* create_state_tensor() */ static vsi_bool setup_op_shapes @@ -314,6 +348,7 @@ static vsi_bool setup_op_shapes vsi_nn_tensor_t ** outputs ) { + vsi_status status = VSI_FAILURE; vsi_nn_tensor_attr_t attr; vsi_size_t w_out, h_out, samples, timestep, out_channel; vsi_size_t conv_in_shape[4]; @@ -411,7 +446,8 @@ static vsi_bool setup_op_shapes } /* create hstate and cstate input/output if app doesn't provide them */ - create_state_tensor(self, inputs, outputs, w_out, h_out, out_channel); + status = create_state_tensor(self, inputs, outputs, w_out, h_out, out_channel); + CHECK_STATUS_FAIL_GOTO(status, final); /* hidden state output */ if(VSI_NN_DIM_AUTO == outputs[CONV2D_LSTM_OUT_H_STATE]->attr.dim_num) @@ -452,6 +488,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } /* setup_op_shapes() */ static vsi_status op_compute @@ -461,6 +499,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -471,6 +511,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + 
VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -483,6 +526,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -493,6 +538,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + vsi_status status = VSI_FAILURE; vsi_size_t i, timestep, perm[VSI_NN_MAX_DIM_NUM]; vsi_nn_tensor_t * trans_inputs[3] = { NULL }; vsi_nn_tensor_t * conv2dlstm_outputs[3] = { NULL }; @@ -503,6 +549,7 @@ static vsi_bool op_setup vsi_nn_tensor_t * cell_out0 = NULL, * cell_out1 = NULL, * cell_out2 = NULL; vsi_nn_conv2d_lstm_param * p = &self->nn_param.conv2d_lstm; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(attr)); memset(perm, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); @@ -512,7 +559,8 @@ static vsi_bool op_setup setup_op_shapes(self, inputs, outputs); - trans_input_tensor(self, inputs, trans_inputs); + status = trans_input_tensor(self, inputs, trans_inputs); + CHECK_STATUS_FAIL_GOTO(status, final); split_outputs = (vsi_nn_tensor_t **)malloc(sizeof(vsi_nn_tensor_t *) * timestep); CHECK_PTR_FAIL_GOTO( split_outputs, "Create buffer fail.", final ); @@ -522,7 +570,8 @@ static vsi_bool op_setup memset(conv2dlstm_step_outputs, 0, sizeof(vsi_nn_tensor_t *) * timestep); /* split input tensor by time-step */ - split_input_tensor(self, trans_inputs[CONV2D_LSTM_IN_INPUT], split_outputs, (uint32_t)timestep); + status = split_input_tensor(self, trans_inputs[CONV2D_LSTM_IN_INPUT], split_outputs, (uint32_t)timestep); + CHECK_STATUS_FAIL_GOTO(status, final); cell_out0 = cell_out1 = cell_out2 = NULL; step_h_state = trans_inputs[CONV2D_LSTM_IN_H_STATE]; @@ -533,6 +582,7 @@ static vsi_bool op_setup /* reshape for split output */ tmp_tensor = reshape_split_out(self, split_outputs[i]); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); reshape_output = tmp_tensor->t; if((i == timestep - 1) && p->return_sequences == FALSE && p->data_format == CONV2D_LSTM_CHANNELS_FIRST) @@ -543,6 +593,7 @@ static vsi_bool op_setup { vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dtype, TRUE); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); cell_out0 = tmp_tensor->t; } @@ -556,16 +607,19 @@ static vsi_bool op_setup /* conv2d_lstm hstate output */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_H_STATE]->attr.dtype, TRUE); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); cell_out1 = tmp_tensor->t; /* conv2d_lstm cstate output */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_C_STATE]->attr.dtype, TRUE); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); cell_out2 = tmp_tensor->t; } /* create a conv2d_lstm_cell */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONV2D_LSTM_CELL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.conv2d_lstm_cell.filters = p->filters; curr->node->nn_param.conv2d_lstm_cell.activation = p->activation; curr->node->nn_param.conv2d_lstm_cell.recurrent_activation = p->recurrent_activation; @@ -600,6 +654,7 @@ static vsi_bool op_setup { /* store step's 
outputs */ tmp_tensor = reshape_cell_out(self, cell_out0); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); conv2dlstm_step_outputs[i] = tmp_tensor->t; } } @@ -610,6 +665,7 @@ static vsi_bool op_setup { vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dtype, TRUE); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); conv2dlstm_outputs[CONV2D_LSTM_OUT_OUTPUT] = tmp_tensor->t; } else @@ -618,6 +674,7 @@ static vsi_bool op_setup } /* concat all step's output0 data on dimension t --- cell out0 shape: [w,h,c,t,n] */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)timestep, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 3; for(i = 0; i < timestep; i++) { @@ -638,10 +695,11 @@ static vsi_bool op_setup trans_output_tensor(self, conv2dlstm_outputs, outputs); } + ret = TRUE; final: vsi_nn_safe_free(split_outputs); vsi_nn_safe_free(conv2dlstm_step_outputs) - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit @@ -660,6 +718,7 @@ static vsi_status op_init ) { vsi_status status = VSI_SUCCESS; + VSI_UNREFERENCED(self); return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c index 388de95c3..3a31d44db 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -99,8 +99,10 @@ static vsi_nn_internal_tensor_t * create_input_conv attr.vtl = TRUE; attr.is_const = FALSE; input_conv_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(input_conv_out, "Create internal tensor failed", final); input_conv = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(input_conv, "Create internal node failed", final); input_conv->node->nn_param.conv2d.group = 1; input_conv->node->nn_param.conv2d.ksize[0] = p->conv2d.ksize[0]; input_conv->node->nn_param.conv2d.ksize[1] = p->conv2d.ksize[1]; @@ -129,6 +131,7 @@ static vsi_nn_internal_tensor_t * create_input_conv // reshape whcn --> xn reshape_out = reshape_tensor_to_act(self, input_conv_out->t); +final: return reshape_out; } /* create_input_conv() */ @@ -176,8 +179,10 @@ static vsi_nn_internal_tensor_t * create_recurrent_conv attr.vtl = TRUE; attr.is_const = FALSE; recurrent_conv_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(recurrent_conv_out, "Create internal tensor failed", final); recurrent_conv = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(recurrent_conv, "Create internal node failed", final); recurrent_conv->node->nn_param.conv2d.pad_type = VSI_NN_PAD_SAME; recurrent_conv->node->nn_param.conv2d.group = 1; recurrent_conv->node->nn_param.conv2d.ksize[0] = p->conv2d.ksize[0]; @@ -203,6 +208,8 @@ static vsi_nn_internal_tensor_t * create_recurrent_conv // reshape whcn --> xn reshape_out = reshape_tensor_to_act(self, recurrent_conv_out->t); + +final: return reshape_out; } /* create_recurrent_conv() */ @@ -303,6 +310,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + 
VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -313,6 +322,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -325,6 +337,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -344,6 +358,7 @@ static vsi_bool op_setup vsi_nn_internal_tensor_t * reshape_h_out = NULL; vsi_nn_internal_tensor_t * reshape_c_out = NULL; vsi_nn_conv2d_lstm_cell_param * p = &self->nn_param.conv2d_lstm_cell; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp( self ); @@ -359,6 +374,7 @@ static vsi_bool op_setup inputs[CONV2D_LSTM_CELL_IN_KERNEL_I2I + i], inputs[CONV2D_LSTM_CELL_IN_BIAS_I + i] ); + CHECK_PTR_FAIL_GOTO(input_conv_outputs[i], "Create internal tensor failed", final); } /* create recurrent convolution */ @@ -369,10 +385,12 @@ static vsi_bool op_setup inputs[CONV2D_LSTM_CELL_IN_H_STATE], inputs[CONV2D_LSTM_CELL_IN_KERNEL_R2I + i] ); + CHECK_PTR_FAIL_GOTO(recurrent_conv_outputs[i], "Create internal tensor failed", final); } /* activations */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_ACTIVATION, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.lstmunit_activation.cell_clip = 0; curr->node->nn_param.lstmunit_activation.proj_clip = 0; curr->node->nn_param.lstmunit_activation.forget_bias = 0; @@ -384,6 +402,7 @@ static vsi_bool op_setup curr->node->nn_param.lstmunit_activation.recurrent_activation = p->recurrent_activation; reshape_cell_in = reshape_tensor_to_act(self, inputs[CONV2D_LSTM_CELL_IN_C_STATE]); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_cell_in, curr, "Create internal tensor failed", final); curr->inputs[LSTMUNIT_ACT_CSTATE_IN] = reshape_cell_in->t; for(i = 0; i < CONV2D_LSTM_CELL_GATE_NUM; i++) { @@ -392,15 +411,20 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + i] = recurrent_conv_outputs[i]->t; } reshape_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_out, curr, "Create internal tensor failed", final); reshape_h_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_H_STATE]); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_h_out, curr, "Create internal tensor failed", final); reshape_c_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_C_STATE]); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_c_out, curr, "Create internal tensor failed", final); curr->outputs[LSTMUNIT_ACT_OUTPUT] = reshape_out->t; curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = reshape_c_out->t; curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = reshape_h_out->t; vsi_nn_internal_setup_node(self, curr); - return TRUE; + ret = TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_deinit @@ -419,7 +443,7 @@ static vsi_status op_init ) { vsi_status status = VSI_SUCCESS; - + VSI_UNREFERENCED(self); return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c index 1825e3b98..98217903a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c @@ -353,7 +353,7 @@ static vsi_status op_init //self->nn_param.conv3d.local = \ // 
(conv3d_local_data_t*)malloc(sizeof(conv3d_local_data_t)); */ - + VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c index 6aaa61d5c..ed26a68f0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c @@ -36,6 +36,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -47,6 +48,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -72,6 +75,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } @@ -86,6 +91,7 @@ static vsi_bool op_setup int32_t i = 0; uint32_t j = 0; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp( self ); p = (vsi_nn_crop_param *)&(self->nn_param.crop); @@ -96,46 +102,43 @@ static vsi_bool op_setup return FALSE; } - if ( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num ) - { - goto final; - } - - if (p->dims + p->axis == inputs[0]->attr.dim_num) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - for (i = 0; i < p->axis; i++) - { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; - } - for (i = p->axis; i < (int32_t)inputs[0]->attr.dim_num; i++) + if (p->dims + p->axis == inputs[0]->attr.dim_num) { - outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; - } - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - } - else - { - if (p->dims == 1) - { - for (i = 0; i <= p->axis; i++) + for (i = 0; i < p->axis; i++) { - outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; - p->offset[i] = p->offset[0]; + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } - for (i = p->axis + 1; i < (int32_t)inputs[0]->attr.dim_num; i++) + for (i = p->axis; i < (int32_t)inputs[0]->attr.dim_num; i++) { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; } outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; } else { - VSILOGE("Invalid parameter: offset dims!\n"); - return FALSE; + if (p->dims == 1) + { + for (i = 0; i <= p->axis; i++) + { + outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; + p->offset[i] = p->offset[0]; + } + for (i = p->axis + 1; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + else + { + VSILOGE("Invalid parameter: offset dims!\n"); + return FALSE; + } } } -final: for (j = 0; j < self->nn_param.crop.dims; j++) { p->lcl_data->begin_dims[j] = (int32_t)self->nn_param.crop.offset[j]; @@ -151,6 +154,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims; curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims; @@ -163,9 +167,10 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.new_axis_mask = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( 
self, curr ); - return TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_init diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c index d976b13b8..43f8a8f43 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c @@ -136,6 +136,8 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. */ uint32_t i = 0; + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index d1a778528..6d109f00b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -70,6 +70,8 @@ static vsi_bool _is_same_quant { vsi_nn_dtype_t *dtype,*_dtype; + VSI_UNREFERENCED(self); + dtype = &inputs[0]->attr.dtype; _dtype = &outputs[0]->attr.dtype; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c index 7048f5173..ba3a3c511 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconv3d.c @@ -37,6 +37,7 @@ #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" typedef struct _deconv3d_local_data_t { int32_t placeholder; @@ -135,7 +136,9 @@ void _rotate_weight_data( int32_t item_size = vsi_nn_TypeGetBytes(weights->attr.dtype.vx_type); weight_data = vsi_nn_ConvertTensorToData(graph, weights); + CHECK_PTR_FAIL_GOTO( weight_data, "Create weight_data fail.", final ); buffer = (uint8_t*)malloc(item_size * depth_size * weight_ic * weight_oc); + CHECK_PTR_FAIL_GOTO( buffer, "Create buffer fail.", final ); memset(buffer, 0x00, item_size * depth_size * weight_ic * weight_oc); //memcpy(buffer, weight_data, item_size * slice_size * weight_ic * weight_oc); for(oc = 0; oc < weight_oc; oc++) @@ -164,6 +167,8 @@ void _rotate_weight_data( } vsi_nn_CopyDataToTensor( graph, weights, buffer ); + +final: vsi_nn_Free( buffer ); vsi_nn_safe_free( weight_data ); } @@ -263,7 +268,7 @@ static vsi_status op_init //self->nn_param.deconv3d.local = \ // (deconv3d_local_data_t*)malloc(sizeof(deconv3d_local_data_t)); */ - + VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index 09c59d81d..be301ea20 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -36,6 +36,183 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" +#define LOCAL() (local) + +typedef struct _vsi_nn_grouped_deconv2d_param_local_data { + vsi_nn_tensor_t ** input_tensor_group; + vsi_nn_tensor_t ** weight_tensor_group; + vsi_nn_tensor_t ** bias_tensor_group; + vsi_nn_tensor_t ** output_tensor_group; +} vsi_nn_grouped_deconv2d_param_local_data; + +static vsi_status op_grouped_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * inputs[3], + vsi_nn_tensor_t ** outputs, + vx_nn_deconvolution_params_ext2_t param + ) +{ + vsi_bool res; + uint32_t i; + vsi_status status = VSI_FAILURE; + vsi_nn_deconv_param *nn_param = &self->nn_param.deconv; + uint32_t group = nn_param->group; + vsi_nn_grouped_deconv2d_param_local_data *local = + 
(vsi_nn_grouped_deconv2d_param_local_data*)malloc(sizeof(vsi_nn_grouped_deconv2d_param_local_data)); + if (NULL == local) + { + VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + memset(local, 0, sizeof(vsi_nn_grouped_deconv2d_param_local_data)); + /* TODO */ + LOCAL()->input_tensor_group = (vsi_nn_tensor_t **)malloc( + group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->input_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + memset(LOCAL()->input_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, inputs[0], 2, + LOCAL()->input_tensor_group, group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + + LOCAL()->weight_tensor_group = (vsi_nn_tensor_t **)malloc( + group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->weight_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->weight_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 2, + LOCAL()->weight_tensor_group, group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + + LOCAL()->bias_tensor_group = (vsi_nn_tensor_t **)malloc( + group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->bias_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + memset(LOCAL()->bias_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *)); + if (inputs[2] != NULL) + { + res = vsi_nn_CreateTensorGroup(self->graph, inputs[2], 0, + LOCAL()->bias_tensor_group, group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + } + + LOCAL()->output_tensor_group = (vsi_nn_tensor_t **)malloc( + group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->output_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + memset(LOCAL()->output_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, outputs[0], 2, + LOCAL()->output_tensor_group, group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + goto final; + } + + param.ext.channel_group = 1; + for (i = 0; i < group; i++) + { + vx_tensor bias; + + if ( inputs[2] == NULL ) + { + bias = NULL; + } + else + { + bias = LOCAL()->bias_tensor_group[i]->t; + } + + self->n = vxDeconvolutionLayer( + self->graph->g, + LOCAL()->input_tensor_group[i]->t, + LOCAL()->weight_tensor_group[i]->t, + bias, + (vx_nn_deconvolution_params_t *)¶m, + sizeof( vx_nn_deconvolution_params_ext2_t ), + LOCAL()->output_tensor_group[i]->t + ); + if ( NULL == self->n ) + { + VSILOGE("Add vxConvolutionLayer fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__); + status = VSI_FAILURE; + goto final; + } + else + { + // no need to maintain self->n + vxReleaseNode( &self->n ); + status = VSI_SUCCESS; + self->n = NULL; + } + } + +final: + if (LOCAL()) + { + if (LOCAL()->input_tensor_group) + { + for (i = 0; i < group; i++) + { + vsi_safe_release_tensor((LOCAL()->input_tensor_group[i])); + } + vsi_nn_safe_free(LOCAL()->input_tensor_group); + } + if 
(LOCAL()->weight_tensor_group) + { + for (i = 0; i < group; i++) + { + vsi_safe_release_tensor((LOCAL()->weight_tensor_group[i])); + } + vsi_nn_safe_free(LOCAL()->weight_tensor_group); + } + if (LOCAL()->bias_tensor_group != NULL) + { + for (i = 0; i < group; i++) + { + vsi_safe_release_tensor((LOCAL()->bias_tensor_group[i])); + } + vsi_nn_safe_free(LOCAL()->bias_tensor_group); + } + if (LOCAL()->output_tensor_group != NULL) + { + for (i = 0; i < group; i++) + { + vsi_safe_release_tensor((LOCAL()->output_tensor_group[i])); + } + vsi_nn_safe_free(LOCAL()->output_tensor_group); + } + + vsi_nn_safe_free(LOCAL()); + } + return status; +} /* op_compute() */ + #define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ (( in - 1 ) * stride + ksize - pad_1 - pad_2 + output_padding) static vsi_status op_compute @@ -161,18 +338,31 @@ static vsi_status op_compute //param.border_mode; //param.border_const; - self->n = vxDeconvolutionLayer( - self->graph->g, - inputs[0]->t, - weight_tensor->t, - (NULL == inputs[2]) ? NULL : inputs[2]->t, - (vx_nn_deconvolution_params_t *)¶m, - sizeof( vx_nn_deconvolution_params_ext2_t ), - outputs[0]->t - ); - if( NULL != self->n ) + if (self->nn_param.deconv.group > 1 && + self->nn_param.deconv.group < inputs[0]->attr.size[2]) { - status = VSI_SUCCESS; + vsi_nn_tensor_t *inputs_tensors[3] = {NULL}; + + inputs_tensors[0] = inputs[0]; + inputs_tensors[1] = weight_tensor; + inputs_tensors[2] = inputs[2]; + status = op_grouped_compute(self, inputs_tensors, outputs, param ); + } + else + { + self->n = vxDeconvolutionLayer( + self->graph->g, + inputs[0]->t, + weight_tensor->t, + (NULL == inputs[2]) ? NULL : inputs[2]->t, + (vx_nn_deconvolution_params_t *)¶m, + sizeof( vx_nn_deconvolution_params_ext2_t ), + outputs[0]->t + ); + if ( NULL != self->n ) + { + status = VSI_SUCCESS; + } } final: diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c index 4128480bf..1180dbee9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c @@ -64,7 +64,9 @@ static vsi_status op_compute weight_attr.size[2] = weight_attr.size[1]; weight_attr.size[1] = 1; weight_attr.dim_num = 4; - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { weight_tensor = vsi_nn_reshape_tensor( self->graph, inputs[1], weight_attr.size, 4 ); CHECK_PTR_FAIL_GOTO( weight_tensor, "create tensor fail.", final ); @@ -118,6 +120,7 @@ static vsi_status op_compute attr.size[2] = weight_tensor->attr.size[3]; attr.size[3] = weight_tensor->attr.size[2]; permute_tensor = vsi_nn_CreateTensor(self->graph, &attr); + CHECK_PTR_FAIL_GOTO( permute_tensor, "Create tensor fail.", final ); self->n = vxTensorPermuteNode( self->graph->g, weight_tensor->t, permute_tensor->t, perm_array, 4); if ( NULL == self->n ) @@ -135,6 +138,7 @@ static vsi_status op_compute memset(&attr_reverse, 0, sizeof(vsi_nn_tensor_attr_t)); memcpy(&attr_reverse, &tmp_in_tensor->attr, sizeof(vsi_nn_tensor_attr_t) ); reverse_tensor = vsi_nn_CreateTensor(self->graph, &attr_reverse); + CHECK_PTR_FAIL_GOTO( reverse_tensor, "Create tensor fail.", final ); para.axis = axis_reverse; para.numberOfAxis = 2; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c index 551aa59ea..cee8b8c7c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c @@ -36,6 +36,7 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_math.h" +#include "vsi_nn_error.h" static vsi_status vsi_nn_depth2space_compute ( @@ -46,29 +47,38 @@ static vsi_status vsi_nn_depth2space_compute { vsi_status status; vsi_nn_tensor_t *block_size_tensor = NULL; - vx_nn_reorg_params_t param; +#if (VX_DEPTH2SPACE_CRD_MODE_SUPPORT) + vx_nn_reorg_params_ext3_t paramExt; + vx_nn_reorg_params_t *param = (vx_nn_reorg_params_t *)¶mExt.base.base; + size_t size = sizeof(vx_nn_reorg_params_ext3_t); + paramExt.mode = self->nn_param.depth2space.mode; +#else + vx_nn_reorg_params_t base; + vx_nn_reorg_params_t *param = &base; + size_t size = sizeof(vx_nn_reorg_params_t); + memset(param, 0, sizeof(vx_nn_reorg_params_t)); +#endif status = VSI_FAILURE; - memset(¶m, 0, sizeof(vx_nn_reorg_params_t)); block_size_tensor = vsi_nn_VariableToTensor(self, (uint8_t *)&self->nn_param.depth2space.block_size, VSI_NN_TYPE_INT32); - if( NULL == block_size_tensor ) + if ( NULL == block_size_tensor ) { VSILOGE("Create block_size_tensor fail.(depth2space)"); return VSI_FAILURE; } self->nn_param.depth2space.local.block_size_tensor = block_size_tensor; - param.block_size = REQUIRED_IO(block_size_tensor); - param.type = VX_REORG_DEPTH_TO_SPACE; + param->block_size = REQUIRED_IO(block_size_tensor); + param->type = VX_REORG_DEPTH_TO_SPACE; self->n = vxReorgLayer2( self->graph->g, inputs[0]->t, - ¶m, - sizeof(vx_nn_reorg_params_t), + param, + size, outputs[0]->t); - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } @@ -84,6 +94,13 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; +#if (VX_DEPTH2SPACE_CRD_MODE_SUPPORT) + if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_DCR || + self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) + { + status = vsi_nn_depth2space_compute(self, inputs, outputs); + } +#else if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_DCR) { status = vsi_nn_depth2space_compute(self, inputs, outputs); @@ -92,6 +109,7 @@ static vsi_status op_compute { status = vsi_nn_internal_compute_node( self ); } +#endif else { VSILOGE("Unknown depth2space mode.(depth2space)"); @@ -101,24 +119,6 @@ static vsi_status op_compute return status; } /* op_compute() */ -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) - { - return vsi_nn_internal_optimize_node(self, direction ); - } - else - { - return VSI_SUCCESS; - } -} /* op_optimize() */ - static vsi_bool op_check ( vsi_nn_node_t * self, @@ -139,6 +139,7 @@ static vsi_bool op_check return ret; } /* op_check() */ +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) static void op_set_depth2space_param_value(vsi_nn_nn_param_t *nn_param, vsi_nn_op_t type_name, vsi_nn_depth2space_mode_e mode, @@ -160,20 +161,23 @@ static vsi_bool op_set_depth2space_internal vsi_nn_op_t type_name ) { - vsi_bool retn = TRUE; + vsi_bool retn = FALSE; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_depth2space_param_value(&(curr->node->nn_param), type_name, 
self->nn_param.depth2space.mode, self->nn_param.depth2space.block_size); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; retn = vsi_nn_internal_setup_node(self, curr); +final: return retn; } +#endif static vsi_status op_init ( @@ -199,7 +203,7 @@ static vsi_bool op_setup { vsi_bool ret = TRUE; uint32_t size = node->nn_param.depth2space.block_size; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[0]->attr.size[0] = inputs[0]->attr.size[0] * size; @@ -208,10 +212,12 @@ static vsi_bool op_setup outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) if (node->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) { ret = op_set_depth2space_internal(node, inputs, outputs, VSI_NN_OP_DEPTH2SPACE_INTERNAL); } +#endif return ret; } /* op_setup() */ @@ -225,11 +231,13 @@ static vsi_status op_deinit vsi_nn_ReleaseTensor(&(self->nn_param.depth2space.local.block_size_tensor)); } +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) if (self->nn_param.depth2space.mode == VSI_NN_DEPTH2SPACE_CRD) { vsi_nn_internal_deinit_node_wksp(self); } else +#endif { vsi_nn_op_common_deinit(self); } @@ -249,7 +257,7 @@ DEF_OP_REG /* deinit */ op_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ op_optimize, + /* optimize */ NULL, /* input_num */ 1, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c index fa5336755..1b417b168 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" @@ -48,6 +48,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -82,19 +84,21 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_init_node_wksp(self); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LINEAR, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.linear.a = self->nn_param.dropout.ratio; curr->node->nn_param.linear.b = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); +final: return ret; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index 68c6993a0..280e5eee2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -120,6 +120,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + out_rank = inputs[0]->attr.dim_num; for (i = 0; i < out_rank; i++) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c index bcdf270f5..c1f2fc56e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c @@ -122,6 +122,8 @@ static vsi_bool 
op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[1]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c index 68c9fc257..d586d3141 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_dtype_util_prv.h" #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -43,6 +44,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } @@ -85,6 +88,7 @@ static vsi_bool op_check IO_TYPE(D_BF16, D_F32) IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) + IO_TYPE(D_BOOL8, D_BOOL8) END_IO_TYPE_DECL(EXPAND_BROADCAST) if (!VALIDATE_OP_IO_TYPES(EXPAND_BROADCAST, self, inputs, self->input.num, outputs, self->output.num)) { @@ -109,9 +113,11 @@ static vsi_bool op_setup vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* input_0 = NULL; vsi_nn_internal_tensor_t *input_1 = NULL; + vsi_nn_internal_tensor_t* input_2 = NULL; vsi_nn_internal_node_t* mul_node = NULL; vsi_nn_tensor_t* mul_input = NULL; int32_t use_virtual_tensor = 1; + vsi_bool is_same_shape = TRUE; vsi_nn_expand_broadcast_param *p = &self->nn_param.expand_broadcast; vsi_nn_internal_init_node_wksp(self); @@ -120,33 +126,55 @@ static vsi_bool op_setup attr.dim_num = p->dim_num; if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16)) { + inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16)) + { attr.dtype.vx_type = VSI_NN_TYPE_INT32; } - else { + else if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BOOL8) + { + attr.dtype.vx_type = VSI_NN_TYPE_BOOL8; + } + else + { attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; } attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; attr.is_const = TRUE; - for(i = 0; i < p->dim_num; i++) + for (i = 0; i < p->dim_num; i++) { + vsi_size_t sz = i < inputs[0]->attr.dim_num ? 
+ inputs[0]->attr.size[i] : 1; + attr.size[i] = p->shape[i]; + if (( p->shape[i] != sz && p->shape[i] != 1) + && is_same_shape) + { + is_same_shape = FALSE; + } } input_1 = vsi_nn_internal_new_tensor( self, &attr, 1.0f ); + CHECK_PTR_FAIL_GOTO(input_1, "Create tensor failed", final); - if (p->dimensions_num > 0) { + if (p->dimensions_num > 0) + { vsi_nn_internal_node_t* reshape_node = NULL; vsi_size_t* reshape_input_size = NULL; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); input_0 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(input_0, "Create internal tensor failed", final); reshape_node = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(reshape_node, "Create internal node failed", final); reshape_input_size = (vsi_size_t*)vsi_nn_internal_new_node_param(reshape_node, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - for(i = 0; i < p->dim_num; i++) { + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_input_size, reshape_node, + "Create internal buffer failed", final); + for (i = 0; i < p->dim_num; i++) + { reshape_input_size[i] = 1; } - for (i = 0; i < p->dimensions_num; i++) { + for (i = 0; i < p->dimensions_num; i++) + { reshape_input_size[p->dimensions[i]] = p->shape[p->dimensions[i]]; } @@ -156,20 +184,74 @@ static vsi_bool op_setup reshape_node->outputs[0] = input_0->t; vsi_nn_internal_setup_node( self, reshape_node ); mul_input = input_0->t; - } else { + } + else + { mul_input = inputs[0]; } - mul_node = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0 ); - mul_node->inputs[0] = mul_input; - mul_node->inputs[1] = input_1->t; - mul_node->node->nn_param.multiply.scale = 1.0f; - mul_node->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; - mul_node->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; - mul_node->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, mul_node); + if (is_same_shape) + { + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_t* temp_tensor = NULL; + + if (input_1->t->attr.dim_num != mul_input->attr.dim_num) + { + vsi_size_t* shape_sizes = NULL; + uint32_t rank0 = input_1->t->attr.dim_num; + uint32_t rank1 = mul_input->attr.dim_num; + uint32_t rank = vsi_nn_max( rank0, rank1 ); + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &mul_input->attr.dtype, use_virtual_tensor); + input_2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(input_2, "Create internal tensor failed", final); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + shape_sizes = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(shape_sizes, curr, + "Create internal buffer failed", final); + for (i = 0; i < rank; i++) + { + shape_sizes[i] = i < rank1 ? 
mul_input->attr.size[i] : 1; + } + curr->node->nn_param.reshape2.size = shape_sizes; + curr->node->nn_param.reshape2.dim_num = rank; + curr->inputs[0] = mul_input; + curr->outputs[0] = input_2->t; + vsi_nn_internal_setup_node( self, curr ); + + temp_tensor = input_2->t; + } + else + { + temp_tensor = mul_input; + } + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = temp_tensor; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + else + { + mul_node = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0 ); + CHECK_PTR_FAIL_GOTO(mul_node, "Create internal node failed", final); + mul_node->inputs[0] = mul_input; + mul_node->inputs[1] = input_1->t; + mul_node->node->nn_param.multiply.scale = 1.0f; + mul_node->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + mul_node->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; + mul_node->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, mul_node); + } return TRUE; +final: + return FALSE; } static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c index 23be09a06..958b06b10 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c @@ -123,6 +123,9 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /* TODO: Add code to comput outputs' shape. */ if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c index 92b13378c..4a803ad6e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c @@ -171,6 +171,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + in1_rank = inputs[0]->attr.dim_num; in2_rank = inputs[1]->attr.dim_num; out_rank = vsi_nn_max( in1_rank, in2_rank ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index 1f3f281c2..489d3cb96 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -34,6 +34,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util_prv.h" #define _ARG_NUM (1) #define _INPUT_NUM (2) @@ -80,7 +81,31 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "axis", (int32_t)axis ); vsi_nn_kernel_param_add_int32( param, "indices_num", (int32_t)indices_num ); vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims ); - n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param ); + + if (vsi_nn_is_same_data_type(inputs[0], outputs[0]) == FALSE || + vsi_nn_is_same_quant_type(inputs[0], outputs[0])) + { + n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param ); + } + else + { + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* temp_tensors = NULL; + + VSILOGW("gather is no_range_change operation! 
\ + Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); + + memcpy( &attr, &outputs[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + + vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, &temp_tensors, 1, param ); + n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t); + + vsi_safe_release_tensor(temp_tensors); + } if ( n != NULL ) { self->n = (vx_node)n; @@ -187,7 +212,7 @@ static vsi_bool op_setup outputs[0]->attr.size[j] = inputs[0]->attr.size[i]; j++; } - for (i = 0; i < inputs[1]->attr.dim_num; i++) + for (i = 0; i < q_rank; i++) { outputs[0]->attr.size[j] = inputs[1]->attr.size[i]; j++; @@ -198,8 +223,8 @@ static vsi_bool op_setup j++; } } - } + return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c index baf55b1dc..b77a39db3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_elements.c @@ -58,6 +58,7 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_nn_tensor_t* temp_tensors = NULL; vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; uint32_t rank_in = 0; int32_t axis = 0; @@ -66,6 +67,8 @@ static vsi_status op_compute vsi_bool ret = FALSE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_gather_elements_param * p = NULL; + vsi_size_t depth0 = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + vsi_size_t depth1 = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; if ( NULL == self ) { @@ -86,7 +89,31 @@ static vsi_status op_compute // Add params param = vsi_nn_kernel_param_create(); - if ( ret && new_axis0 == new_axis1 ) + if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + vsi_nn_tensor_attr_t attr; + + VSILOGW("gather_element is no_range_change operation! 
\ + Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); + + memcpy( &attr, &outputs[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + } + else + { + temp_tensors = outputs[0]; + } + + if ( ret && new_axis0 == new_axis1 && + inputs[0]->attr.size[0] < GPU_TENSOR_MAX_WIDTH && + inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH && + inputs[1]->attr.size[0] < GPU_TENSOR_MAX_WIDTH && + inputs[1]->attr.size[1] < GPU_TENSOR_MAX_WIDTH && + depth0 < GPU_TENSOR_MAX_WIDTH && + depth1 < GPU_TENSOR_MAX_WIDTH) { vsi_nn_kernel_param_add_int32( param, "axis", new_axis0 ); @@ -95,7 +122,7 @@ static vsi_status op_compute reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, inputs[1], shapes[1], rank_in ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[0], shapes[1], rank_in ); + temp_tensors, shapes[1], rank_in ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "gather_elements", @@ -112,7 +139,13 @@ static vsi_status op_compute self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "gather_elements", inputs, 2, - outputs, 1, param ); + &temp_tensors, 1, param ); + } + + if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + self->n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t); + vsi_safe_release_tensor(temp_tensors); } vsi_nn_kernel_param_release( ¶m ); @@ -164,6 +197,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { uint32_t i = 0; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c index 4246ee6aa..26d47dd7e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c @@ -30,10 +30,11 @@ #include "vsi_nn_prv.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" -#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util_prv.h" #define _ARG_NUM (2) #define _INPUT_NUM (2) @@ -50,19 +51,20 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; vsi_size_t i = 0; - int32_t batch_dims = self->nn_param.gather_nd.batch_dims == 0 ? 0 : 1; + int32_t batch_dims = self->nn_param.gather_nd.batch_dims; vsi_size_t block_size = 1, coord_dim = 1; vsi_size_t *input_size = inputs[0]->attr.size; vsi_size_t dims_num = inputs[0]->attr.dim_num; + batch_dims = batch_dims < 0 ? 
0 : batch_dims; + if (inputs[1]->attr.dim_num > 1) { coord_dim = inputs[1]->attr.size[0]; } if (coord_dim > 4 || (coord_dim > 3 && input_size[dims_num - 1] != 1) - || (batch_dims && coord_dim >= 3)) + || (batch_dims && coord_dim >= 3) || (batch_dims >= (int32_t)vsi_nn_min(dims_num, inputs[1]->attr.dim_num))) { - CHECK_STATUS(status); return status; } @@ -76,7 +78,32 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "block_size", (int32_t)block_size ); vsi_nn_kernel_param_add_int32( param, "coord_dim", (int32_t)coord_dim ); vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims ); - n = vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, outputs, 1, param ); + + if (vsi_nn_is_same_data_type(inputs[0], outputs[0]) == FALSE || + vsi_nn_is_same_quant_type(inputs[0], outputs[0])) + { + n = vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, outputs, 1, param ); + } + else + { + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* temp_tensors = NULL; + + VSILOGW("gather_nd is no_range_change operation! \ + Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); + + memcpy( &attr, &outputs[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + + vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, &temp_tensors, 1, param ); + n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t); + + vsi_safe_release_tensor(temp_tensors); + } + if ( n != NULL ) { self->n = (vx_node)n; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c index 77feaafe3..09e96a1f0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c @@ -78,6 +78,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c index de9059ecf..cc6463f63 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c @@ -155,6 +155,8 @@ static vsi_bool op_setup { vsi_size_t i = 0; + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c index d8c99aa89..86f15f81d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c @@ -51,7 +51,38 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; - status = vsi_nn_internal_compute_node(self); + vsi_nn_kernel_param_t* param = NULL; + int32_t align_corners = self->nn_param.gridsample.align_corners; + vsi_nn_kernel_node_t n; + char kernel_name[128]; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners); + + switch (self->nn_param.gridsample.mode) { + case VSI_NN_INTERPOLATION_BILINEAR: + snprintf(kernel_name, sizeof(kernel_name), "bilinear_grid_sample"); + break; + case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR: + snprintf(kernel_name, sizeof(kernel_name), "nearest_grid_sample"); + 
break; + default: + break; + } + + n = (vx_node)vsi_nn_kernel_selector( + self->graph, kernel_name, inputs, 2, outputs, 1, param); + + if (n == NULL) { + vsi_nn_kernel_param_release(¶m); + status = VSI_FAILURE; + return status; + } + self->n = (vx_node)n; + vsi_nn_kernel_param_release(¶m); + if (self->n) { + status = VSI_SUCCESS; + } return status; } /* op_compute() */ @@ -63,8 +94,12 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - if (VSI_NN_INTERPOLATION_BILINEAR != self->nn_param.gridsample.mode) { - VSILOGE("Only support bilinear_grid_sample now!"); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + if ((VSI_NN_INTERPOLATION_BILINEAR != self->nn_param.gridsample.mode) && + (VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR != + self->nn_param.gridsample.mode)) { + VSILOGE("Only support bilinear or nearest grid sample mode now!"); return FALSE; } @@ -85,8 +120,6 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_nn_internal_node_t* curr = NULL; - if (NULL == self) { return FALSE; } @@ -101,22 +134,6 @@ static vsi_bool op_setup } } - if (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.gridsample.mode) { - vsi_nn_internal_init_node_wksp(self); - curr = vsi_nn_internal_new_node( - self, VSI_NN_OP_BILINEAR_GRID_SAMPLE, 2, 1); - curr->node->nn_param.bilinear_grid_sample.align_corners = - self->nn_param.gridsample.align_corners; - curr->node->nn_param.bilinear_grid_sample.padding_mode = - self->nn_param.gridsample.padding_mode; - curr->node->nn_param.bilinear_grid_sample.const_val = - self->nn_param.gridsample.const_val; - curr->inputs[0] = inputs[0]; - curr->inputs[1] = inputs[1]; - curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); - } - return TRUE; } /* op_setup() */ @@ -129,7 +146,7 @@ static vsi_status op_init //self->nn_param.grid_sample.local = \ // (grid_sample_local_data_t*)malloc(sizeof(grid_sample_local_data_t)); */ - + VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ @@ -140,7 +157,7 @@ static vsi_status op_deinit { vsi_status status = VSI_SUCCESS; - status = vsi_nn_internal_deinit_node_wksp(self); + status = vsi_nn_op_common_deinit(self); return status; } /* op_deinit() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c index 5cfeddf58..a40497949 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c @@ -77,6 +77,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -103,6 +105,7 @@ static vsi_bool op_setup { vsi_nn_internal_node_t* curr = NULL; vsi_nn_grouped_conv1d_param* p = &self->nn_param.grouped_conv1d; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp(self); @@ -125,7 +128,9 @@ static vsi_bool op_setup p->local->input = _expand_tensor_dim( self->graph, inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); - if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + if (inputs[1]->attr.dtype.qnt_type != + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && + inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { p->local->weight = _expand_tensor_dim( self->graph, inputs[1], inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); @@ -159,6 +164,7 @@ static vsi_bool op_setup curr = vsi_nn_internal_new_node(self, VSI_NN_OP_GROUPED_CONV2D, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create 
internal node failed", final); curr->inputs[0] = p->local->input; curr->inputs[1] = p->local->weight; curr->inputs[2] = inputs[2]; @@ -179,10 +185,10 @@ static vsi_bool op_setup curr->node->nn_param.grouped_conv2d.pad_type = p->pad_type; curr->node->nn_param.grouped_conv2d.pad_mode = p->pad_mode; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); final: - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_init diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c index 00545d3c9..629486c69 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -77,6 +77,7 @@ static vsi_bool _is_3d_group_norm vsi_nn_tensor_t ** inputs ) { + VSI_UNREFERENCED(self); if ( 3 == inputs[0]->attr.dim_num ) { return TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c index ad4c2a741..24acf6f94 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c @@ -39,13 +39,14 @@ #include "utils/vsi_nn_tensor_op.h" #include "utils/vsi_nn_util.h" #include "ops/vsi_nn_op_gru.h" +#include "vsi_nn_error.h" typedef struct _vsi_nn_gru_local { void * placeholder; } vsi_nn_gru_local; -static void create_state_tensor +static vsi_status create_state_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -54,6 +55,7 @@ static void create_state_tensor vsi_size_t hidden_size ) { + vsi_status status = VSI_FAILURE; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t * tensor = NULL; @@ -67,6 +69,7 @@ static void create_state_tensor attr.vtl = TRUE; attr.is_const = FALSE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor, "Create internal tensor failed", final); outputs[GRU_OUT_H_STATE] = tensor->t; } @@ -80,9 +83,13 @@ static void create_state_tensor attr.is_const = TRUE; tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor, "Create internal tensor failed", final); inputs[GRU_IN_H_STATE] = tensor->t; } + status = VSI_SUCCESS; +final: + return status; } /* create_state_tensor() */ static vsi_bool setup_op_shapes @@ -92,8 +99,10 @@ static vsi_bool setup_op_shapes vsi_nn_tensor_t ** outputs ) { + vsi_status status = VSI_FAILURE; vsi_nn_gru_param * p = &self->nn_param.gru; vsi_size_t batch_size = 0, hidden_size = 0, timesetp = 0; + vsi_bool ret = FALSE; hidden_size = p->num_units; if(p->time_major) @@ -137,7 +146,8 @@ static vsi_bool setup_op_shapes } /* create hstate input/output if app doesn't provide them */ - create_state_tensor(self, inputs, outputs, batch_size, hidden_size); + status = create_state_tensor(self, inputs, outputs, batch_size, hidden_size); + CHECK_STATUS_FAIL_GOTO(status, final); /* hstate output */ if(VSI_NN_DIM_AUTO == outputs[GRU_OUT_H_STATE]->attr.dim_num) @@ -147,7 +157,9 @@ static vsi_bool setup_op_shapes outputs[GRU_OUT_H_STATE]->attr.size[1] = batch_size; } - return TRUE; + ret = TRUE; +final: + return ret; } /* setup_op_shapes() */ static vsi_status op_compute @@ -157,6 +169,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } @@ -167,6 +181,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } @@ -187,6 +204,8 @@ 
static vsi_bool op_setup vsi_nn_tensor_t ** gru_step_outputs = NULL; vsi_nn_internal_tensor_t * tmp_tensor = NULL; vsi_nn_tensor_attr_t attr; + vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(attr)); vsi_nn_internal_init_node_wksp( self ); @@ -211,15 +230,19 @@ static vsi_bool op_setup /* transpose to time_major */ tmp_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); input_tensor = tmp_tensor->t; } split_outputs = (vsi_nn_tensor_t **)malloc(timestep * sizeof(vsi_nn_tensor_t *)); + CHECK_PTR_FAIL_GOTO( split_outputs, "Create buffer fail.", final ); memset(split_outputs, 0, timestep * sizeof(vsi_nn_tensor_t *)); gru_step_outputs = (vsi_nn_tensor_t **)malloc(timestep * sizeof(vsi_nn_tensor_t *)); + CHECK_PTR_FAIL_GOTO( gru_step_outputs, "Create buffer fail.", final ); memset(gru_step_outputs, 0, timestep * sizeof(vsi_nn_tensor_t *)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, split_outputs, (uint32_t)timestep, use_virtual_tensor); + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_outputs, (uint32_t)timestep, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); //vsi_nn_rnn_data_check_aligned(self, split_outputs, timestep, use_virtual_tensor); ?? @@ -233,6 +256,7 @@ static vsi_bool op_setup /* reshape split_outputs to cell_input */ tmp_tensor = vsi_nn_rnn_reshape_split_output( self, split_outputs[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); reshape_output = tmp_tensor->t; /* grucell output */ @@ -245,6 +269,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUT_OUTPUT]->attr.dtype, use_virtual_tensor); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); cell_out0 = tmp_tensor->t; } @@ -254,6 +279,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); cell_out1 = tmp_tensor->t; } else @@ -263,6 +289,7 @@ static vsi_bool op_setup /* create a grucell */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.grucell.num_units = p->num_units; curr->node->nn_param.grucell.activation = p->activation; curr->node->nn_param.grucell.recurrent_activation = p->recurrent_activation; @@ -292,6 +319,7 @@ static vsi_bool op_setup /* reshape every step output to 3-dims for GRU_OUTPUT */ tmp_tensor = vsi_nn_rnn_reshape_cell_output(self, cell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); gru_step_outputs[i] = tmp_tensor->t; } } /* for(i = 0; i < timestep; i++) end */ @@ -305,11 +333,13 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); output_tensor = tmp_tensor->t; } /* concat all grucell output0, the reshaped grucell output shape: [hidden_size, batch, 1] */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, timestep, 1 ); + 
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; /* concat the cell_outs in timestep */ for( i = 0; i < timestep; i++ ) { @@ -326,10 +356,12 @@ static vsi_bool op_setup } } + ret = TRUE; +final: vsi_nn_safe_free( split_outputs ); vsi_nn_safe_free( gru_step_outputs ); - return TRUE; + return ret; } static vsi_status op_deinit @@ -350,6 +382,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c index 5ac947b9f..9d7e34897 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c @@ -93,6 +93,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[GRU_INPUT_H_STATE] = output_tensor->t; } @@ -103,6 +104,7 @@ static vsi_bool setup_op_shapes memcpy( &attr.dtype, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); attr.vtl = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[GRU_OUTPUT_H_STATE] = output_tensor->t; } @@ -132,6 +134,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -141,6 +145,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -151,6 +157,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -163,6 +172,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -187,6 +198,7 @@ static vsi_bool op_setup_default vsi_size_t time_step = 0; vsi_size_t i = 0; vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -211,6 +223,7 @@ static vsi_bool op_setup_default /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); input_tensor = output_tensor->t; } @@ -222,9 +235,12 @@ static vsi_bool op_setup_default CHECK_PTR_FAIL_GOTO( grucell_reshape_output_tensors, "Create buffer fail.", final ); memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, + (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); last_step_h_state = inputs[GRU_INPUT_H_STATE]; for( i = 0; i < time_step; i++ ) @@ -236,6 +252,7 @@ static vsi_bool op_setup_default /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); reshape_output = output_tensor->t; /* grucell output */ @@ -248,6 +265,7 @@ static vsi_bool op_setup_default vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); grucell_out0 = output_tensor->t; } @@ -257,6 +275,7 @@ static vsi_bool op_setup_default vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); grucell_out1 = output_tensor->t; } else @@ -265,13 +284,14 @@ static vsi_bool op_setup_default } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.grucell_ovxlib.num_units = curr_param->num_units; curr->node->nn_param.grucell_ovxlib.activation = curr_param->activation; curr->node->nn_param.grucell_ovxlib.recurrent_activation = curr_param->recurrent_activation; curr->node->nn_param.grucell_ovxlib.linear_before_reset = curr_param->linear_before_reset; if ( reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) { - int32_t k = 0; + size_t k = 0; for (k = 0; k < _cnt_of_array( curr_param->internal_dtype ); k++) { if (curr_param->internal_dtype[k].vx_type == VSI_NN_TYPE_NONE) @@ -316,6 +336,7 @@ static vsi_bool op_setup_default /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, 
grucell_out0, (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); grucell_reshape_output_tensors[i] = output_tensor->t; } } @@ -328,12 +349,14 @@ static vsi_bool op_setup_default vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); tensor = output_tensor->t; } /* concat grucell output, the gru's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -383,6 +406,8 @@ static vsi_bool op_setup_optimized vsi_nn_internal_tensor_t* input_weight_for_nn = NULL; vsi_size_t permute_in_perm[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t reshape_size[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -409,57 +434,69 @@ static vsi_bool op_setup_optimized /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); input_tensor = output_tensor->t; } /* input FC */ p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRU_INPUT_WEIGHT_I2R], inputs[GRU_INPUT_WEIGHT_I2Z], inputs[GRU_INPUT_WEIGHT_I2C]); + CHECK_PTR_FAIL_GOTO(p->local->weights_input, "Create tensor failed", final); p->local->weights_input->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST); p->local->weights_recurrent = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRU_INPUT_WEIGHT_H2R], inputs[GRU_INPUT_WEIGHT_H2Z], inputs[GRU_INPUT_WEIGHT_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->weights_recurrent, "Create tensor failed", final); p->local->weights_recurrent->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->weights_recurrent, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr, inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final); p->local->bias_r->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_r, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final); p->local->bias_z->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_z, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr, inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->bias_c, "Create tensor failed", final); p->local->bias_c->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_c, VSI_NN_TENSOR_ATTR_CONST); /* prepare weight and bias for recurrent fc */ recurrent_weight_for_nn = vsi_nn_rnn_prepare_weight_for_nn_fc(self, p->local->weights_recurrent, 1, 1); + CHECK_PTR_FAIL_GOTO(recurrent_weight_for_nn, "Create internal tensor failed", final); /* transpose input from [T,B,D] to [D,T,B] */ permute_in_perm[0] = 1; permute_in_perm[1] = 2; 
permute_in_perm[2] = 0; tmp_tensor = vsi_nn_rnn_create_permute(self, input_tensor, NULL, permute_in_perm, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create internal tensor fail.", final ); reshape_size[0] = tmp_tensor->t->attr.size[0]; reshape_size[1] = tmp_tensor->t->attr.size[1]; reshape_size[2] = tmp_tensor->t->attr.size[2]; reshape_size[3] = 1; /* new batch dim */ tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 4, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); input_weight_for_nn = vsi_nn_rnn_prepare_weight_for_nn_fc(self, p->local->weights_input, 1, 1); + CHECK_PTR_FAIL_GOTO(input_weight_for_nn, "Create internal tensor failed", final); vsi_nn_internal_init_tensor_attr(&attr, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT], use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.conv2d.ksize[0] = 1; curr->node->nn_param.conv2d.ksize[1] = 1; curr->node->nn_param.conv2d.stride[0] = 1; @@ -483,11 +520,13 @@ static vsi_bool op_setup_optimized reshape_size[1] = output_tensor->t->attr.size[1]; reshape_size[2] = output_tensor->t->attr.size[2]; output_tensor = vsi_nn_rnn_create_reshape(self, output_tensor->t, NULL, reshape_size, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); permute_in_perm[0] = 0; permute_in_perm[1] = 2; permute_in_perm[2] = 1; tmp_tensor = vsi_nn_rnn_create_permute(self, output_tensor->t, NULL, permute_in_perm, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); /* split input tensor */ split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); @@ -497,19 +536,24 @@ static vsi_bool op_setup_optimized CHECK_PTR_FAIL_GOTO( grucell_reshape_output_tensors, "Create buffer fail.", final ); memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, tmp_tensor->t, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_split_input_tensor(self, tmp_tensor->t, split_output_tensors, + (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); memcpy(&attr, &p->local->bias_r->attr, sizeof(vsi_nn_tensor_attr_t)); attr.size[1] = 1; attr.dim_num = 2; p->local->cond_zeros = vsi_nn_CreateTensorWithDefault(self->graph, &attr, 0.0); + CHECK_PTR_FAIL_GOTO(p->local->cond_zeros, "Create tensor failed", final); last_step_h_state = inputs[GRU_INPUT_H_STATE]; permute_in_perm[0] = 1; permute_in_perm[1] = 0; tmp_tensor = vsi_nn_rnn_create_permute(self, last_step_h_state, NULL, permute_in_perm, 2, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); last_step_h_state = tmp_tensor->t; for( i = 0; i < time_step; i++ ) @@ -525,6 +569,7 @@ static vsi_bool op_setup_optimized /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], (uint32_t)(unit_nums * 3), 
use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); input_fc_output = output_tensor->t; /* last_step_h_state is not batch first, no need to permute */ @@ -533,13 +578,16 @@ static vsi_bool op_setup_optimized reshape_size[1] = 1/*kernel_h*/; reshape_size[0] = last_step_h_state->attr.size[0]; tmp = vsi_nn_rnn_create_reshape(self, last_step_h_state, NULL, reshape_size, 4, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); vsi_nn_internal_init_tensor_attr(&attr, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN], use_virtual_tensor); tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.conv2d.ksize[0] = 1; curr->node->nn_param.conv2d.ksize[1] = 1; curr->node->nn_param.conv2d.stride[0] = 1; @@ -562,37 +610,35 @@ static vsi_bool op_setup_optimized reshape_size[1] = recurrent_weight_for_nn->t->attr.size[3]; reshape_size[0] = batch_size; tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 2, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); recurrent_fc_output = tmp_tensor->t; /* grucell output */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); grucell_out0 = output_tensor->t; /* grucell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); grucell_out1 = output_tensor->t; curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[GRUCELL_ACTIVATION_INPUT_H_STATE] = last_step_h_state; - if(0) - { - curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = input_fc_output; - curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = NULL; - curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = NULL; - curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_R] = recurrent_fc_output; - curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_Z] = NULL; - curr->inputs[GRUCELL_ACTIVATION_INPUT_RECURRENT_FC_C] = NULL; - } - else { splited_input_fc_output_tensors = vsi_nn_create_split(self, input_fc_output, 1, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_input_fc_output_tensors, curr, + "Create internal tensor failed", final); splited_recurrent_fc_output_tensors = vsi_nn_create_split(self, recurrent_fc_output, 1, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_recurrent_fc_output_tensors, curr, + "Create internal tensor failed", final); curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t; @@ -623,8 +669,10 @@ static vsi_bool op_setup_optimized vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); tmp_tensor = 
vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 1; for( i = 0; i < time_step; i++ ) { @@ -634,9 +682,10 @@ static vsi_bool op_setup_optimized vsi_nn_internal_setup_node(self, curr); reshape_size[0] = batch_size; - reshape_size[1] = -1; + reshape_size[1] = (vsi_size_t)-1; reshape_size[2] = time_step; tmp_tensor = vsi_nn_rnn_create_reshape(self, tmp_tensor->t, NULL, reshape_size, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); if(p->time_major) { @@ -657,11 +706,12 @@ static vsi_bool op_setup_optimized vsi_nn_rnn_create_permute(self, last_step_h_state, outputs[GRU_OUTPUT_H_STATE], permute_in_perm, 2, use_virtual_tensor); + ret = TRUE; final: vsi_nn_safe_free( split_output_tensors ); vsi_nn_safe_free( grucell_reshape_output_tensors ); - return TRUE; + return ret; } /* op_setup_optimized() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c index 18ae5545a..2fc49d033 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c @@ -39,6 +39,7 @@ #include "utils/vsi_nn_tensor_op.h" #include "utils/vsi_nn_util.h" #include "ops/vsi_nn_op_grucell.h" +#include "vsi_nn_error.h" typedef struct _vsi_nn_grucell_local { @@ -64,6 +65,7 @@ static vsi_nn_internal_tensor_t * _create_fc { /* create zero bias for NN/TP */ tmp_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "Create tensor fail.", final ); bias_tensor = tmp_tensor->t; } else @@ -85,8 +87,10 @@ static vsi_nn_internal_tensor_t * _create_fc attr.vtl = TRUE; attr.is_const = FALSE; fc_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(fc_out, "Create internal tensor failed", final); fc_node = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(fc_node, "Create internal node failed", final); fc_node->node->nn_param.fcl.axis = 0; fc_node->node->nn_param.fcl.weights = (uint32_t)weight->attr.size[1]; fc_node->inputs[0] = input; @@ -95,6 +99,7 @@ static vsi_nn_internal_tensor_t * _create_fc fc_node->outputs[0] = fc_out->t; vsi_nn_internal_setup_node(self, fc_node); +final: return fc_out; } /* () */ @@ -136,6 +141,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -146,6 +153,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } @@ -167,6 +177,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } @@ -200,6 +212,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_IN_KERNEL_I2Z + i], inputs[GRUCELL_IN_BIAS_I2Z + i] ); + CHECK_PTR_FAIL_GOTO(input_fc_outputs[i], "Create internal tensor failed", final); } /* create hstate fc */ @@ -211,6 +224,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_IN_KERNEL_R2Z + i], inputs[GRUCELL_IN_BIAS_R2Z + i] ); + CHECK_PTR_FAIL_GOTO(hstate_fc_outputs[i], "Create internal 
tensor failed", final); } memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -228,8 +242,10 @@ static vsi_bool op_setup_default attr.vtl = TRUE; attr.is_const = FALSE; h_times_r = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(h_times_r, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_H_TIMES_ACTIVATION_R, 3, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.grucell_h_times_activation_r.recurrent_activation = p->recurrent_activation; curr->inputs[0] = inputs[GRUCELL_IN_H_STATE]; curr->inputs[1] = input_fc_outputs[GRUCELL_GATES_R]->t; @@ -243,8 +259,10 @@ static vsi_bool op_setup_default inputs[GRUCELL_IN_KERNEL_R2H], inputs[GRUCELL_IN_BIAS_R2H] ); + CHECK_PTR_FAIL_GOTO(hstate_fc_outputs[GRUCELL_GATES_H], "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_Z_H, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.grucell_activation_z_h.activation = p->activation; curr->node->nn_param.grucell_activation_z_h.recurrent_activation = p->recurrent_activation; curr->inputs[GRUCELL_ACT_Z_H_HSTATE] = inputs[GRUCELL_IN_H_STATE]; @@ -257,6 +275,8 @@ static vsi_bool op_setup_default vsi_nn_internal_setup_node(self, curr); return TRUE; +final: + return FALSE; } #endif @@ -287,6 +307,7 @@ static vsi_bool op_setup_reset_after inputs[GRUCELL_IN_KERNEL_I2Z + i], inputs[GRUCELL_IN_BIAS_I2Z + i] ); + CHECK_PTR_FAIL_GOTO(input_fc_outputs[i], "Create internal tensor failed", final); } /* create hstate fc */ @@ -298,9 +319,11 @@ static vsi_bool op_setup_reset_after inputs[GRUCELL_IN_KERNEL_R2Z + i], inputs[GRUCELL_IN_BIAS_R2Z + i] ); + CHECK_PTR_FAIL_GOTO(hstate_fc_outputs[i], "Create internal tensor failed", final); } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.grucell_activation.activation = p->activation; curr->node->nn_param.grucell_activation.recurrent_activation = p->recurrent_activation; curr->inputs[GRUCELL_ACT_H_STATE] = inputs[GRUCELL_IN_H_STATE]; @@ -315,6 +338,8 @@ static vsi_bool op_setup_reset_after vsi_nn_internal_setup_node(self, curr); return TRUE; +final: + return FALSE; } static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c index 4fcd61200..1478eac41 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c @@ -75,6 +75,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -86,6 +89,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num) { outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num = \ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c index 42fc9fbc3..a77d05dd6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c @@ -73,6 +73,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c index ba9b540cf..cf35692d0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c @@ -70,6 +70,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -81,6 +84,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if(VSI_NN_DIM_AUTO == outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num) { outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num = \ @@ -108,6 +113,8 @@ static vsi_status op_init { vsi_status status = VSI_SUCCESS; + VSI_UNREFERENCED(self); + return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c index 46eff0d9d..7980d4281 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c @@ -76,6 +76,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -87,6 +90,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dim_num) { outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dim_num = \ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c index e1e448077..58dc548e6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c @@ -81,6 +81,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -92,6 +95,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { outputs[0]->attr.dim_num = \ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c index 020ab32e6..432ce2032 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c @@ -35,12 +35,12 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "ops/vsi_nn_op_grucell_ovxlib.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" #include "utils/vsi_nn_tensor_op.h" #include "utils/vsi_nn_util.h" +#include "vsi_nn_error.h" #define USE_GRUCELL_ACTIVATION @@ -78,8 +78,10 @@ static vsi_nn_internal_tensor_t* create_multiply memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->inputs[0] = input1; tmp_inode->inputs[1] = input2; @@ -89,6 +91,7 @@ static vsi_nn_internal_tensor_t* create_multiply tmp_inode->outputs[0] = tensor1->t; vsi_nn_internal_setup_node(self, tmp_inode); +final: return tensor1; } @@ -125,6 +128,7 @@ static vsi_bool setup_op_shapes attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[GRUCELL_INPUT_H_STATE] = output_tensor->t; } @@ -133,6 +137,7 @@ static vsi_bool setup_op_shapes vsi_nn_internal_init_tensor_attr(&attr, &outputs[GRUCELL_OUTPUT_OUTPUT]->attr.dtype, TRUE); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[GRUCELL_OUTPUT_H_STATE] = output_tensor->t; } @@ -156,6 +161,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -165,6 +172,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -175,6 +184,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -187,6 +199,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -205,24 +219,31 @@ static vsi_bool op_setup_float vsi_nn_internal_tensor_t* tensor_rt = NULL; vsi_nn_internal_tensor_t* input_hstate = NULL; vsi_nn_internal_tensor_t** splited_tensors = NULL; + vsi_bool ret = FALSE; p->local->weights_update = vsi_nn_ConcatTensor(self->graph, 0, inputs[GRUCELL_INPUT_WEIGHT_I2Z], inputs[GRUCELL_INPUT_WEIGHT_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->weights_update, "Create tensor failed", final); p->local->weights_reset = vsi_nn_ConcatTensor(self->graph, 0, inputs[GRUCELL_INPUT_WEIGHT_I2R], inputs[GRUCELL_INPUT_WEIGHT_H2R]); + CHECK_PTR_FAIL_GOTO(p->local->weights_reset, "Create tensor failed", final); p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final); p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr, inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final); p->local->bias_z_r = vsi_nn_ConcatTensor(self->graph, 0, p->local->bias_z, p->local->bias_r); + CHECK_PTR_FAIL_GOTO(p->local->bias_z_r, "Create tensor failed", final); p->local->weights_z_r = vsi_nn_ConcatTensor(self->graph, 1, p->local->weights_update, p->local->weights_reset); + CHECK_PTR_FAIL_GOTO(p->local->weights_z_r, "Create tensor failed", final); p->local->weights_c = vsi_nn_ConcatTensor(self->graph, 0, inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_WEIGHT_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->weights_c, "Create tensor failed", final); p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr, inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->bias_c, "Create tensor failed", final); - vsi_safe_release_tensor(p->local->bias_z); - vsi_safe_release_tensor(p->local->bias_r); p->local->bias_z_r->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_z_r, VSI_NN_TENSOR_ATTR_CONST); p->local->weights_z_r->attr.is_const = TRUE; @@ -234,6 +255,7 @@ static vsi_bool op_setup_float input_hstate = vsi_nn_rnn_create_concat(self, 0, use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]); + CHECK_PTR_FAIL_GOTO(input_hstate, "Create internal tensor failed", final); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( input_hstate->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -247,8 +269,10 @@ static vsi_bool op_setup_float } tmp_tensor = vsi_nn_rnn_create_tp_fc(self, input_hstate->t, p->local->weights_z_r, p->local->bias_z_r, &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); splited_tensors = vsi_nn_create_split(self, tmp_tensor->t, 0, 2, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(splited_tensors, "Create internal tensor failed", final); /* reset Gate activations */ tensor_rt = vsi_nn_rnn_create_activation(self, @@ -256,6 +280,7 @@ static vsi_bool op_setup_float p->local->gate_activation, &splited_tensors[1]->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor_rt, "Create internal tensor failed", final); /* if linear_before_reset=0: ht=g(input*w_ic + (r.hstate)*w_hc + b_ic + b_hc)*/ 
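/* The branch below follows the two ONNX-style GRU candidate formulas already
 * named in the surrounding comments (sketch, using this function's locals;
 * (*) is element-wise multiply, g() the candidate activation, typically tanh):
 *
 *   linear_before_reset == 0:
 *     h~t = g( [x_t, r_t (*) h_{t-1}] x W_c + (b_ic + b_hc) )
 *     The reset gate scales h_{t-1} BEFORE the recurrent matmul, so a single
 *     fused FC over concat(input, r*h_state) with weights_c/bias_c suffices.
 *
 *   linear_before_reset != 0:
 *     h~t = g( x_t x W_ic + b_ic + r_t (*) (h_{t-1} x W_hc + b_hc) )
 *     The recurrent FC runs first and the reset gate scales its result, so the
 *     input-side and recurrent-side FCs stay separate.
 */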
if ( p->linear_before_reset == 0 ) @@ -263,10 +288,12 @@ static vsi_bool op_setup_float /* r{t} * h{t-1}*/ tensor_rt = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, tensor_rt->t, inputs[GRUCELL_INPUT_H_STATE], &tensor_rt->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor_rt, "Create internal tensor failed", final); /* [x{t}, r{t}] */ tmp_tensor = vsi_nn_rnn_create_concat(self, 0, use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], tensor_rt->t); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( tmp_tensor->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -281,6 +308,7 @@ static vsi_bool op_setup_float /* W{c} x [x{t}, r{t}] */ tmp_tensor = vsi_nn_rnn_create_tp_fc(self, tmp_tensor->t, p->local->weights_c, p->local->bias_c, &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); } /* if linear_before_reset!=0: ht=g(input*w_ic + (r.(hstate*w_hc + b_hc)) + b_ic)*/ else @@ -298,19 +326,24 @@ static vsi_bool op_setup_float /* r.(hstate*w_hc + b_hc) */ tmp_tensor = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], inputs[GRUCELL_INPUT_WEIGHT_H2C], inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); tensor_rt = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, tensor_rt->t, tmp_tensor->t, &tensor_rt->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor_rt, "Create internal tensor failed", final); /* input*w_ic + b_ic */ tmp_tensor = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_BIAS_I2C], &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_ADD, tensor_rt->t, tmp_tensor->t, &tensor_rt->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); } #define USE_GRUCELL_ACTIVATION #ifdef USE_GRUCELL_ACTIVATION curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = splited_tensors[0]->t; curr->inputs[1] = tmp_tensor->t; curr->inputs[2] = inputs[GRUCELL_INPUT_H_STATE]; @@ -342,6 +375,7 @@ static vsi_bool op_setup_float tmp_tensor = vsi_nn_rnn_create_binary_operator(self, VSI_NN_OP_MULTIPLY, tensor_zt->t, tmp_tensor->t, &tensor_ht_->t->attr.dtype, use_virtual_tensor); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = tmp_tensor->t; curr->inputs[1] = tensor_ht_->t; curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; @@ -349,12 +383,18 @@ static vsi_bool op_setup_float } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); #endif - return TRUE; + + ret = TRUE; +final: + vsi_safe_release_tensor(p->local->bias_z); + vsi_safe_release_tensor(p->local->bias_r); + return ret; } static vsi_bool op_setup_float_cudnn @@ -379,24 +419,29 @@ static vsi_bool op_setup_float_cudnn p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRUCELL_INPUT_WEIGHT_I2R], 
inputs[GRUCELL_INPUT_WEIGHT_I2Z], inputs[GRUCELL_INPUT_WEIGHT_I2C]); + CHECK_PTR_FAIL_GOTO(p->local->weights_input, "Create tensor failed", final); p->local->weights_input->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST); p->local->weights_recurrent = vsi_nn_ConcatTensor(self->graph, 1, inputs[GRUCELL_INPUT_WEIGHT_H2R], inputs[GRUCELL_INPUT_WEIGHT_H2Z], inputs[GRUCELL_INPUT_WEIGHT_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->weights_recurrent, "Create tensor failed", final); p->local->weights_recurrent->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->weights_recurrent, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr, inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final); p->local->bias_r->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_r, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final); p->local->bias_z->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_z, VSI_NN_TENSOR_ATTR_CONST); p->local->bias_c = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2C]->attr, inputs[GRUCELL_INPUT_BIAS_I2C], inputs[GRUCELL_INPUT_BIAS_H2C]); + CHECK_PTR_FAIL_GOTO(p->local->bias_c, "Create tensor failed", final); p->local->bias_c->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_c, VSI_NN_TENSOR_ATTR_CONST); @@ -412,16 +457,19 @@ static vsi_bool op_setup_float_cudnn /* reshape and transpose input */ input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); - + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, p->local->weights_input, NULL, kernel_h, kernel_w, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); + /* transpose and reshape output */ reshaped_size[0] = inputs[GRUCELL_INPUT_INPUT]->attr.size[1]; reshaped_size[1] = p->local->weights_input->attr.size[1]; input_fc_output = vsi_nn_rnn_create_reshape(self, tmp->t, NULL, reshaped_size, 2, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_fc_output, "Create internal tensor failed", final); grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_INPUT_NC_FC_CN; } @@ -430,6 +478,7 @@ static vsi_bool op_setup_float_cudnn input_fc_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_INPUT], p->local->weights_input, NULL, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_INPUT], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_fc_output, "Create internal tensor failed", final); grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC; } @@ -444,25 +493,31 @@ static vsi_bool op_setup_float_cudnn /* reshape and transpose input */ input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, p->local->weights_recurrent, NULL, kernel_h, kernel_w, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN], use_virtual_tensor); + 
CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); + /* transpose and reshape output */ reshaped_size[0] = inputs[GRUCELL_INPUT_H_STATE]->attr.size[1]; reshaped_size[1] = p->local->weights_recurrent->attr.size[1]; recurrent_fc_output = vsi_nn_rnn_create_reshape(self, tmp->t, NULL, reshaped_size, 2, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(recurrent_fc_output, "Create internal tensor failed", final); } else { recurrent_fc_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], p->local->weights_recurrent, NULL, &p->internal_dtype[GRUCELL_CUDNN_QUANTIZE_PARAM_HIDDEN], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(recurrent_fc_output, "Create internal tensor failed", final); } #ifdef USE_GRUCELL_ACTIVATION curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[GRUCELL_ACTIVATION_INPUT_H_STATE] = inputs[GRUCELL_INPUT_H_STATE]; if(p->local->multi_batch) @@ -480,8 +535,12 @@ static vsi_bool op_setup_float_cudnn { splited_input_fc_output_tensors = vsi_nn_create_split(self, input_fc_output->t, 1, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_input_fc_output_tensors, curr, + "Create internal tensor failed", final); splited_recurrent_fc_output_tensors = vsi_nn_create_split(self, recurrent_fc_output->t, 1, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_recurrent_fc_output_tensors, curr, + "Create internal tensor failed", final); curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t; @@ -494,8 +553,12 @@ static vsi_bool op_setup_float_cudnn { splited_input_fc_output_tensors = vsi_nn_create_split(self, input_fc_output->t, 0, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_input_fc_output_tensors, curr, + "Create internal tensor failed", final); splited_recurrent_fc_output_tensors = vsi_nn_create_split(self, recurrent_fc_output->t, 0, 3, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(splited_recurrent_fc_output_tensors, curr, + "Create internal tensor failed", final); curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_R] = splited_input_fc_output_tensors[0]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_Z] = splited_input_fc_output_tensors[1]->t; curr->inputs[GRUCELL_ACTIVATION_INPUT_INPUT_FC_C] = splited_input_fc_output_tensors[2]->t; @@ -593,12 +656,14 @@ static vsi_bool op_setup_float_cudnn tensor_u->t, tmp_tensor->t, &tmp_tensor->t->attr.dtype, use_virtual_tensor); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = tmp_tensor->t; curr->inputs[1] = tensor_c->t; curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; vsi_nn_internal_setup_node(self, curr); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); @@ -606,6 +671,8 @@ static vsi_bool op_setup_float_cudnn #endif return TRUE; +final: + return FALSE; } /* @@ -629,34 +696,38 @@ static vsi_bool op_setup_float_cudnn_v2 vsi_nn_internal_tensor_t* tensor_r = NULL; 
vsi_nn_internal_tensor_t* concated_input = NULL; vsi_nn_tensor_attr_t attr; + vsi_bool ret = FALSE; /* input to r,z */ p->local->weights_update = vsi_nn_ConcatTensor(self->graph, 1/* axis */, inputs[GRUCELL_INPUT_WEIGHT_I2R], inputs[GRUCELL_INPUT_WEIGHT_I2Z]); + CHECK_PTR_FAIL_GOTO(p->local->weights_update, "Create tensor failed", final); /* recurrent to r,z */ p->local->weights_reset = vsi_nn_ConcatTensor(self->graph, 1/* axis */, inputs[GRUCELL_INPUT_WEIGHT_H2R], inputs[GRUCELL_INPUT_WEIGHT_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->weights_reset, "Create tensor failed", final); /* [input, recurrent] to r,z */ p->local->weights_input = vsi_nn_ConcatTensor(self->graph, 0/* axis */, p->local->weights_update, p->local->weights_reset); + CHECK_PTR_FAIL_GOTO(p->local->weights_input, "Create tensor failed", final); p->local->weights_input->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->weights_input, VSI_NN_TENSOR_ATTR_CONST); - vsi_safe_release_tensor(p->local->weights_update); - vsi_safe_release_tensor(p->local->weights_reset); p->local->bias_z = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2Z]->attr, inputs[GRUCELL_INPUT_BIAS_I2Z], inputs[GRUCELL_INPUT_BIAS_H2Z]); + CHECK_PTR_FAIL_GOTO(p->local->bias_z, "Create tensor failed", final); p->local->bias_r = vsi_nn_ConstTensorAdd(self->graph, inputs[GRUCELL_INPUT_BIAS_I2R]->attr, inputs[GRUCELL_INPUT_BIAS_I2R], inputs[GRUCELL_INPUT_BIAS_H2R]); + CHECK_PTR_FAIL_GOTO(p->local->bias_r, "Create tensor failed", final); p->local->bias_z_r = vsi_nn_ConcatTensor(self->graph, 0/* axis */, p->local->bias_r, p->local->bias_z); + CHECK_PTR_FAIL_GOTO(p->local->bias_z_r, "Create tensor failed", final); p->local->bias_z_r->attr.is_const = TRUE; vsi_nn_SetTensorAttr(p->local->bias_z_r, VSI_NN_TENSOR_ATTR_CONST); - vsi_safe_release_tensor(p->local->bias_z); - vsi_safe_release_tensor(p->local->bias_r); concated_input = vsi_nn_rnn_create_concat(self, 0/* axis */, use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]); + CHECK_PTR_FAIL_GOTO(concated_input, "Create internal tensor failed", final); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( concated_input->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -670,6 +741,16 @@ static vsi_bool op_setup_float_cudnn_v2 } tmp_tensor = vsi_nn_rnn_create_tp_fc(self, concated_input->t, p->local->weights_input, p->local->bias_z_r, &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); + + { + uint32_t _slices[] = { 0, 0 }; + _slices[0] = (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0]; + _slices[1] = (uint32_t)inputs[GRUCELL_INPUT_H_STATE]->attr.size[0]; + splited_input_fc_output_tensors = vsi_nn_create_split(self, concated_input->t, + 0, 2, _slices, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( splited_input_fc_output_tensors, "Create internal tensor fail.", final ); + } dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -681,14 +762,10 @@ static vsi_bool op_setup_float_cudnn_v2 { dtype.vx_type = VSI_NN_TYPE_FLOAT16; } - { - uint32_t _slices[] = { (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0], - (uint32_t)inputs[GRUCELL_INPUT_H_STATE]->attr.size[0] }; - splited_input_fc_output_tensors = vsi_nn_create_split(self, concated_input->t, - 0, 2, _slices, use_virtual_tensor); - } + input2cand_output = vsi_nn_rnn_create_tp_fc(self, splited_input_fc_output_tensors[0]->t, inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_BIAS_I2C], &dtype, 
use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input2cand_output, "Create internal tensor failed", final); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -702,14 +779,17 @@ static vsi_bool op_setup_float_cudnn_v2 } recurrent2cand_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], inputs[GRUCELL_INPUT_WEIGHT_H2C], inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(recurrent2cand_output, "Create internal tensor failed", final); tmp_tensor = vsi_nn_rnn_create_activation(self, tmp_tensor->t, p->local->gate_activation, &tmp_tensor->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); /* split for combined FC outputs, r_t, z_t */ splited_input_fc_output_tensors = vsi_nn_create_split(self, tmp_tensor->t, 0/* axis */, 2/* dim num */, NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(splited_input_fc_output_tensors, "Create internal tensor failed", final); memset( &attr, 0x00, sizeof(attr) ); attr.dim_num = VSI_NN_DIM_AUTO; @@ -726,8 +806,10 @@ static vsi_bool op_setup_float_cudnn_v2 dtype.vx_type = VSI_NN_TYPE_FLOAT16; } tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_A_TIMES_B_PLUS_C, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = splited_input_fc_output_tensors[0]->t; curr->inputs[1] = recurrent2cand_output->t; curr->inputs[2] = input2cand_output->t; @@ -736,10 +818,12 @@ static vsi_bool op_setup_float_cudnn_v2 tensor_r = vsi_nn_rnn_create_activation(self, tmp_tensor->t, p->local->candidate_activation, &tmp_tensor->t->attr.dtype, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(tensor_r, curr, "Create internal tensor failed", final); #define USE_GRUCELL_ACTIVATION_SMA #ifdef USE_GRUCELL_ACTIVATION_SMA curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_INTERNAL_SMA, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_STATE] = inputs[GRUCELL_INPUT_H_STATE]; curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_T_] = tensor_r->t; curr->inputs[GRUCELL_ACTIVATION_SMA_INPUT_Z_T] = splited_input_fc_output_tensors[1]->t; @@ -758,18 +842,25 @@ static vsi_bool op_setup_float_cudnn_v2 tmp_tensor->t, &tmp_tensor->t->attr.dtype, use_virtual_tensor); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = tmp_tensor->t; curr->inputs[1] = tensor_r->t; curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; vsi_nn_internal_setup_node(self, curr); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); #endif - - return TRUE; + ret = TRUE; +final: + vsi_safe_release_tensor(p->local->bias_z); + vsi_safe_release_tensor(p->local->bias_r); + vsi_safe_release_tensor(p->local->weights_update); + vsi_safe_release_tensor(p->local->weights_reset); + return ret; } static vsi_bool op_setup_default @@ -804,6 +895,8 @@ static vsi_bool op_setup_default uint32_t kernel_h = 1; uint32_t kernel_w = 1; int32_t i = 0; + vsi_nn_tensor_t* wei_r2c_tensor = NULL; + vsi_nn_tensor_t* 
bias_r2c_tensor = NULL; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); memset( &attr, 0x00, sizeof( attr ) ); @@ -853,6 +946,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_BIAS_I2R + i], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs[i], "Create internal tensor failed", final); } } else @@ -862,6 +956,7 @@ static vsi_bool op_setup_default (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); for( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++) { @@ -872,9 +967,11 @@ static vsi_bool op_setup_default kernel_h, kernel_w, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); /* transpose and reshape output */ input_gate_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs[i], "Create internal tensor failed", final); } } @@ -889,10 +986,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_BIAS_H2R + i], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R + i], use_virtual_tensor); - if (hstate_gate_fc_outputs[i] == NULL) - { - goto error; - } + CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs[i], "Create internal tensor failed", final); } } else @@ -902,6 +996,7 @@ static vsi_bool op_setup_default (uint32_t)inputs[GRUCELL_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_input_tensor, "Create internal tensor failed", final); for( i = 0; i < GRUCELL_RZ_GATE_COUNT; i++) { @@ -912,9 +1007,11 @@ static vsi_bool op_setup_default kernel_h, kernel_w, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); /* transpose and reshape output */ hstate_gate_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs[i], "Create internal tensor failed", final); } } @@ -926,6 +1023,7 @@ static vsi_bool op_setup_default hstate_gate_fc_outputs[i]->t, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(gate_fc_outputs[i], "Create internal tensor failed", final); } /* Gate activations */ @@ -936,6 +1034,7 @@ static vsi_bool op_setup_default p->local->gate_activation, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2R + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(gate_act_outputs[i], "Create internal tensor failed", final); } /* Candidate FC */ @@ -948,6 +1047,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_H_STATE], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(rh_mul_outputs, "Create internal tensor failed", final); } else { @@ -957,6 +1057,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_H_STATE]->attr.size, inputs[GRUCELL_INPUT_H_STATE]->attr.dim_num, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(rh_mul_outputs, "Create internal tensor failed", final); } if( inputs[GRUCELL_INPUT_INPUT]->attr.dtype.qnt_type @@ 
-999,6 +1100,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_BIAS_I2C], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_cand_fc_output, "Create internal tensor failed", final); } else { @@ -1008,6 +1110,8 @@ static vsi_bool op_setup_default (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); + tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, inputs[GRUCELL_INPUT_WEIGHT_I2C], @@ -1015,9 +1119,11 @@ static vsi_bool op_setup_default kernel_h, kernel_w, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); /* transpose and reshape output */ input_cand_fc_output = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_cand_fc_output, "Create internal tensor failed", final); } if ( is_hstate_cand_fc_op_tp ) { @@ -1025,9 +1131,6 @@ static vsi_bool op_setup_default if ((rh_mul_outputs->t->attr.dtype.vx_type) != (inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr.dtype.vx_type) && (p->local->multi_batch)) { - vsi_nn_tensor_t* wei_r2c_tensor = NULL; - vsi_nn_tensor_t* bias_r2c_tensor = NULL; - memcpy(&attr, &(inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr), sizeof(attr)); attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; if ( rh_mul_outputs->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || @@ -1041,14 +1144,18 @@ static vsi_bool op_setup_default } wei_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_WEIGHT_H2C], &(attr.dtype)); + CHECK_PTR_FAIL_GOTO(wei_r2c_tensor, "Create tensor failed", final); attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; bias_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_BIAS_H2C], &(attr.dtype)); + CHECK_PTR_FAIL_GOTO(bias_r2c_tensor, "Create tensor failed", final); + rh_cand_fc_output = vsi_nn_rnn_create_tp_fc(self, rh_mul_outputs->t, wei_r2c_tensor, bias_r2c_tensor, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(rh_cand_fc_output, "Create internal tensor failed", final); } else { @@ -1058,6 +1165,7 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_BIAS_H2C], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(rh_cand_fc_output, "Create internal tensor failed", final); } } else @@ -1068,6 +1176,8 @@ static vsi_bool op_setup_default (uint32_t)rh_mul_outputs->t->attr.size[0], &kernel_h, &kernel_w); hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, rh_mul_outputs->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_input_tensor, "Create internal tensor failed", final); + tmp = vsi_nn_rnn_create_nn_fc(self, hstate_input_tensor->t, inputs[GRUCELL_INPUT_WEIGHT_H2C], @@ -1075,9 +1185,11 @@ static vsi_bool op_setup_default kernel_h, kernel_w, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); /* transpose and reshape output */ rh_cand_fc_output = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(rh_cand_fc_output, "Create internal tensor failed", final); } if ( 
p->linear_before_reset == 0 ) @@ -1091,6 +1203,7 @@ static vsi_bool op_setup_default rh_cand_fc_output->t, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(r_mul_hcand_fc_output, "Create internal tensor failed", final); } /* Candidate input FC add r*h FC */ cand_fc_output = vsi_nn_rnn_create_tensor_add(self, @@ -1098,6 +1211,7 @@ static vsi_bool op_setup_default r_mul_hcand_fc_output->t, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(cand_fc_output, "Create internal tensor failed", final); /* Candidate activation */ cand_act_output = vsi_nn_rnn_create_activation(self, @@ -1105,6 +1219,7 @@ static vsi_bool op_setup_default p->local->candidate_activation, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(cand_act_output, "Create internal tensor failed", final); /* GRU cell output */ memcpy( &attr.dtype, &gate_act_outputs[GRUCELL_GATE_Z]->t->attr.dtype, sizeof( attr.dtype ) ); @@ -1113,6 +1228,7 @@ static vsi_bool op_setup_default attr.vtl = use_virtual_tensor; attr.is_const = TRUE; input_tensor = vsi_nn_internal_new_tensor(self, &attr, 1.0f); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); memset( &attr, 0x00, sizeof(attr) ); //memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); @@ -1131,9 +1247,11 @@ static vsi_bool op_setup_default } tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); /* create internal tensor sub node (1-zt)*c */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SUBTRACT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = input_tensor->t; curr->inputs[1] = gate_act_outputs[GRUCELL_GATE_Z]->t; curr->outputs[0] = tmp_tensor->t; @@ -1146,6 +1264,7 @@ static vsi_bool op_setup_default cand_act_output->t, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_I2C], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); /* create internal multiply node zt*hstate */ tmp_tensor = create_multiply(self, @@ -1153,9 +1272,11 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_H_STATE], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2Z], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); /* create internal tensor add node (1-zt)*c + zt*hstate */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = output_tensor->t; curr->inputs[1] = tmp_tensor->t; curr->outputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; @@ -1164,13 +1285,16 @@ static vsi_bool op_setup_default /* copy output to h_state */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[GRUCELL_OUTPUT_OUTPUT]; curr->outputs[0] = outputs[GRUCELL_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); return TRUE; -error: +final: + vsi_safe_release_tensor(wei_r2c_tensor); + vsi_safe_release_tensor(bias_r2c_tensor); return FALSE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c index dbec83887..4a07faab6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_hashtable_lookup.c @@ -94,6 +94,8 @@ static vsi_bool op_setup 
vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); + if( outputs[0]->attr.dim_num == VSI_NN_DIM_AUTO ) { outputs[0]->attr.dim_num = inputs[2]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c index 46ee1d284..cc4b44362 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c @@ -68,6 +68,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -79,6 +82,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = 2; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c index d9b3b320f..5386af725 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c @@ -60,6 +60,10 @@ static vsi_status op_compute { vsi_status status = VSI_SUCCESS; + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + return status; } /* op_compute() */ @@ -124,6 +128,12 @@ vsi_status vsi_nn_op_imageprocess_single_node vsi_nn_tensor_t *tensor_out ) { + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(attr); + VSI_UNREFERENCED(p); + VSI_UNREFERENCED(data); + VSI_UNREFERENCED(tensor_out); + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c index 9a2043e9e..2066865a5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c @@ -34,6 +34,7 @@ #include "vsi_nn_tensor.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" /* Declare number of input and output. 
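Many of the smaller hunks in this patch do nothing but silence unused-parameter warnings. The VSI_UNREFERENCED macro itself is not part of this diff; a minimal sketch of the usual shape (an assumption, not the project's actual definition) and of how the touched op_check/op_setup stubs use it:

/* Hypothetical stand-in for the project's VSI_UNREFERENCED: evaluate the
   argument as a void expression so unused-parameter warnings go away without
   changing behaviour. */
#ifndef VSI_UNREFERENCED
#define VSI_UNREFERENCED( param ) ( (void)(param) )
#endif

/* Illustrative stub mirroring the op_check functions touched above. */
static vsi_bool op_check_example
    (
    vsi_nn_node_t   * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    VSI_UNREFERENCED(self);
    VSI_UNREFERENCED(inputs);
    VSI_UNREFERENCED(outputs);
    return TRUE;
} /* op_check_example() */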
@@ -50,6 +51,9 @@ static vsi_status op_compute { vsi_status status = VSI_SUCCESS; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + status = vsi_nn_internal_compute_node( self ); return status; @@ -64,6 +68,9 @@ static vsi_bool op_check { vsi_nn_interp_param *p = NULL; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + p = &self->nn_param.interp; if ((p->pad_beg > 0) || (p->pad_end > 0)) @@ -166,8 +173,10 @@ static vsi_bool op_setup memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); crop_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(crop_tensor, "Create internal tensor failed", final); crop_in_tensor = crop_tensor->t; curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 1, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; curr->node->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num; curr->node->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num; @@ -177,10 +186,13 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.new_axis_mask = 0; begin_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(begin_dims, curr, "Create internal buffer failed", final); end_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(end_dims, curr, "Create internal buffer failed", final); stride_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(stride_dims, curr, "Create internal buffer failed", final); for (i = 0; i < inputs[0]->attr.dim_num; i++) { stride_dims[i] = 1; @@ -215,6 +227,7 @@ static vsi_bool op_setup && (height_in_eff_ == (vsi_ssize_t)outputs[0]->attr.size[1])) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 1, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = crop_in_tensor; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node(self, curr); @@ -222,6 +235,7 @@ static vsi_bool op_setup else { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_INTERNAL, 1, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.resize_internal.align_corners = vx_true_e; curr->node->nn_param.resize_internal.factor = factor; curr->node->nn_param.resize_internal.half_pixel_centers = vx_false_e; @@ -231,6 +245,8 @@ static vsi_bool op_setup } return TRUE; +final: + return FALSE; } /* op_setup() */ static vsi_status op_optimize @@ -243,6 +259,9 @@ static vsi_status op_optimize { vsi_status status; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + status = VSI_SUCCESS; vsi_nn_internal_optimize_node( self, direction ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c index cff15071e..242099b11 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c @@ -42,7 +42,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; int32_t axis = self->nn_param.l2_normalize.axis; vsi_nn_kernel_param_t * param = NULL; diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index e872a3dc5..d52eb7d19 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -54,7 +54,7 @@ static vsi_nn_tensor_t* _expand_scale_tensor vsi_size_t scale_size_out ) { - vsi_status status = VX_SUCCESS; + vsi_status status = VSI_FAILURE; float* f32_in_buffer = NULL; float* f32_out_buffer = NULL; vsi_size_t i = 0; @@ -144,13 +144,7 @@ static vsi_bool _check_value_is_equal_to_one } } - if ( !tensor->attr.is_created_from_handle ) - { - if ( tensor_data ) - { - free(tensor_data); - } - } + vsi_nn_safe_free(tensor_data); return ret; } @@ -324,7 +318,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_node_t* curr = NULL; if( NULL == self ) @@ -349,10 +343,11 @@ static vsi_bool op_setup { self->nn_param.l2normalizescale.local.use_internal_node = TRUE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) || @@ -370,8 +365,10 @@ static vsi_bool op_setup attr.vtl = TRUE; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis; curr->inputs[0] = inputs[0]; curr->outputs[0] = output_tensor->t; @@ -389,22 +386,26 @@ static vsi_bool op_setup attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; } reshape_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create internal tensor failed", final); + vsi_nn_ConvertTensor(self->graph, inputs[1], reshape_tensor->t); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = output_tensor->t; curr->inputs[1] = reshape_tensor->t; curr->node->nn_param.multiply.scale = 1.0f; curr->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; curr->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } else { ret = vsi_nn_op_common_setup(self, inputs, outputs); } +final: return ret; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index 75354a7c5..a90ae594b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -37,6 +37,7 @@ #include "vsi_nn_tensor_util_prv.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) @@ -116,11 +117,15 @@ static vsi_bool op_setup attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; mean_tensor = 
vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(mean_tensor, "Create internal tensor failed", final); vari_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(vari_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MOMENTS, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); axis_array = (int32_t*)\ vsi_nn_internal_new_node_param(curr, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(axis_array, curr, "Create internal buffer failed", final); axis_array[0] = axis; curr->node->nn_param.moments.axis = axis_array; @@ -131,6 +136,7 @@ static vsi_bool op_setup vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_BATCHNORM_SINGLE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->inputs[1] = mean_tensor->t; curr->inputs[2] = vari_tensor->t; @@ -138,13 +144,14 @@ static vsi_bool op_setup curr->inputs[4] = inputs[1]; curr->node->nn_param.batchnorm_single.eps = self->nn_param.layernorm.eps; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } else { ret = vsi_nn_op_common_setup(self, inputs, outputs); } +final: return ret; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c index fd12173cf..34c329c4c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c @@ -112,6 +112,8 @@ static vsi_bool _log_softmax_op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(kernel_name); + /* TODO: Add code to comput outputs' shape. 
*/ if( NULL == self ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c index e44440ead..6bddcff6e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c @@ -100,6 +100,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + out_rank = inputs[0]->attr.dim_num; for(i = 0; i < out_rank; i++) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c index 01695c42b..7cb068ed0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c @@ -106,6 +106,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + in1_rank = inputs[0]->attr.dim_num; in2_rank = inputs[1]->attr.dim_num; out_rank = vsi_nn_max( in1_rank, in2_rank ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c index 7a3eb91c0..9547d8be8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c @@ -164,6 +164,8 @@ static vsi_bool op_setup { vsi_size_t i = 0; + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c index 63a85f7ab..8d55f065d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c @@ -34,6 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -42,15 +43,17 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_SUCCESS; + vsi_status status = VSI_FAILURE; vsi_nn_tensor_t * type_tensor = NULL; vx_nn_lshproj_params_t p; vx_bool valued = TRUE; vsi_nn_tensor_t * weight_tensor = NULL; + float* const_data = NULL; type_tensor = vsi_nn_VariableToTensor(self, (uint8_t *)&self->nn_param.lsh_projection.type, VSI_NN_TYPE_INT32); + CHECK_PTR_FAIL_GOTO( type_tensor, "Create tensor fail.", final ); memset(&p, 0, sizeof(p)); p.hash_func = REQUIRED_IO(inputs[0]); @@ -65,7 +68,9 @@ static vsi_status op_compute float const_one = 1.0; vsi_size_t i; vsi_size_t count = inputs[1]->attr.size[1]; - float* const_data = (float*)malloc(count * sizeof(float)); + + const_data = (float*)malloc(count * sizeof(float)); + CHECK_PTR_FAIL_GOTO( const_data, "Create buffer fail.", final ); for (i = 0; i < count; i++) { @@ -78,9 +83,8 @@ static vsi_status op_compute attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; weight_tensor = vsi_nn_CreateTensorFromData(self->graph, (uint8_t *)const_data, &attr); + CHECK_PTR_FAIL_GOTO( weight_tensor, "Create tensor fail.", final ); p.weights = weight_tensor->t; - free(const_data); - //valued = FALSE; } vxSetTensorAttribute(p.weights, VX_TENSOR_VALUE, &valued, sizeof(vx_bool)); @@ -90,8 +94,12 @@ static vsi_status op_compute { status = VSI_FAILURE; } - vsi_nn_ReleaseTensor( &type_tensor ); - if (weight_tensor != NULL) vsi_nn_ReleaseTensor(&weight_tensor); + +final: + vsi_nn_safe_free(const_data); + vsi_safe_release_tensor( type_tensor ); + vsi_safe_release_tensor( weight_tensor ); + return status; } /* op_compute() */ @@ -102,6 
+110,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c index 900e50b7d..d3cc0c824 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm.c @@ -202,6 +202,8 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(outputs); //TODO: Check tensor shapes. if( inputs[0]->attr.dim_num < 3) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c index 283f930b5..ebd17a3f2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c @@ -35,9 +35,9 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" +#include "vsi_nn_error.h" static vsi_bool setup_op_shapes ( @@ -82,6 +82,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[LSTM_INPUT_H_STATE] = output_tensor->t; } @@ -96,6 +97,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[LSTM_INPUT_C_STATE] = output_tensor->t; } @@ -107,6 +109,7 @@ static vsi_bool setup_op_shapes attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[LSTM_OUTPUT_H_STATE] = output_tensor->t; } @@ -119,6 +122,7 @@ static vsi_bool setup_op_shapes attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[LSTM_OUTPUT_C_STATE] = output_tensor->t; } @@ -156,6 +160,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -165,6 +171,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -175,6 +183,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -187,6 +198,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -211,6 +224,8 @@ static vsi_bool op_setup uint32_t batch_size = 0; uint32_t time_step = 0; uint32_t i = 0; + vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -235,21 +250,26 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[LSTM_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); input_tensor = output_tensor->t; } /* split input tensor */ split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * \ sizeof(vsi_nn_tensor_t *)); + CHECK_PTR_FAIL_GOTO( split_output_tensors, "Create buffer fail.", final ); memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t *)); lstmunit_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * \ sizeof(vsi_nn_tensor_t *)); + CHECK_PTR_FAIL_GOTO( lstmunit_reshape_output_tensors, "Create buffer fail.", final ); memset( lstmunit_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t *)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); last_step_h_state = inputs[LSTM_INPUT_H_STATE]; last_step_c_state = inputs[LSTM_INPUT_C_STATE]; @@ -263,6 +283,7 @@ static vsi_bool op_setup /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); reshape_output = output_tensor->t; /* lstmunit output */ @@ -275,6 +296,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[LSTM_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); lstmunit_out0 = output_tensor->t; } @@ -284,12 +306,14 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[LSTM_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); lstmunit_out1 = output_tensor->t; /* lstmunit output c_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[LSTM_OUTPUT_C_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); lstmunit_out2 = output_tensor->t; } else @@ -299,6 +323,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.lstmunit_ovxlib.activation = curr_param->activation; curr->node->nn_param.lstmunit_ovxlib.cell_clip = curr_param->cell_clip; 
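/*
 * A minimal, self-contained sketch of the goto-style cleanup pattern that the
 * CHECK_PTR_FAIL_GOTO / CHECK_STATUS_FAIL_GOTO additions in this op_setup hunk
 * rely on: allocate, validate, and release every buffer on a single `final:`
 * exit path. The macro body and names below are illustrative assumptions; the
 * real definitions live in vsi_nn_error.h.
 */
#include <stdio.h>
#include <stdlib.h>

#define DEMO_CHECK_PTR_FAIL_GOTO(ptr, msg, label) \
    do { if ((ptr) == NULL) { fprintf(stderr, "%s\n", (msg)); goto label; } } while (0)

static int demo_setup(size_t time_step)
{
    int ok = 0;                         /* mirrors `vsi_bool ret = FALSE;` */
    void **split_outputs = NULL;
    void **reshape_outputs = NULL;

    split_outputs = calloc(time_step, sizeof(*split_outputs));
    DEMO_CHECK_PTR_FAIL_GOTO(split_outputs, "Create buffer fail.", final);

    reshape_outputs = calloc(time_step, sizeof(*reshape_outputs));
    DEMO_CHECK_PTR_FAIL_GOTO(reshape_outputs, "Create buffer fail.", final);

    /* ... per-timestep node construction would go here ... */
    ok = 1;                             /* only reached on the success path */

final:
    /* both buffers are released on every exit path, as vsi_nn_safe_free does */
    free(split_outputs);
    free(reshape_outputs);
    return ok;
}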
curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias; @@ -350,6 +375,7 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, lstmunit_out0, batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); lstmunit_reshape_output_tensors[i] = output_tensor->t; } } @@ -362,19 +388,21 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[LSTM_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); tensor = output_tensor->t; } /* concat */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { curr->inputs[i] = lstmunit_reshape_output_tensors[i]; } curr->outputs[0] = tensor; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); if( !curr_param->time_major ) { @@ -383,11 +411,17 @@ static vsi_bool op_setup tensor, outputs[LSTM_OUTPUT_OUTPUT], use_virtual_tensor); } } + else + { + /* return_sequences = False, return true to setup lstm node. */ + ret = TRUE; + } +final: vsi_nn_safe_free( split_output_tensors ); vsi_nn_safe_free( lstmunit_reshape_output_tensors ); - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c index 7730fee89..13fe0fed8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c @@ -222,6 +222,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); //TODO: Check tensor shapes. return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c index 27b545719..22dfd664d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -49,7 +49,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; int32_t _is_ln= 0; int32_t _is_cifg= 0; @@ -107,6 +107,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index d792d34b2..f715c99ad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "ops/vsi_nn_op_lstmunit_ovxlib.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -64,8 +64,10 @@ static vsi_nn_internal_tensor_t* create_tp_fc vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.fcl.axis = 0; tmp_inode->node->nn_param.fcl.weights = (uint32_t)weight->attr.size[1]; @@ -75,6 +77,7 @@ static vsi_nn_internal_tensor_t* create_tp_fc tmp_inode->outputs[0] = tensor2->t; vsi_nn_internal_setup_node(self, tmp_inode); +final: return tensor2; } @@ -105,6 +108,7 @@ static vsi_nn_internal_tensor_t* create_nn_fc vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); reshaped_weight_shape[3] = weight->attr.size[1]; reshaped_weight_shape[2] = weight->attr.size[0] / ( kernel_h * kernel_w ); @@ -118,10 +122,12 @@ static vsi_nn_internal_tensor_t* create_nn_fc memcpy( &attr.dtype, &weight->attr.dtype, sizeof(attr.dtype) ); memcpy( &attr.size, &reshaped_weight_shape, sizeof(attr.size)); reshaped_weight_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(reshaped_weight_tensor, "Create internal tensor failed", final); vsi_nn_ReshapeTensor( self->graph, weight, reshaped_weight_tensor->t, reshaped_weight_shape, 4 ); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.conv2d.ksize[0] = kernel_w; tmp_inode->node->nn_param.conv2d.ksize[1] = kernel_h; tmp_inode->node->nn_param.conv2d.stride[0] = 1; @@ -141,10 +147,11 @@ static vsi_nn_internal_tensor_t* create_nn_fc tmp_inode->outputs[0] = tensor2->t; vsi_nn_internal_setup_node(self, tmp_inode); +final: return tensor2; } -static void create_peephole +static vsi_status create_peephole ( vsi_nn_node_t * self, vsi_nn_tensor_t * input, @@ -153,6 +160,7 @@ static void create_peephole vsi_bool use_virtual_tensor ) { + vsi_status status = VSI_FAILURE; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* input_tensor0 = NULL; vsi_nn_internal_tensor_t* input_tensor1 = NULL; @@ -164,8 +172,10 @@ static void create_peephole attr.is_const = FALSE; memcpy(&(attr.dtype), &((*input_fc)->t->attr.dtype), sizeof(vsi_nn_dtype_t)); input_tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(input_tensor0, "Create internal tensor failed", final); /* create internal nodes */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_MULTIPLY, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.multiply.scale = 1.0f; curr->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; 
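/*
 * The MULTIPLY + ADD node pair that create_peephole builds implements the
 * usual LSTM peephole term: each gate's pre-activation gets the cell state,
 * scaled elementwise by a per-unit peephole weight (WEIGHT_C2I/C2F/C2O),
 * added to it. A scalar reference version, written only to make that dataflow
 * explicit; the function name and float types are illustrative assumptions,
 * not driver API.
 */
#include <stddef.h>

static void demo_apply_peephole(const float *c_state,
                                const float *peephole_weight,
                                float *gate_preactivation,  /* updated in place */
                                size_t num_units)
{
    size_t i;
    for (i = 0; i < num_units; i++)
    {
        /* MULTIPLY node: tmp     = c_state * weight_c2x  (scale = 1.0)  */
        /* ADD node:      gate_fc = gate_fc + tmp                        */
        gate_preactivation[i] += c_state[i] * peephole_weight[i];
    }
}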
curr->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; @@ -174,13 +184,19 @@ static void create_peephole curr->outputs[0] = input_tensor0->t; vsi_nn_internal_setup_node(self, curr); input_tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO( input_tensor1, "Create internal tensor fail.", final ); /* create internal nodes */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = (*input_fc)->t; curr->inputs[1] = input_tensor0->t; curr->outputs[0] = input_tensor1->t; vsi_nn_internal_setup_node(self, curr); *input_fc = input_tensor1; + + status = VSI_SUCCESS; +final: + return status; } static vsi_bool setup_op_shapes @@ -236,6 +252,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -246,6 +264,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -258,6 +279,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -272,7 +295,6 @@ static vsi_bool op_setup vsi_nn_tensor_attr_t attr; vsi_bool is_input_fc_on_tp = FALSE; vsi_bool is_recurrent_fc_on_tp = FALSE; - vsi_nn_internal_tensor_t* add_tensor = NULL; vsi_nn_internal_tensor_t* input_tensor = NULL; vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_nn_internal_tensor_t* recurrent_input_tensor = NULL; @@ -364,6 +386,7 @@ static vsi_bool op_setup bias_tensors[i], &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( input_fc_outputs[i], "Create tensor fail.", final ); } if (inputs[LSTMUNIT_INPUT_AUX_INPUT] != NULL) { @@ -375,6 +398,7 @@ static vsi_bool op_setup NULL, &p->internal_dtype_aux[LSTMUNIT_QUANTIZE_PARAM_AUX_I2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( aux_input_fc_outputs[i], "Create tensor fail.", final ); } } } @@ -385,6 +409,7 @@ static vsi_bool op_setup (uint32_t)inputs[LSTMUNIT_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final ); for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) { @@ -395,9 +420,11 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( tmp, "Create tensor fail.", final ); /* transpose and reshape output */ input_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( input_fc_outputs[i], "Create tensor fail.", final ); } if (inputs[LSTMUNIT_INPUT_AUX_INPUT] != NULL) { @@ -406,6 +433,7 @@ static vsi_bool op_setup (uint32_t)inputs[LSTMUNIT_INPUT_AUX_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_AUX_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final ); for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) { @@ 
-416,9 +444,11 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype_aux[LSTMUNIT_QUANTIZE_PARAM_AUX_I2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( tmp, "Create tensor fail.", final ); /* transpose and reshape output */ aux_input_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( aux_input_fc_outputs[i], "Create tensor fail.", final ); } } } @@ -432,6 +462,7 @@ static vsi_bool op_setup aux_input_fc_outputs[i]->t, &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_I2I], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( input_add_aux_input_fc_outputs[i], "Create tensor fail.", final ); input_fc_outputs[i] = input_add_aux_input_fc_outputs[i]; } } @@ -447,6 +478,7 @@ static vsi_bool op_setup NULL, &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( recurrent_fc_outputs[i], "Create tensor fail.", final ); } } else @@ -456,6 +488,7 @@ static vsi_bool op_setup (uint32_t)inputs[LSTMUNIT_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); recurrent_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( recurrent_input_tensor, "Create tensor fail.", final ); for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++) { @@ -466,31 +499,37 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( tmp, "Create tensor fail.", final ); /* transpose and reshape output */ recurrent_fc_outputs[i] = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( recurrent_fc_outputs[i], "Create tensor fail.", final ); } } if (p->local->use_peephole) { + vsi_status status = VSI_FAILURE; /* update input gate */ if (!p->local->use_cifg) { - create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], + status = create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], inputs[LSTMUNIT_INPUT_WEIGHT_C2I], &(input_fc_outputs[0]), use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO( status, final ); } /* update forget gate */ - create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], + status = create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], inputs[LSTMUNIT_INPUT_WEIGHT_C2F], &(input_fc_outputs[1]), use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO( status, final ); /* update output gate */ - create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], + status = create_peephole(self, inputs[LSTMUNIT_INPUT_C_STATE], inputs[LSTMUNIT_INPUT_WEIGHT_C2O], &(input_fc_outputs[3]), use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO( status, final ); } /* layernorm */ @@ -498,59 +537,31 @@ static vsi_bool op_setup { for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) { - if (self->graph->ctx->config.support_stream_processor) - { - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - attr.dim_num = VSI_NN_DIM_AUTO; - attr.vtl = use_virtual_tensor; - attr.is_const = FALSE; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - add_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - /* create internal nodes */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); - curr->inputs[0] = input_fc_outputs[i]->t; - curr->inputs[1] = recurrent_fc_outputs[i]->t; - curr->outputs[0] = add_tensor->t; - vsi_nn_internal_setup_node(self, curr); - - /* create internal 
nodes */ - input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LAYER_NORM, 0, 0 ); - curr->node->nn_param.layernorm.eps = (float)1e-8; - curr->inputs[0] = add_tensor->t; - curr->inputs[1] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; - curr->inputs[2] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; - curr->outputs[0] = input_tensor->t; - vsi_nn_internal_setup_node(self, curr); - - layernorm_outputs[i] = input_tensor; - } - else - { - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - attr.dim_num = VSI_NN_DIM_AUTO; - attr.vtl = use_virtual_tensor; - attr.is_const = FALSE; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - - /* create internal nodes */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM, 0, 0 ); - curr->node->nn_param.tensor_add_mean_stddev_norm.eps = (float)1e-8; - curr->inputs[0] = input_fc_outputs[i]->t; - curr->inputs[1] = recurrent_fc_outputs[i]->t; - curr->outputs[0] = input_tensor->t; - vsi_nn_internal_setup_node(self, curr); - - layernorm_outputs[i] = input_tensor; - } + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final ); + + /* create internal nodes */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->node->nn_param.tensor_add_mean_stddev_norm.eps = (float)1e-8; + curr->inputs[0] = input_fc_outputs[i]->t; + curr->inputs[1] = recurrent_fc_outputs[i]->t; + curr->outputs[0] = input_tensor->t; + vsi_nn_internal_setup_node(self, curr); + + layernorm_outputs[i] = input_tensor; } } /* activations */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_ACTIVATION, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.lstmunit_activation.cell_clip = p->cell_clip; curr->node->nn_param.lstmunit_activation.proj_clip = p->proj_clip; curr->node->nn_param.lstmunit_activation.forget_bias = p->forget_bias; @@ -562,10 +573,9 @@ static vsi_bool op_setup curr->node->nn_param.lstmunit_activation.recurrent_activation = p->recurrent_activation; curr->inputs[LSTMUNIT_ACT_CSTATE_IN] = inputs[LSTMUNIT_INPUT_C_STATE]; - for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) + for ( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) { - if( (p->local->use_layer_norm && !self->graph->ctx->config.support_stream_processor) || - p->local->use_hybrid ) + if( p->local->use_layer_norm || p->local->use_hybrid ) { curr->inputs[LSTMUNIT_ACT_DATA_BI + i] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; } @@ -573,14 +583,7 @@ static vsi_bool op_setup if( p->local->use_layer_norm ) { /* Pass layernorm weights to VSI_NN_OP_LSTMUNIT_ACTIVATION */ - if (self->graph->ctx->config.support_stream_processor) - { - curr->inputs[LSTMUNIT_ACT_LN_WI + i] = NULL; - } - else - { - curr->inputs[LSTMUNIT_ACT_LN_WI + i] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; - } + curr->inputs[LSTMUNIT_ACT_LN_WI + i] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; curr->inputs[LSTMUNIT_ACT_INPUT_FC_I + i] = layernorm_outputs[i]->t; curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + 
i] = NULL; } @@ -616,6 +619,7 @@ static vsi_bool op_setup attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; } output_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( output_tensor, curr, "Create tensor fail.", final ); curr->outputs[LSTMUNIT_ACT_OUTPUT] = output_tensor->t; curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = outputs[LSTMUNIT_OUTPUT_C_STATE]; @@ -637,11 +641,14 @@ static vsi_bool op_setup use_virtual_tensor = inputs[LSTMUNIT_INPUT_BIAS_PROJ]->attr.vtl; input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr, &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, VSI_NN_OP_FCL, FALSE); + CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final ); + zero_bias_tensor = input_tensor->t; if (use_virtual_tensor) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[LSTMUNIT_INPUT_BIAS_PROJ]; curr->outputs[0] = zero_bias_tensor; @@ -656,6 +663,8 @@ static vsi_bool op_setup { input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr, &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, VSI_NN_OP_FCL, FALSE); + CHECK_PTR_FAIL_GOTO( input_tensor, "Create tensor fail.", final ); + zero_bias_tensor = input_tensor->t; } else @@ -664,6 +673,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_FCL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.fcl.axis = 0; curr->node->nn_param.fcl.weights = (uint32_t)inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr.size[1]; @@ -678,12 +688,15 @@ static vsi_bool op_setup /* copy h_state to output */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_OUTPUT]; vsi_nn_internal_setup_node(self, curr); } return TRUE; +final: + return FALSE; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index 846339029..f4005a841 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -35,6 +35,8 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + #define _ARG_NUM (7) #define _INPUT_NUM (2) @@ -49,22 +51,24 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_param_t * param = NULL; - vsi_nn_kernel_node_t n = NULL; - vsi_nn_tensor_t * tmp_inputs[2] = {NULL}; - vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; - vsi_nn_tensor_t * rs_input = NULL; - vsi_nn_tensor_t * rs_output = NULL; - vsi_size_t shape_in[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; - vsi_size_t shape_out[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; - uint32_t i = 0; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t *param = NULL; + vsi_nn_kernel_node_t n = NULL; + vsi_nn_tensor_t * tmp_inputs[2] = {NULL}; + vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; + uint32_t new_rank[3] = {0}; + vsi_bool ret = FALSE; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t transposeA = self->nn_param.matrixmul.transpose[0]; int32_t transposeB = self->nn_param.matrixmul.transpose[1]; int32_t adjointA = self->nn_param.matrixmul.adjoint[0]; int32_t adjointB = 
self->nn_param.matrixmul.adjoint[1]; + uint32_t cross_flg = 0; + uint32_t size_axis_inner_outer[3] = {0}; + uint32_t stride_axis_inner_outer[9] = {0}; + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32( param, "transposeA", transposeA ); @@ -72,46 +76,35 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "adjointA", adjointA ); vsi_nn_kernel_param_add_int32( param, "adjointB", adjointB ); - if (inputs[0]->attr.dim_num == 1 && inputs[1]->attr.dim_num > 1) - { - shape_in[0] = inputs[0]->attr.size[0]; - shape_in[1] = 1; - shape_out[0] = outputs[0]->attr.size[0]; - shape_out[1] = 1; - for(i = 2; i <= outputs[0]->attr.dim_num; i++) - { - shape_out[i] = outputs[0]->attr.size[i - 1]; - } - rs_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape_in, 2); - rs_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape_out, outputs[0]->attr.dim_num + 1); - tmp_inputs[0] = rs_input; - tmp_inputs[1] = inputs[1]; - tmp_outputs[0] = rs_output; - } - else if (inputs[1]->attr.dim_num == 1 && inputs[0]->attr.dim_num > 1) - { - shape_in[0] = 1; - shape_in[1] = inputs[1]->attr.size[0]; - shape_out[0] = 1; - for(i = 1; i <= outputs[0]->attr.dim_num; i++) - { - shape_out[i] = outputs[0]->attr.size[i - 1]; - } - rs_input = vsi_nn_reshape_tensor(self->graph, inputs[1], shape_in, 2); - rs_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape_out, outputs[0]->attr.dim_num + 1); + ret = vsi_nn_kernel_optimize_matrixmul_broadcast_shape( + inputs[0]->attr.size, + inputs[1]->attr.size, + outputs[0]->attr.size, + inputs[0]->attr.dim_num, + inputs[1]->attr.dim_num, + outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], new_rank, + &cross_flg, size_axis_inner_outer, stride_axis_inner_outer); + + if (ret) + { + vsi_nn_kernel_param_add_int32( param, "cross_flg", cross_flg ); + vsi_nn_kernel_param_add_buffer( param, "size_axis_inner_outer", size_axis_inner_outer, 3); + vsi_nn_kernel_param_add_buffer( param, "stride_axis_inner_outer", stride_axis_inner_outer, 9); - tmp_inputs[0] = inputs[0]; - tmp_inputs[1] = rs_input; - tmp_outputs[0] = rs_output; + tmp_inputs[0] = vsi_nn_reshape_tensor(self->graph, inputs[0], shapes[0], new_rank[0]); + tmp_inputs[1] = vsi_nn_reshape_tensor(self->graph, inputs[1], shapes[1], new_rank[1]); + tmp_outputs[0] = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes[2], new_rank[2]); } else { - tmp_inputs[0] = inputs[0]; - tmp_inputs[1] = inputs[1]; - tmp_outputs[0] = outputs[0]; + VSILOGE("illegal inputs shape"); + status = VSI_FAILURE; + goto final; } + n = vsi_nn_kernel_selector( self->graph, "matrixmul", tmp_inputs, 2, tmp_outputs, 1, param ); if ( n != NULL ) { @@ -119,19 +112,15 @@ static vsi_status op_compute status = VSI_SUCCESS; } +final: if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } - if (rs_input != NULL) - { - vsi_nn_ReleaseTensor( &rs_input ); - } - if (rs_output != NULL) - { - vsi_nn_ReleaseTensor( &rs_output ); - } + vsi_safe_release_tensor( tmp_inputs[0] ); + vsi_safe_release_tensor( tmp_inputs[1] ); + vsi_safe_release_tensor( tmp_outputs[0] ); return status; } /* op_compute() */ @@ -282,32 +271,17 @@ static vsi_bool op_setup outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1]; } } - else if (inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) - { - for (i = 2; i < inputs[0]->attr.dim_num; i++) - { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; - } - } - else if (inputs[1]->attr.dim_num > inputs[0]->attr.dim_num) - { - for (i = 2; i < inputs[1]->attr.dim_num; i++) - { - outputs[0]->attr.size[i] = 
inputs[1]->attr.size[i]; - } - } - else if (inputs[0]->attr.size[2] >= inputs[1]->attr.size[2]) - { - for (i = 2; i < inputs[0]->attr.dim_num; i++) - { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; - } - } else { - for (i = 2; i < inputs[1]->attr.dim_num; i++) + uint32_t rank0 = inputs[0]->attr.dim_num; + uint32_t rank1 = inputs[1]->attr.dim_num; + for (i = 2; i < outputs[0]->attr.dim_num; i++) { - outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; + vsi_size_t sz0 = i < rank0 ? inputs[0]->attr.size[i] : 1; + vsi_size_t sz1 = i < rank1 ? inputs[1]->attr.size[i] : 1; + vsi_size_t sz2 = vsi_nn_max(sz0, sz1); + + outputs[0]->attr.size[i] = sz2; } } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c index 57f8cad39..a94df5511 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c @@ -36,6 +36,7 @@ #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_link_list.h" #include "vsi_nn_internal_node.h" +#include "vsi_nn_error.h" typedef struct _max_pool3d_local_data_t { int32_t placeholder; @@ -54,6 +55,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -79,6 +82,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } @@ -89,7 +94,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_max_pool3d_param *p = &(self->nn_param.max_pool3d); vsi_size_t ksize[_cnt_of_array(p->ksize)] = {0}, i = 0; vsi_size_t pad[_cnt_of_array(p->pad)] = {0}; @@ -173,10 +178,14 @@ static vsi_bool op_setup memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); input_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); pool2d_0_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(pool2d_0_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_input_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_input_size, curr, "Create internal buffer failed", final); reshape_input_size[0] = inputs[0]->attr.size[0]; reshape_input_size[1] = inputs[0]->attr.size[1]; reshape_input_size[2] = 1; @@ -189,9 +198,10 @@ static vsi_bool op_setup curr->node->nn_param.reshape2.dim_num = 4; curr->inputs[0] = inputs[0]; curr->outputs[0] = input_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_POOL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.pool.ksize[0] = p->ksize[0]; curr->node->nn_param.pool.ksize[1] = p->ksize[1]; curr->node->nn_param.pool.stride[0] = p->stride[0]; @@ -205,28 +215,33 @@ static vsi_bool op_setup curr->node->nn_param.pool.pad_type = p->pad_type; curr->inputs[0] = input_tensor->t; curr->outputs[0] = pool2d_0_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret &= vsi_nn_internal_setup_node( self, curr ); if 
(p->ksize[2] == 1 && p->stride[2] == 1 && p->pad[4] == 0 && p->pad[5] == 0) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = outputs[0]->attr.size; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = pool2d_0_tensor->t; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret &= vsi_nn_internal_setup_node( self, curr ); } else { memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); reshape_0_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(reshape_0_tensor, "Create internal tensor failed", final); pool2d_1_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(pool2d_1_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_pool_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - reshape_pool_size[0] = -1; + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_pool_size, curr, "Create internal buffer failed", final); + reshape_pool_size[0] = (vsi_size_t)-1; reshape_pool_size[1] = inputs[0]->attr.size[2]; reshape_pool_size[2] = 1; for (i = 3; i < inputs[0]->attr.dim_num; i++) @@ -238,9 +253,10 @@ static vsi_bool op_setup curr->node->nn_param.reshape2.dim_num = 4; curr->inputs[0] = pool2d_0_tensor->t; curr->outputs[0] = reshape_0_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret &= vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_POOL, 1, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.pool.ksize[0] = 1; curr->node->nn_param.pool.ksize[1] = p->ksize[2]; curr->node->nn_param.pool.stride[0] = 1; @@ -254,16 +270,18 @@ static vsi_bool op_setup curr->node->nn_param.pool.pad_type = p->pad_type; curr->inputs[0] = reshape_0_tensor->t; curr->outputs[0] = pool2d_1_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret &= vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = outputs[0]->attr.size; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = pool2d_1_tensor->t; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret &= vsi_nn_internal_setup_node( self, curr ); } +final: return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c index 9df9c1b27..2deed48b7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c @@ -56,20 +56,29 @@ static vsi_status op_compute vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; uint32_t new_rank = 0; vsi_nn_kernel_param_t * param = NULL; - int32_t ksize_x = (int32_t)self->nn_param.pool.ksize[0]; - int32_t ksize_y = (int32_t)self->nn_param.pool.ksize[1]; - int32_t stride_x = (int32_t)self->nn_param.pool.stride[0]; - int32_t stride_y = (int32_t)self->nn_param.pool.stride[1]; - int32_t pad_left = (int32_t)self->nn_param.pool.pad[0]; - int32_t pad_right = (int32_t)self->nn_param.pool.pad[1]; - int32_t pad_top = 
(int32_t)self->nn_param.pool.pad[2]; - int32_t pad_bottom = (int32_t)self->nn_param.pool.pad[3]; + int32_t ksize_x = 0; + int32_t ksize_y = 0; + int32_t stride_x = 0; + int32_t stride_y = 0; + int32_t pad_left = 0; + int32_t pad_right = 0; + int32_t pad_top = 0; + int32_t pad_bottom = 0; if ( NULL == self ) { return VSI_FAILURE; } + ksize_x = (int32_t)self->nn_param.pool.ksize[0]; + ksize_y = (int32_t)self->nn_param.pool.ksize[1]; + stride_x = (int32_t)self->nn_param.pool.stride[0]; + stride_y = (int32_t)self->nn_param.pool.stride[1]; + pad_left = (int32_t)self->nn_param.pool.pad[0]; + pad_right = (int32_t)self->nn_param.pool.pad[1]; + pad_top = (int32_t)self->nn_param.pool.pad[2]; + pad_bottom = (int32_t)self->nn_param.pool.pad[3]; + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_optimize_nchw2xhw_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c b/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c index 29310ad96..7be779db1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c @@ -59,13 +59,15 @@ static vsi_status op_compute vsi_size_t new_rank = 0; vsi_bool ret; vsi_nn_kernel_param_t * param = NULL; - int32_t isfmod = (int32_t)self->nn_param.mod.fmod; + int32_t isfmod = 0; if (NULL == self) { return VSI_FAILURE; } + isfmod = (int32_t)self->nn_param.mod.fmod; + param = vsi_nn_kernel_param_create(); ret = vsi_nn_kernel_optimize_eltwise_shape( @@ -183,6 +185,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + in1_rank = inputs[0]->attr.dim_num; in2_rank = inputs[1]->attr.dim_num; out_rank = vsi_nn_max( in1_rank, in2_rank ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c index 8276c0f7c..39dda244d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c @@ -71,13 +71,14 @@ static void _set_io_index vxSetParameterByIndex(self->n, idx++, (vx_reference)inputs[i]->t); scalar_index = idx; param = vxGetParameterByIndex(self->n, scalar_index); - vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (param != NULL) + + if (param) { + vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); vxReleaseParameter(¶m); param = NULL; - } + if (type != VX_TYPE_SCALAR) { continue; @@ -92,17 +93,18 @@ static void _set_io_index vx_reference ref = 0; vsi_status status; param = vxGetParameterByIndex(self->n, j); - vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); - status = vxQueryScalar((vx_scalar)ref, VX_SCALAR_TYPE, &data_type, sizeof(vx_enum)); - if (status == VX_ERROR_INVALID_REFERENCE) - { - vx_scalar scalar = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, 0); - ref = (vx_reference)scalar; - vxSetParameterByIndex(self->n, idx++, ref); - vxReleaseReference(&ref); - } - if (param != NULL) + + if (param) { + vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); + status = vxQueryScalar((vx_scalar)ref, VX_SCALAR_TYPE, &data_type, sizeof(vx_enum)); + if (status == VX_ERROR_INVALID_REFERENCE) + { + vx_scalar scalar = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, 0); + ref = (vx_reference)scalar; + vxSetParameterByIndex(self->n, idx++, ref); + vxReleaseReference(&ref); + } vxReleaseParameter(¶m); param = NULL; } @@ -165,6 +167,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + 
VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -178,6 +183,9 @@ static vsi_bool op_setup /* * Network Binary Graph node do not need to calculate output shape */ + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c index 3c8a57d0a..acd1c9eae 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c @@ -85,6 +85,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -96,6 +99,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = 1; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c index 71a5e0786..766392ac4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c @@ -43,6 +43,9 @@ static vsi_status op_compute ) { int i; + + VSI_UNREFERENCED(self); + for( i = 0; i < 10; i ++ ) { if( NULL == outputs[i] ) @@ -65,6 +68,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c index 2c7dba946..111fc3d3c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c @@ -156,6 +156,7 @@ static vsi_status op_compute attr.is_const = FALSE; convert_tensor = vsi_nn_CreateTensor(self->graph, &attr); + CHECK_PTR_FAIL_GOTO( convert_tensor, "Create tensor fail.", final ); self->n = vxTensorCopyNode( self->graph->g, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c index 399d0c6be..146ee332f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c @@ -51,6 +51,8 @@ static vsi_bool _is_same_memory_shape uint32_t dim_num0 = inputs[0]->attr.dim_num; uint32_t dim_num1 = self->nn_param.permute.dim_num; + VSI_UNREFERENCED(outputs); + if (dim_num0 != dim_num1) return FALSE; @@ -102,6 +104,8 @@ static vsi_bool _is_same_quant { vsi_nn_dtype_t *dtype,*_dtype; + VSI_UNREFERENCED(self); + dtype = &inputs[0]->attr.dtype; _dtype = &outputs[0]->attr.dtype; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c index cfdf7c2f1..24b0d6260 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c @@ -34,7 +34,6 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" @@ -136,21 +135,28 @@ static vsi_status op_compute vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; uint32_t new_rank = 0; - vsi_bool ret; + vsi_bool ret = FALSE; vsi_nn_kernel_param_t * param = NULL; - int32_t ksize_x = (int32_t)self->nn_param.pool.ksize[0]; - int32_t ksize_y = (int32_t)self->nn_param.pool.ksize[1]; - int32_t stride_x = 
(int32_t)self->nn_param.pool.stride[0]; - int32_t stride_y = (int32_t)self->nn_param.pool.stride[1]; - int32_t pad_x = (int32_t)self->nn_param.pool.pad[0]; - int32_t pad_y = (int32_t)self->nn_param.pool.pad[2]; + int32_t ksize_x = 0; + int32_t ksize_y = 0; + int32_t stride_x = 0; + int32_t stride_y = 0; + int32_t pad_x = 0; + int32_t pad_y = 0; - if( NULL == self ) + if ( NULL == self ) { return VSI_FAILURE; } - param =vsi_nn_kernel_param_create(); + ksize_x = (int32_t)self->nn_param.pool.ksize[0]; + ksize_y = (int32_t)self->nn_param.pool.ksize[1]; + stride_x = (int32_t)self->nn_param.pool.stride[0]; + stride_y = (int32_t)self->nn_param.pool.stride[1]; + pad_x = (int32_t)self->nn_param.pool.pad[0]; + pad_y = (int32_t)self->nn_param.pool.pad[2]; + + param = vsi_nn_kernel_param_create(); ret = vsi_nn_poolwithargmax_optimize_shape(self, (vsi_ssize_t*)inputs[0]->attr.size, (vsi_ssize_t*)outputs[0]->attr.size, @@ -164,7 +170,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "pad_x", pad_x ); vsi_nn_kernel_param_add_int32( param, "pad_y", pad_y ); - if( ret ) + if ( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], shapes[0], new_rank ); @@ -180,7 +186,7 @@ static vsi_status op_compute vsi_nn_ReleaseTensor( &reshape_tensors[2] ); } - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } @@ -270,10 +276,12 @@ static vsi_bool op_setup self->nn_param.pool.pad[i] = (uint32_t)pad[i]; } - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, inputs, outputs ); - + } + if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + { outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; memcpy( outputs[1]->attr.size, outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c index 18942faf4..9b060f141 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_internal_node.h" @@ -48,6 +48,8 @@ static vsi_bool _is_same_type vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if(vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { return FALSE; @@ -63,6 +65,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -73,6 +77,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -85,6 +92,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -95,7 +104,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret; + vsi_bool ret = FALSE; uint32_t i; uint32_t axis; vsi_nn_tensor_attr_t attr; @@ -112,7 +121,6 @@ static vsi_bool op_setup return FALSE; } - ret = TRUE; /* output */ if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -152,6 +160,7 @@ static vsi_bool op_setup self->nn_param.post_process.local.enable_perm == FALSE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = outputs[0]->attr.size; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[POST_PROCESS_INPUT]; @@ -163,6 +172,7 @@ static vsi_bool op_setup self->nn_param.post_process.local.enable_perm == FALSE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[POST_PROCESS_INPUT]; curr->outputs[0] = outputs[POST_PROCESS_OUTPUT]; @@ -172,6 +182,7 @@ static vsi_bool op_setup self->nn_param.post_process.local.enable_perm == TRUE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.permute.perm = self->nn_param.post_process.perm; curr->node->nn_param.permute.dim_num = self->nn_param.post_process.dim_num; curr->inputs[0] = inputs[POST_PROCESS_INPUT]; @@ -187,8 +198,10 @@ static vsi_bool op_setup attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.permute.perm = self->nn_param.post_process.perm; curr->node->nn_param.permute.dim_num = self->nn_param.post_process.dim_num; curr->inputs[0] = inputs[POST_PROCESS_INPUT]; @@ -197,12 +210,15 @@ static vsi_bool op_setup vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = output_tensor->t; curr->outputs[0] = outputs[POST_PROCESS_OUTPUT]; vsi_nn_internal_setup_node(self, curr); } + ret = TRUE; +final: return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index aa5b46c1b..f977e32d0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -36,6 +36,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_util.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -44,8 +45,27 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = vsi_nn_internal_compute_node( self ); - self->n = vsi_nn_internal_get_node_by_uid(self, 1)->node->n; + vsi_status status = VSI_SUCCESS; + vsi_nn_internal_node_t* interal_node = NULL; + + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + + status = vsi_nn_internal_compute_node( self ); + 
CHECK_STATUS_FAIL_GOTO(status, final ); + + interal_node = vsi_nn_internal_get_node_by_uid(self, 1); + + if (interal_node) + { + self->n = interal_node->node->n; + } + else + { + status = VSI_FAILURE; + } + +final: return status; } /* op_compute() */ @@ -56,6 +76,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -68,6 +91,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -81,9 +106,10 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. */ vsi_nn_internal_node_t* curr = NULL; vsi_nn_pre_process_param * p = NULL; - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_tensor_t* preprocess_tensor = NULL; vsi_nn_preprocess_dest_layout_e layout = VSI_NN_DEST_LAYOUT_NCHW; + vsi_bool enable_rgb88_planar_nhwc = FALSE; p = (vsi_nn_pre_process_param *)&(self->nn_param.pre_process); @@ -122,11 +148,18 @@ static vsi_bool op_setup if (i != self->nn_param.pre_process_rgb.dim_num) { layout = VSI_NN_DEST_LAYOUT_NHWC; + + if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP) + { + enable_rgb88_planar_nhwc = self->graph->ctx->options.enable_rgb88_planar_nhwc; + } } - if (layout == VSI_NN_DEST_LAYOUT_NHWC) + if (layout == VSI_NN_DEST_LAYOUT_NHWC && !enable_rgb88_planar_nhwc) { memcpy( &attr, &outputs[PRE_PROCESS_OUTPUT]->attr, sizeof( attr ) ); + attr.size[0] = p->output_attr.size[1]; attr.size[1] = p->output_attr.size[2]; attr.size[2] = p->output_attr.size[0]; @@ -136,7 +169,8 @@ static vsi_bool op_setup attr.vtl = use_virtual_tensor; attr.is_const = FALSE; - preprocess_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + preprocess_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(preprocess_tensor, "Create internal tensor failed", final); } } @@ -145,6 +179,7 @@ static vsi_bool op_setup case VSI_NN_SOURCE_FORMAT_TENSOR: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_TENSOR, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.pre_process_tensor.perm = p->perm; curr->node->nn_param.pre_process_tensor.dim_num = p->dim_num; @@ -152,12 +187,13 @@ static vsi_bool op_setup curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_GRAY: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_GRAY, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.pre_process_gray.mean = p->norm.mean[0]; curr->node->nn_param.pre_process_gray.scale = p->norm.scale; @@ -178,27 +214,33 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_RGB: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; 
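/*
 * The per-channel r/g/b_scale fields introduced in this pre-process hunk
 * replace the single rgb_scale. A scalar sketch of the normalization they
 * parameterize, including the reverse_channel swap performed by the
 * mean[2]/mean[0] and norm2.scale[2]/scale[0] assignments around this point.
 * The (value - mean) * scale formula and all names here are illustrative
 * assumptions, not the kernel's exact arithmetic.
 */
static float demo_normalize_channel_value(float value, float mean, float scale)
{
    return (value - mean) * scale;      /* applied independently per channel */
}

static void demo_resolve_channel_order(int reverse_channel,
                                       const float mean[3], const float scale[3],
                                       float r_out[2], float g_out[2], float b_out[2])
{
    /* reverse_channel makes the R parameters read from index 2 and the B
     * parameters from index 0, mirroring the branch in the diff. */
    const int r_idx = reverse_channel ? 2 : 0;
    const int b_idx = reverse_channel ? 0 : 2;

    r_out[0] = mean[r_idx]; r_out[1] = scale[r_idx];
    g_out[0] = mean[1];     g_out[1] = scale[1];
    b_out[0] = mean[b_idx]; b_out[1] = scale[b_idx];
}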
curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_rgb.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_rgb.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_rgb.b_scale = p->norm2.scale[0]; } else { curr->node->nn_param.pre_process_rgb.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_rgb.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_rgb.b_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_rgb.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_rgb.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_rgb.b_scale = p->norm2.scale[2]; } - curr->node->nn_param.pre_process_rgb.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_rgb.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_rgb.rect.left = p->rect.left; curr->node->nn_param.pre_process_rgb.rect.top = p->rect.top; @@ -219,27 +261,51 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_YUV420: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV420, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[0]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv420.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv420.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv420.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv420.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_yuv420.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv420.b_scale = p->norm2.scale[0]; + } } else { curr->node->nn_param.pre_process_yuv420.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_yuv420.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv420.b_mean = p->norm.mean[2]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv420.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv420.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv420.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv420.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_yuv420.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv420.b_scale = p->norm2.scale[2]; + } } - curr->node->nn_param.pre_process_yuv420.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_yuv420.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_yuv420.rect.left = p->rect.left; curr->node->nn_param.pre_process_yuv420.rect.top = p->rect.top; @@ -262,27 +328,51 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_BGRA: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_BGRA, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; 
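/*
 * The repeated vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1 checks in
 * the YUV420 / BGRA / RGB888_PLANAR hunks keep graphs whose version compares
 * below 1.1.83 on the legacy single norm.scale, while newer graphs receive
 * independent per-channel norm2.scale values. A condensed sketch of that
 * selection; the helper name and flattened arguments are hypothetical and
 * only restate the branch structure of the surrounding diff.
 */
static void demo_select_channel_scales(int graph_older_than_1_1_83,
                                       int reverse_channel,
                                       float legacy_scale,
                                       const float scale2[3],
                                       float out_scale[3])      /* r, g, b */
{
    if (graph_older_than_1_1_83)
    {
        /* legacy behaviour: one scale shared by all three channels */
        out_scale[0] = legacy_scale;
        out_scale[1] = legacy_scale;
        out_scale[2] = legacy_scale;
    }
    else
    {
        /* new behaviour: per-channel scales, honouring reverse_channel */
        out_scale[0] = reverse_channel ? scale2[2] : scale2[0];
        out_scale[1] = scale2[1];
        out_scale[2] = reverse_channel ? scale2[0] : scale2[2];
    }
}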
curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[0]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_bgra.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_bgra.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_bgra.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_bgra.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_bgra.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_bgra.b_scale = p->norm2.scale[0]; + } } else { curr->node->nn_param.pre_process_bgra.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_bgra.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_bgra.b_mean = p->norm.mean[2]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_bgra.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_bgra.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_bgra.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_bgra.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_bgra.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_bgra.b_scale = p->norm2.scale[2]; + } } - curr->node->nn_param.pre_process_bgra.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_bgra.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_bgra.rect.left = p->rect.left; curr->node->nn_param.pre_process_bgra.rect.top = p->rect.top; @@ -303,59 +393,30 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR: case VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP: { - uint32_t i = 0; - uint32_t axis = 2; vsi_bool is_input_sep = p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ? FALSE : TRUE; - vsi_nn_internal_tensor_t * output_tensor_group[3] = {NULL}; - vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL }; - vsi_nn_tensor_attr_t attr; float mean[3] = {0}; - vsi_size_t size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); - for(i = 0; i < p->output_attr.dim_num; i++) - { - attr.size[i] = -1 == p->output_attr.size[i] ? 
-1 : (vsi_size_t)p->output_attr.size[i]; - } - attr.size[axis] = 1; - attr.vtl = TRUE; - attr.is_const = FALSE; - output_tensor_group[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - output_tensor_group[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - output_tensor_group[2] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_32bit[i] = attr.size[i]; - } if (p->reverse_channel) { - int32_t order[3] = {2, 1, 0}; - mean[0] = p->norm.mean[2]; mean[1] = p->norm.mean[1]; mean[2] = p->norm.mean[0]; - - vsi_nn_reorder_tensor( (vsi_nn_tensor_t **)output_tensor_group, order, - 3, (vsi_nn_tensor_t **)tmp_outputs ); } else { mean[0] = p->norm.mean[0]; mean[1] = p->norm.mean[1]; mean[2] = p->norm.mean[2]; - - memmove( tmp_outputs, output_tensor_group, sizeof(vsi_nn_tensor_t*) * 3 ); } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_RGB888_PLANAR, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (is_input_sep) { curr->inputs[0] = inputs[0]; @@ -368,28 +429,7 @@ static vsi_bool op_setup curr->inputs[1] = NULL; curr->inputs[2] = NULL; } - curr->outputs[0] = output_tensor_group[0]->t; - curr->outputs[1] = output_tensor_group[1]->t; - curr->outputs[2] = output_tensor_group[2]->t; - curr->node->nn_param.pre_process_rgb888_planar.r_mean = mean[0]; - curr->node->nn_param.pre_process_rgb888_planar.g_mean = mean[1]; - curr->node->nn_param.pre_process_rgb888_planar.b_mean = mean[2]; - curr->node->nn_param.pre_process_rgb888_planar.scale = p->norm.scale; - curr->node->nn_param.pre_process_rgb888_planar.rect.left = p->rect.left; - curr->node->nn_param.pre_process_rgb888_planar.rect.top = p->rect.top; - curr->node->nn_param.pre_process_rgb888_planar.rect.width = p->rect.width; - curr->node->nn_param.pre_process_rgb888_planar.rect.height = p->rect.height; - curr->node->nn_param.pre_process_rgb888_planar.output_attr.size = size_32bit; - curr->node->nn_param.pre_process_rgb888_planar.output_attr.dim_num = p->output_attr.dim_num; - vsi_nn_internal_setup_node(self, curr); - - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 3, 1 ); - - curr->node->nn_param.concat.axis = axis; - curr->inputs[0] = tmp_outputs[0]->t; - curr->inputs[1] = tmp_outputs[1]->t; - curr->inputs[2] = tmp_outputs[2]->t; - if (layout == VSI_NN_DEST_LAYOUT_NHWC) + if (layout == VSI_NN_DEST_LAYOUT_NHWC && !enable_rgb88_planar_nhwc) { curr->outputs[0] = preprocess_tensor->t; } @@ -398,27 +438,93 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + if (p->reverse_channel) + { + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm2.scale[0]; + } + } + else + { + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_rgb888_planar.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_rgb888_planar.r_scale = 
p->norm2.scale[0]; + curr->node->nn_param.pre_process_rgb888_planar.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_rgb888_planar.b_scale = p->norm2.scale[2]; + } + } + + curr->node->nn_param.pre_process_rgb888_planar.r_mean = mean[0]; + curr->node->nn_param.pre_process_rgb888_planar.g_mean = mean[1]; + curr->node->nn_param.pre_process_rgb888_planar.b_mean = mean[2]; + curr->node->nn_param.pre_process_rgb888_planar.rect.left = p->rect.left; + curr->node->nn_param.pre_process_rgb888_planar.rect.top = p->rect.top; + curr->node->nn_param.pre_process_rgb888_planar.rect.width = p->rect.width; + curr->node->nn_param.pre_process_rgb888_planar.rect.height = p->rect.height; + curr->node->nn_param.pre_process_rgb888_planar.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_rgb888_planar.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_rgb888_planar.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_rgb888_planar.enable_rgb88_planar_nhwc = enable_rgb88_planar_nhwc; + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_YUV444: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV444, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[0]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv444.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv444.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv444.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv444.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_yuv444.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv444.b_scale = p->norm2.scale[0]; + } } else { curr->node->nn_param.pre_process_yuv444.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_yuv444.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv444.b_mean = p->norm.mean[2]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv444.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv444.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv444.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv444.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_yuv444.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv444.b_scale = p->norm2.scale[2]; + } } - curr->node->nn_param.pre_process_yuv444.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_yuv444.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_yuv444.rect.left = p->rect.left; curr->node->nn_param.pre_process_yuv444.rect.top = p->rect.top; @@ -441,25 +547,50 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_NV21: case VSI_NN_SOURCE_FORMAT_IMAGE_NV12: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_NV12, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[2]; 
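/* A minimal sketch (assumption, not code from this patch) of the backward-compatibility
 * rule each image-format branch below applies before choosing its scales: graphs built
 * against ovxlib versions older than 1.1.83 only carry the legacy single norm.scale, so
 * it is broadcast to all three channels; newer graphs provide per-channel norm2.scale[]
 * values (mirrored when reverse_channel is set, as above).
 *
 *   float r_scale, g_scale, b_scale;
 *   if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1)
 *   {
 *       r_scale = g_scale = b_scale = p->norm.scale;   // legacy: one scale for all channels
 *   }
 *   else
 *   {
 *       r_scale = p->norm2.scale[0];
 *       g_scale = p->norm2.scale[1];
 *       b_scale = p->norm2.scale[2];
 *   }
 */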
curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[0]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_nv12.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_nv12.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_nv12.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_nv12.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_nv12.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_nv12.b_scale = p->norm2.scale[0]; + } } else { curr->node->nn_param.pre_process_nv12.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_nv12.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[2]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_nv12.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_nv12.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_nv12.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_nv12.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_nv12.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_nv12.b_scale = p->norm2.scale[2]; + } } if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) @@ -471,7 +602,6 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV21; } - curr->node->nn_param.pre_process_nv12.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_nv12.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_nv12.rect.left = p->rect.left; curr->node->nn_param.pre_process_nv12.rect.top = p->rect.top; @@ -493,25 +623,50 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; case VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422: case VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV422, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); if (p->reverse_channel) { curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[2]; curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[0]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv422.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv422.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv422.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv422.r_scale = p->norm2.scale[2]; + curr->node->nn_param.pre_process_yuv422.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv422.b_scale = p->norm2.scale[0]; + } } else { curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[0]; curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1]; curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[2]; + if (vsi_nn_compareVersion(self->graph, 1, 1, 83) == -1) + { + curr->node->nn_param.pre_process_yuv422.r_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv422.g_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv422.b_scale = p->norm.scale; + } + else + { + curr->node->nn_param.pre_process_yuv422.r_scale = p->norm2.scale[0]; + curr->node->nn_param.pre_process_yuv422.g_scale = p->norm2.scale[1]; + curr->node->nn_param.pre_process_yuv422.b_scale = 
p->norm2.scale[2]; + } } if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422) @@ -523,7 +678,6 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_yuv422.yuv422_type = 1; } - curr->node->nn_param.pre_process_yuv422.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_yuv422.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_yuv422.rect.left = p->rect.left; curr->node->nn_param.pre_process_yuv422.rect.top = p->rect.top; @@ -544,13 +698,13 @@ static vsi_bool op_setup curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } break; default: { VSILOGE( "Not support this type!(PRE_PROCESS)\n"); - ret = FALSE; + goto final; } break; } @@ -564,22 +718,24 @@ static vsi_bool op_setup p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY || - p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || - p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP + (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR && !enable_rgb88_planar_nhwc) || + (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP && !enable_rgb88_planar_nhwc) ) { if (layout == VSI_NN_DEST_LAYOUT_NHWC) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.permute.perm = p->perm; curr->node->nn_param.permute.dim_num = p->dim_num; curr->inputs[0] = preprocess_tensor->t; curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } } +final: return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c index c1be23962..2c5e5b77d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c @@ -60,7 +60,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_bgra.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_bgra.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_bgra.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_bgra.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_bgra.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_bgra.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_bgra.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_bgra.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_bgra.local.enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_bgra.local.enable_copy ); @@ -111,6 +113,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_pre_process_bgra_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_bgra_param *)&(self->nn_param.pre_process_bgra); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c index d264ee7fa..6bc1f796b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c @@ -115,6 +115,9 @@ static vsi_bool op_setup { vsi_nn_pre_process_gray_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_gray_param *)&(self->nn_param.pre_process_gray); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index 09eb682ff..7fa635a5b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -56,7 +56,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_nv12.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_nv12.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_nv12.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_nv12.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_nv12.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_nv12.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_nv12.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_nv12.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_nv12.local->enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_nv12.local->enable_copy ); @@ -113,6 +115,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_pre_process_nv12_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_nv12_param *)&(self->nn_param.pre_process_nv12); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c index 6d19e4a47..80acd7974 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -59,7 +59,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_rgb.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_rgb.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_rgb.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_rgb.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_rgb.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_rgb.local.enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb.local.enable_copy ); @@ -116,6 +118,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_rgb_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_rgb_param *)&(self->nn_param.pre_process_rgb); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c index 13a636d78..3c27ecc19 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c @@ -47,7 +47,7 @@ typedef struct _pre_process_rgb888_planar_local_data_t { Declare number of input and output. 
*/ #define _INPUT_NUM (3) -#define _OUTPUT_NUM (3) +#define _OUTPUT_NUM (1) static vsi_status op_compute ( @@ -59,21 +59,35 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; + vsi_nn_pre_process_rgb888_planar_param * p = NULL; + + p = (vsi_nn_pre_process_rgb888_planar_param *)&(self->nn_param.pre_process_rgb888_planar); param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_rgb888_planar.local->scale_x ); - vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_rgb888_planar.local->scale_y ); - vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_rgb888_planar.rect.left ); - vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_rgb888_planar.rect.top ); - vsi_nn_kernel_param_add_int32( param, "width", self->nn_param.pre_process_rgb888_planar.rect.width ); - vsi_nn_kernel_param_add_int32( param, "height", self->nn_param.pre_process_rgb888_planar.rect.height ); - vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_rgb888_planar.r_mean ); - vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_rgb888_planar.g_mean ); - vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb888_planar.b_mean ); - vsi_nn_kernel_param_add_float32( param, "scale", self->nn_param.pre_process_rgb888_planar.scale ); - vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb888_planar.local->enable_copy ); - - n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar", inputs, 3, outputs, 3, param ); + vsi_nn_kernel_param_add_int32( param, "scale_x", p->local->scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", p->local->scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", p->rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", p->rect.top ); + vsi_nn_kernel_param_add_int32( param, "width", p->rect.width ); + vsi_nn_kernel_param_add_int32( param, "height", p->rect.height ); + vsi_nn_kernel_param_add_float32( param, "r_mean", p->r_mean ); + vsi_nn_kernel_param_add_float32( param, "g_mean", p->g_mean ); + vsi_nn_kernel_param_add_float32( param, "b_mean", p->b_mean ); + vsi_nn_kernel_param_add_float32( param, "r_scale", p->r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", p->g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", p->b_scale ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", p->local->enable_copy ); + vsi_nn_kernel_param_add_int32( param, "reverse", p->reverse_channel ); + + if (p->enable_rgb88_planar_nhwc) + { + n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar_nhwc", inputs, 3, outputs, 1, param ); + } + else + { + n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar", inputs, 3, outputs, 1, param ); + } + if ( n != NULL ) { self->n = (vx_node)n; @@ -97,11 +111,11 @@ static vsi_bool op_check { if (inputs[1] == NULL) { - BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 1, 3) - IO_TYPE(D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8, D_F16, D_F16, D_F16) + BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 1, 1) + IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR) if 
(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, 1, @@ -115,11 +129,11 @@ static vsi_bool op_check } else { - BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 3, 3) - IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8, D_U8, D_U8, D_F16, D_F16, D_F16) + BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 3, 1) + IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR) if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, self->input.num, @@ -144,6 +158,9 @@ static vsi_bool op_setup { vsi_nn_pre_process_rgb888_planar_param * p = NULL; uint32_t i = 0, j = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_rgb888_planar_param *)&(self->nn_param.pre_process_rgb888_planar); if (p->rect.width == 0 || p->rect.height == 0) @@ -163,29 +180,34 @@ static vsi_bool op_setup } } - for (j = 0; j < 3; j++) + + if ( VSI_NN_DIM_AUTO == outputs[j]->attr.dim_num ) { - if ( VSI_NN_DIM_AUTO == outputs[j]->attr.dim_num ) + if (p->output_attr.dim_num > 0) { - if (p->output_attr.dim_num > 0) - { - outputs[j]->attr.dim_num = p->output_attr.dim_num; - for (i = 0; i < p->output_attr.dim_num; i++) - { - outputs[j]->attr.dim_num = p->output_attr.dim_num; - outputs[j]->attr.size[i] = p->output_attr.size[i]; - } - } - else + outputs[j]->attr.dim_num = p->output_attr.dim_num; + for (i = 0; i < p->output_attr.dim_num; i++) { - VSILOGE("output dim num cannot be zero!(PRE_PROCESS_RGB888_PLANAR)\n"); - return FALSE; + outputs[j]->attr.size[i] = p->output_attr.size[i]; } } + else + { + VSILOGE("output dim num cannot be zero!(PRE_PROCESS_RGB888_PLANAR)\n"); + return FALSE; + } } - p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]); - p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]); + if (p->enable_rgb88_planar_nhwc) + { + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[1]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[2]); + } + else + { + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]); + } p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c index b4220a716..9886be018 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_internal_node.h" @@ -48,6 +48,8 @@ static vsi_bool _is_same_type vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if(vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { return FALSE; @@ -63,6 +65,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -73,6 +77,9 @@ static vsi_bool op_check 
vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -85,6 +92,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -95,7 +104,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret; + vsi_bool ret = FALSE; uint32_t i; uint32_t axis; vsi_nn_tensor_attr_t attr; @@ -112,7 +121,6 @@ static vsi_bool op_setup return FALSE; } - ret = TRUE; /* output */ if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -125,7 +133,7 @@ static vsi_bool op_setup VSILOGE( "Error permute axis '%u', the dim is '%u' ", axis, inputs[0]->attr.dim_num ); ret = FALSE; - break; + goto final; } outputs[0]->attr.size[i] = inputs[0]->attr.size[axis]; } @@ -152,32 +160,35 @@ static vsi_bool op_setup self->nn_param.pre_process_tensor.local.enable_perm == FALSE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = outputs[0]->attr.size; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (self->nn_param.pre_process_tensor.local.enable_data_conv == TRUE && self->nn_param.pre_process_tensor.local.enable_perm == FALSE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (self->nn_param.pre_process_tensor.local.enable_data_conv == FALSE && self->nn_param.pre_process_tensor.local.enable_perm == TRUE) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.permute.perm = self->nn_param.pre_process_tensor.perm; curr->node->nn_param.permute.dim_num = self->nn_param.pre_process_tensor.dim_num; curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else { @@ -187,22 +198,26 @@ static vsi_bool op_setup attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; curr->outputs[0] = output_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.permute.perm = self->nn_param.pre_process_tensor.perm; curr->node->nn_param.permute.dim_num = self->nn_param.pre_process_tensor.dim_num; curr->inputs[0] = output_tensor->t; curr->outputs[0] = 
outputs[PRE_PROCESS_TENSOR_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); + ret &= vsi_nn_internal_setup_node(self, curr); } +final: return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c index bcac93c3c..37696ff6c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c @@ -56,7 +56,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv420.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv420.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv420.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv420.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_yuv420.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_yuv420.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_yuv420.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv420.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv420.local.enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv420.local.enable_copy ); @@ -113,6 +115,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_yuv420_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_yuv420_param *)&(self->nn_param.pre_process_yuv420); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c index b9c4daf33..3922de4c2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c @@ -65,7 +65,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv422.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv422.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv422.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv422.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_yuv422.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_yuv422.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_yuv422.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv422.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv422.local->enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv422.local->enable_copy ); @@ -123,6 +125,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_pre_process_yuv422_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_yuv422_param *)&(self->nn_param.pre_process_yuv422); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c index 6a350d16e..baa5cc440 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c @@ -56,7 +56,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv444.r_mean ); vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv444.g_mean ); vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv444.b_mean ); - vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv444.rgb_scale ); + vsi_nn_kernel_param_add_float32( param, "r_scale", self->nn_param.pre_process_yuv444.r_scale ); + vsi_nn_kernel_param_add_float32( param, "g_scale", self->nn_param.pre_process_yuv444.g_scale ); + vsi_nn_kernel_param_add_float32( param, "b_scale", self->nn_param.pre_process_yuv444.b_scale ); vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv444.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv444.local->enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv444.local->enable_copy ); @@ -113,6 +115,9 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. */ vsi_nn_pre_process_yuv444_param * p = NULL; uint32_t i = 0; + + VSI_UNREFERENCED(inputs); + p = (vsi_nn_pre_process_yuv444_param *)&(self->nn_param.pre_process_yuv444); if (p->rect.width == 0 || p->rect.height == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c index b66a5cf01..2bdc1362f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c @@ -213,6 +213,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -277,6 +280,8 @@ static vsi_status op_optimize uint32_t dim; vx_tensor rois_tmp, score_tmp; + VSI_UNREFERENCED(inputs); + rois_tmp = NULL, score_tmp = NULL; if( direction == VSI_NN_OPTIMIZE_BACKWARD ) { @@ -326,16 +331,20 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - vx_tensor rois = self->nn_param.proposal.local.rois; - vx_tensor score = self->nn_param.proposal.local.score; - if( NULL != self && NULL != self->n ) + vx_tensor rois = NULL; + vx_tensor score = NULL; + + if ( NULL != self && NULL != self->n ) { - if(rois) + rois = self->nn_param.proposal.local.rois; + score = self->nn_param.proposal.local.score; + + if (rois) { vxReleaseTensor(&rois); rois = NULL; } - if(score) + if (score) { vxReleaseTensor(&score); score = NULL; @@ -343,6 +352,11 @@ static vsi_status op_deinit vxReleaseNode( &self->n ); self->n = NULL; } + else + { + return VSI_FAILURE; + } + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c index 4ea879fbf..c203fdd6a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c @@ -55,6 +55,9 @@ static 
vsi_status op_compute VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); */ + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + if( NULL != self->n ) { status = VSI_SUCCESS; @@ -69,6 +72,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -80,6 +86,10 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + /* TODO: Add code to comput outputs' shape. */ return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index a7a549448..dcbb75b04 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -36,6 +36,7 @@ #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "vsi_nn_error.h" #define _ARG_NUM (6) #define _INPUT_NUM (1) @@ -209,6 +210,8 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(outputs); + if ( self->nn_param.reduce.local2->use_internal_node ) { status = vsi_nn_internal_compute_node( self ); @@ -219,7 +222,7 @@ static vsi_status op_compute vsi_nn_tensor_t *axis_tensor = NULL; vsi_nn_tensor_t *axis_tensor2 = NULL; vsi_nn_tensor_attr_t attr, attr2; - vx_int32 resolved_dim[4] = {-1, -1, -1, -1}; + vx_int32 resolved_dim[VSI_NN_MAX_DIM_NUM] = {-1}; vx_int32 resolved_dim_count = 0; uint32_t i = 0; vsi_size_t re_sizes[VSI_NN_MAX_DIM_NUM] = {1}; @@ -230,6 +233,9 @@ static vsi_status op_compute vsi_nn_tensor_t *reshaped_output1 = self->nn_param.reduce.local2->reshaped_output1; char tensor_name[128]; + CHECK_PTR_FAIL_GOTO( reshaped_input1, "check tensor pointer.", final ); + CHECK_PTR_FAIL_GOTO( reshaped_output1, "check tensor pointer.", final ); + memset(tensor_name, 0, sizeof(tensor_name)); snprintf(tensor_name, sizeof(tensor_name), @@ -240,11 +246,20 @@ static vsi_status op_compute { VSILOGW("Set uid %u reduce reshaped output name fail", self->uid); - return VSI_FAILURE; + + status = VSI_FAILURE; + goto final; } resolved_dim_count = self->nn_param.reduce.local2->axes_num; + if (resolved_dim_count > VSI_NN_MAX_DIM_NUM) + { + VSILOGE("resolved_dim_count greater than VSI_NN_MAX_DIM_NUM"); + + status = VSI_FAILURE; + goto final; + } for (i = 0; i < (uint32_t)resolved_dim_count; i++) { @@ -313,7 +328,7 @@ static vsi_status op_compute input_t, output_t); } - else if (3 == resolved_dim[resolved_dim_count - 1] && resolved_dim_count < 3) + else if (resolved_dim_count > 0 && 3 == resolved_dim[resolved_dim_count - 1] && resolved_dim_count < 3) { if (1 == resolved_dim_count) { @@ -349,6 +364,7 @@ static vsi_status op_compute attr2.size[resolved_dim[0]] = 1; attr2.vtl = FALSE; mean_tmp_tensor = vsi_nn_CreateTensor(self->graph, &attr2); + CHECK_PTR_FAIL_GOTO( mean_tmp_tensor, "Create tensor fail.", final ); self->nn_param.reduce.local2->reshaped_tmp = mean_tmp_tensor; re_sizes[resolved_dim[0]] = 1; memset(&attr, 0, sizeof(attr)); @@ -433,6 +449,8 @@ static vsi_status op_compute attr2.size[resolved_dim[1]] = 1; attr2.vtl = FALSE; mean_tmp_tensor = vsi_nn_CreateTensor(self->graph, &attr2); + CHECK_PTR_FAIL_GOTO( mean_tmp_tensor, "Create tensor fail.", final ); + self->nn_param.reduce.local2->reshaped_tmp = mean_tmp_tensor; re_sizes[resolved_dim[0]] = 1; re_sizes[resolved_dim[1]] = 1; @@ -446,11 +464,8 @@ 
static vsi_status op_compute self->graph, (uint8_t *)&resolved_dim[0], &attr); - if( NULL == axis_tensor ) - { - VSILOGE("Create axis_tensor fail.(reduce)"); - return VSI_FAILURE; - } + CHECK_PTR_FAIL_GOTO( axis_tensor, "Create tensor fail.", final ); + self->nn_param.reduce.local.axis_tensor = axis_tensor; status = op_comput_reduce_mean(self, axis_tensor, @@ -512,6 +527,7 @@ static vsi_status op_compute } } +final: return status; } /* op_compute() */ @@ -523,6 +539,9 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + if ( self->nn_param.reduce.local2->use_internal_node ) { return vsi_nn_internal_optimize_node(self, direction ); @@ -540,6 +559,10 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -711,7 +734,7 @@ static vsi_bool op_set_reduce_axis( for (i = 0; i < self->nn_param.reduce.axis_num; i++) { vx_int32 current_axis = self->nn_param.reduce.axis[i] < 0 ? \ - inputs[0]->attr.dim_num + self->nn_param.reduce.axis[i] : self->nn_param.reduce.axis[i]; + (int32_t)inputs[0]->attr.dim_num + self->nn_param.reduce.axis[i] : self->nn_param.reduce.axis[i]; if (current_axis < 0 || current_axis >= (vx_int32)inputs[0]->attr.dim_num) { @@ -822,16 +845,20 @@ static vsi_bool op_set_sp_reduce_internal int32_t axes_num = self->nn_param.reduce.local2->axes_num; int32_t i = 0, j = 0, index = 0; vsi_size_t reduce_size = 1; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp( self ); memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, inputs[0]->attr.dim_num * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create buffer failed", final); for ( i = 0; i < axes_num; i++) { @@ -862,11 +889,14 @@ static vsi_bool op_set_sp_reduce_internal vsi_nn_internal_setup_node(self, tmp_inode); new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes, outputs[0]->attr.dim_num); + CHECK_PTR_FAIL_GOTO(new_output, "Create tensor failed", final); + self->nn_param.reduce.local2->reshaped_output = new_output; tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 ); - + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); new_axis = (int32_t *)vsi_nn_internal_new_node_param(tmp_inode, axes_num * sizeof(int32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(new_axis, tmp_inode, "Create buffer failed", final); for (i = 0; i < axes_num; i++) { new_axis[i] = i; @@ -885,11 +915,10 @@ static vsi_bool op_set_sp_reduce_internal tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f / (float)reduce_size; } - vsi_nn_internal_setup_node(self, tmp_inode); + ret = vsi_nn_internal_setup_node(self, tmp_inode); - self->nn_param.reduce.local2->reshaped_output = new_output; - - return TRUE; +final: + return ret; } static vsi_bool op_set_reduce_internal @@ -912,6 +941,8 @@ static vsi_bool op_set_reduce_internal vx_int32 resolved_dim_count = 0; int32_t * axes = self->nn_param.reduce.local2->axes; vx_bool 
is_use_float = vx_false_e; + vsi_bool ret = FALSE; + resolved_dim_count = self->nn_param.reduce.local2->axes_num; if ((VSI_NN_OP_REDUCESUM_INTERNAL == type_name) || (VSI_NN_OP_REDUCEPROD_INTERNAL == type_name)) @@ -975,6 +1006,7 @@ static vsi_bool op_set_reduce_internal } curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, axes, 1, self->nn_param.reduce.keep_dim); if (self->nn_param.reduce.local2->reshaped_input) @@ -1001,9 +1033,11 @@ static vsi_bool op_set_reduce_internal attr.vtl = use_virtual_tensor; attr.is_const = FALSE; tmp_output_tensor[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_output_tensor[0], "Create internal tensor failed", final); re_sizes[axes[0]] = 1; curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, &(axes[0]), 1, vx_true_e); curr->inputs[0] = inputs[POST_PROCESS_INPUT]; @@ -1034,8 +1068,11 @@ static vsi_bool op_set_reduce_internal re_sizes[axes[1]] = 1; new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num); } + CHECK_PTR_FAIL_GOTO(new_output, "Reshape tensor failed", final); + self->nn_param.reduce.local2->reshaped_output = new_output; curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, &(axes[1]), 1, vx_true_e); if (self->nn_param.reduce.local2->reshaped_input) @@ -1047,7 +1084,6 @@ static vsi_bool op_set_reduce_internal curr->inputs[0] = tmp_output_tensor[0]->t; } curr->outputs[0] = new_output; - self->nn_param.reduce.local2->reshaped_output = new_output; vsi_nn_internal_setup_node(self, curr); } else if (3 == resolved_dim_count) @@ -1056,12 +1092,15 @@ static vsi_bool op_set_reduce_internal attr.vtl = use_virtual_tensor; attr.is_const = FALSE; tmp_output_tensor[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_output_tensor[0], "Create internal tensor failed", final); attr.size[axes[1]] = 1; tmp_output_tensor[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tmp_output_tensor[1], "Create internal tensor failed", final); re_sizes[axes[0]] = 1; re_sizes[axes[1]] = 1; curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, &(axes[0]), 1, vx_true_e); curr->inputs[0] = inputs[POST_PROCESS_INPUT]; @@ -1069,6 +1108,7 @@ static vsi_bool op_set_reduce_internal vsi_nn_internal_setup_node( self, curr ); curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, &(axes[1]), 1, vx_true_e); curr->inputs[0] = tmp_output_tensor[0]->t; @@ -1100,6 +1140,7 @@ static vsi_bool op_set_reduce_internal } curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); op_set_reduce_param_value(&(curr->node->nn_param), type_name, &(axes[2]), 1, vx_true_e); if (self->nn_param.reduce.local2->reshaped_input) @@ -1119,7 +1160,10 @@ static vsi_bool op_set_reduce_internal VSILOGE("error: resolved_dim_count is %d\n", resolved_dim_count); return FALSE; } - return TRUE; + + 
ret = TRUE; +final: + return ret; } static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c index 4f5022836..74132f149 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c @@ -57,11 +57,13 @@ static vsi_status op_compute int32_t axis_num = self->nn_param.reduce_mean_internal.axis_num; float scale = self->nn_param.reduce_mean_internal.scale; vsi_enum type = self->nn_param.reduce_mean_internal.type; + int32_t *axis = self->nn_param.reduce_mean_internal.axis; vsi_nn_kernel_param_t * param = NULL; param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32( param, "axis_num", axis_num ); vsi_nn_kernel_param_add_float32( param, "scale", scale ); + vsi_nn_kernel_param_add_str( param, "axis", (const char*)axis ); if (type == VSI_NN_REDUCE_MAX) { @@ -95,6 +97,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c index dd41b6a0e..08e5b9401 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c @@ -91,6 +91,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -102,6 +105,9 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /* TODO: Add code to comput outputs' shape. */ return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c index 062922637..9efd8fca5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c @@ -159,6 +159,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + in1_rank = inputs[0]->attr.dim_num; in2_rank = inputs[1]->attr.dim_num; out_rank = vsi_nn_max( in1_rank, in2_rank ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c index 8c40d429a..6ec9d19af 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_util.h" @@ -46,6 +46,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -56,6 +58,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -84,7 +89,9 @@ static vsi_bool op_setup float max_value = 0; float threshold = 0; uint32_t max_raw = 0; - if( NULL == self ) + vsi_bool ret = FALSE; + + if ( NULL == self ) { return FALSE; } @@ -101,30 +108,35 @@ static vsi_bool op_setup if (alpha == 0 && max_raw == VSI_NN_FLOAT32_INF && threshold == 0) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } else if (alpha == 1.0f && max_value == 1.0f && threshold == -1.0f) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU1, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } else if (alpha == 0 && max_value == 6.0f && threshold == 0) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } else if (alpha == 0.1 && max_value == VSI_NN_FLOAT32_INF && threshold == 0) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LEAKY_RELU, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; } else { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU_KERAS_INTERNAL, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; curr->node->nn_param.relu_keras_internal.max_value = max_value; @@ -132,9 +144,10 @@ static vsi_bool op_setup curr->node->nn_param.relu_keras_internal.threshold = threshold; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); - return TRUE; +final: + return ret; } #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c index 2a77c5c99..96d760e39 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c @@ -158,8 +158,32 @@ static vsi_status op_compute param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32( param, "axis", axis ); - n = vsi_nn_kernel_selector( self->graph, "repeat", - tmp_inputs, _INPUT_NUM, tmp_output, _OUTPUT_NUM, param ); + + if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + vsi_nn_tensor_t* temp_tensors = NULL; + vsi_nn_tensor_attr_t attr; + VSILOGW("repeat is no_range_change operation! 
\ + Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); + + memcpy( &attr, &tmp_output[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &tmp_inputs[0]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + + vsi_nn_kernel_selector( self->graph, "repeat", + tmp_inputs, _INPUT_NUM, &temp_tensors, _OUTPUT_NUM, param ); + + n = vxTensorCopyNode( self->graph->g, temp_tensors->t, tmp_output[0]->t); + vsi_safe_release_tensor(temp_tensors); + } + else + { + n = vsi_nn_kernel_selector( self->graph, "repeat", + tmp_inputs, _INPUT_NUM, tmp_output, _OUTPUT_NUM, param ); + } + if ( n != NULL ) { self->n = (vx_node)n; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index e1cfdaa69..523eeb46a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -124,7 +124,8 @@ static vsi_bool op_setup uint32_t i = 0; for (i = 0; i < self->nn_param.reshape.dim_num; i++) { - shape[i] = -1 == self->nn_param.reshape.size[i] ? -1 : (vsi_size_t)self->nn_param.reshape.size[i]; + shape[i] = (uint32_t)-1 == self->nn_param.reshape.size[i] ? \ + (vsi_size_t)-1 : (vsi_size_t)self->nn_param.reshape.size[i]; } ret = vsi_nn_CalcReshapeTensor(inputs[0], outputs[0], diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index 002b39be5..1a719af73 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -43,6 +43,7 @@ #include "vsi_nn_log.h" #include "vsi_nn_internal_node.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -83,7 +84,7 @@ static vsi_status op_compute } else { - char kernel_name[128]; + char kernel_name[128] = {0}; vsi_nn_kernel_param_t * param = NULL; int32_t align_corners = self->nn_param.resize.align_corners; int32_t half_pixel_centers = self->nn_param.resize.half_pixel_centers; @@ -156,6 +157,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -171,6 +175,7 @@ static vsi_bool op_setup float factor = self->nn_param.resize.factor; vsi_enum layout = self->nn_param.resize.layout; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -220,13 +225,14 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.resize_internal.align_corners = self->nn_param.resize.align_corners; curr->node->nn_param.resize_internal.factor = self->nn_param.resize.factor; curr->node->nn_param.resize_internal.half_pixel_centers = self->nn_param.resize.half_pixel_centers; curr->node->nn_param.resize_internal.layout = self->nn_param.resize.layout; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num)) { @@ -234,12 +240,18 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); + } + else + { + ret = TRUE; } - return TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c index c05ec675a..d1b499ec7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c @@ -34,6 +34,7 @@ #include "vsi_nn_tensor.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" /* Declare number of input and output. 
@@ -71,6 +72,9 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + status = vsi_nn_internal_compute_node( self ); return status; @@ -102,6 +106,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return TRUE; } /* op_check() */ @@ -114,6 +121,7 @@ static vsi_bool op_setup { float factor = self->nn_param.resize_1d.factor; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -135,36 +143,40 @@ static vsi_bool op_setup { vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize_1d.type) { vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_1D_BILINEAR_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.resize_1d_bilinear_internal.align_corners = self->nn_param.resize_1d.align_corners; curr->node->nn_param.resize_1d_bilinear_internal.factor = self->nn_param.resize_1d.factor; curr->node->nn_param.resize_1d_bilinear_internal.half_pixel_centers = \ self->nn_param.resize_1d.half_pixel_centers; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize_1d.type) { vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_1D_NEAREST_INTERNAL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.resize_1d_nearest_internal.align_corners = self->nn_param.resize_1d.align_corners; curr->node->nn_param.resize_1d_nearest_internal.factor = self->nn_param.resize_1d.factor; curr->node->nn_param.resize_1d_nearest_internal.half_pixel_centers = \ self->nn_param.resize_1d.half_pixel_centers; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } - return TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_init @@ -172,6 +184,8 @@ static vsi_status op_init vsi_nn_node_t* self ) { + VSI_UNREFERENCED(self); + return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c index 66ea066ed..5b37e89a8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_bilinear_internal.c @@ -135,7 +135,7 @@ static vsi_status op_init vsi_nn_node_t* self ) { - + VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c index edddc1a27..b202f8ca3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d_nearest_internal.c @@ -134,6 +134,7 @@ static vsi_status op_init vsi_nn_node_t* self ) { + 
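/* Review note (annotation, not part of the patch): VSI_UNREFERENCED marks a
 * deliberately unused parameter so builds that warn on unused parameters stay
 * clean; op_init/op_check callbacks keep the common signature even when they
 * do nothing with their arguments. The macro is defined in the project
 * headers; a typical definition would be something like
 *
 *     #define VSI_UNREFERENCED(param)  ((void)(param))
 *
 * which is only an assumption about its form, not a quote of the header. */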
VSI_UNREFERENCED(self); return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_3d.c new file mode 100644 index 000000000..989bb1b70 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_3d.c @@ -0,0 +1,334 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util.h" + +typedef struct _resize_3d_local_data_t { + int32_t placeholder; +} resize_3d_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t * reshape_inputs[1] = {NULL}; + vsi_nn_tensor_t * reshape_outputs[1] = {NULL}; + + if ( self->nn_param.resize_3d.lcl_data->use_internal_node ) + { + status = vsi_nn_internal_compute_node( self ); + } + else + { + char kernel_name[128]; + vsi_nn_kernel_param_t * param = NULL; + int32_t align_corners = self->nn_param.resize_3d.align_corners; + int32_t half_pixel_centers = self->nn_param.resize_3d.half_pixel_centers; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t new_rank = 4; + uint32_t i = 0; + + if (inputs[0]->attr.dim_num > 3) + { + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1]; + shapes[0][2] = inputs[0]->attr.size[2]; + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + shapes[1][2] = outputs[0]->attr.size[2]; + shapes[0][3] = 1; + shapes[1][3] = 1; + + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + shapes[0][3] = shapes[0][3] * inputs[0]->attr.size[i]; + } + shapes[1][3] = shapes[0][3]; + + reshape_inputs[0] = vsi_nn_reshape_tensor(self->graph, inputs[0], shapes[0], new_rank); + reshape_outputs[0] = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes[1], new_rank); + + if (reshape_inputs[0] == NULL || reshape_outputs[0] == NULL) + { + VSILOGE("reshape tensor failed"); + status = VSI_FAILURE; + goto final; + } + } + else + { + reshape_inputs[0] = inputs[0]; + reshape_outputs[0] = outputs[0]; + } + + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); + vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); + vsi_nn_kernel_param_add_int32( param, "type", self->nn_param.resize_3d.type ); + + switch (self->nn_param.resize_3d.type) + { + case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR: + snprintf(kernel_name, sizeof(kernel_name), + "resize_3d_nearest"); + break; + case VSI_NN_INTERPOLATION_BILINEAR: + snprintf(kernel_name, sizeof(kernel_name), + "resize_3d_bilinear"); + break; + default: + break; + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, &reshape_inputs[0], 1, &reshape_outputs[0], 1, param ); + + if (self->n) { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release(¶m); + } + +final: + vsi_safe_release_tensor( reshape_inputs[0] ); + vsi_safe_release_tensor( reshape_outputs[0] ); + + return status; +} /* op_compute() */ + +static vsi_bool _is_same_shape + ( + vsi_nn_tensor_t * inputs, + vsi_size_t *sizes, + uint32_t dims + ) +{ + uint32_t i = 0; + + if (inputs->attr.dim_num != dims) + return FALSE; + + for (i = 0; i < dims; i++) + { + if (sizes[i] != inputs->attr.size[i]) + return FALSE; + } + + return TRUE; +} + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + if ( self->nn_param.resize_3d.lcl_data->use_internal_node ) + { + return vsi_nn_internal_optimize_node(self, direction ); + } + else + { + int32_t half_pixel_centers = self->nn_param.resize_3d.half_pixel_centers; + vsi_size_t * input_size = inputs[0]->attr.size; + vsi_size_t * output_size = outputs[0]->attr.size; + + if ( (output_size[0] % input_size[0] == 0) && (output_size[1] % input_size[1] == 0) && + half_pixel_centers == TRUE && self->nn_param.resize_3d.type == 
VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR ) + { + self->nn_param.resize_3d.half_pixel_centers = FALSE; + } + + return VSI_SUCCESS; + } +} /* op_optimize() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(RESIZE_3D, 1, 1) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + END_IO_TYPE_DECL(RESIZE_3D) + if (!VALIDATE_OP_IO_TYPES(RESIZE_3D, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + float factor = self->nn_param.resize_3d.factor; + vsi_nn_internal_node_t* curr = NULL; + uint32_t i = 0; + vsi_bool ret = TRUE; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + if (factor != 0) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + outputs[0]->attr.size[2] = (uint32_t)(inputs[0]->attr.size[2] * factor); + } + else + { + outputs[0]->attr.size[0] = self->nn_param.resize_3d.size[0]; + outputs[0]->attr.size[1] = self->nn_param.resize_3d.size[1]; + outputs[0]->attr.size[2] = self->nn_param.resize_3d.size[2]; + } + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + + if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num)) + { + self->nn_param.resize.lcl_data->use_internal_node = TRUE; + vsi_nn_internal_init_node_wksp( self ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + ret = vsi_nn_internal_setup_node(self, curr); + } + +final: + return ret; +} /* op_setup() */ + +static vsi_status op_init(vsi_nn_node_t* self) { + vsi_status status = VSI_SUCCESS; + + self->nn_param.resize_3d.lcl_data = + (vsi_nn_resize_3d_local_data*)malloc(sizeof(vsi_nn_resize_3d_local_data)); + if (NULL == self->nn_param.resize_3d.lcl_data) { + VSILOGE("Create resize_3d local data fail."); + status = VSI_FAILURE; + goto final; + } + memset(self->nn_param.resize_3d.lcl_data, 0, sizeof(vsi_nn_resize_3d_local_data)); + + self->nn_param.resize_3d.align_corners = FALSE; + self->nn_param.resize_3d.half_pixel_centers = FALSE; + + +final: + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + if (self->nn_param.resize_3d.lcl_data->use_internal_node) + { + vsi_nn_safe_free(self->nn_param.resize_3d.lcl_data); + vsi_nn_internal_deinit_node_wksp(self); + } + else + { + vsi_nn_safe_free(self->nn_param.resize_3d.lcl_data); + vsi_nn_op_common_deinit(self); + } + + return VSI_SUCCESS; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar 
*/ +DEF_OP_REG + ( + /* op_name */ RESIZE_3D, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c index 50924672f..1a9ad7d77 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c @@ -36,6 +36,7 @@ #include "vsi_nn_log.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -183,7 +184,7 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_init_node_wksp(self); @@ -201,21 +202,26 @@ static vsi_bool op_setup attr.vtl = TRUE; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_REVERSE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = output_tensor->t; curr->node->nn_param.reverse.axis = self->nn_param.reverse.axis; curr->node->nn_param.reverse.axis_num = self->nn_param.reverse.axis_num; - vsi_nn_internal_setup_node(self, curr); + ret &= vsi_nn_internal_setup_node(self, curr); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = output_tensor->t; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret &= vsi_nn_internal_setup_node(self, curr); } return ret; +final: + return FALSE; } /* op_setup() */ #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c index 38df1523b..2632ed652 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnn.c @@ -89,41 +89,43 @@ static vsi_bool op_check ) { uint32_t input_idx = 0; - do { + { vsi_bool break_early = FALSE; // input_idx = 0 : inputs[0].shape = shape(batch_size, input_size) - if (input_idx >= self->input.num) break; + if (input_idx >= self->input.num) goto continue_point; break_early = (inputs[input_idx]->attr.dim_num != 2); - if (break_early) break; + if (break_early) goto continue_point; input_idx ++; // input_idx = 1 : inputs[1].shape = shape(num_units, input_size) - if (input_idx >= self->input.num) break; + if (input_idx >= self->input.num) goto continue_point; break_early = (inputs[input_idx]->attr.dim_num != 2); - if (break_early) break; + if (break_early) goto continue_point; input_idx ++; // input_idx = 2 : inputs[2].shape = shape(num_units, num_units) - if (input_idx >= self->input.num) break; + if (input_idx >= self->input.num) goto continue_point; break_early = (inputs[input_idx]->attr.dim_num != 2); - if (break_early) break; + if (break_early) goto continue_point; input_idx ++; // input_idx = 3 : inputs[3].shape = shape(num_units) - if (input_idx >= self->input.num) break; + if (input_idx >= self->input.num) goto continue_point; break_early = (inputs[input_idx]->attr.dim_num != 1); - if (break_early) break; + if (break_early) goto continue_point; input_idx ++; // input_idx = 4 : inputs[4].shape = shape(batch_size, 
num_units) - if (input_idx >= self->input.num) break; + if (input_idx >= self->input.num) goto continue_point; break_early = (inputs[input_idx]->attr.dim_num != 2); - if (break_early) break; + if (break_early) goto continue_point; input_idx ++; return TRUE; - } while(0); + } + +continue_point: { BEGIN_IO_TYPE_DECL(RNN, 5, 1) @@ -155,6 +157,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { outputs[0]->attr.size[0] = inputs[4]->attr.size[0]; outputs[0]->attr.size[1] = inputs[4]->attr.size[1]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c index a5f82613a..b2c254fd9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c @@ -46,6 +46,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -56,6 +58,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -68,6 +73,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -98,6 +105,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[RNNCELL_INPUT_H_STATE] = output_tensor->t; } @@ -108,6 +116,7 @@ static vsi_bool setup_op_shapes memcpy( &attr.dtype, &outputs[RNNCELL_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); attr.vtl = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[RNNCELL_OUTPUT_H_STATE] = output_tensor->t; } @@ -131,7 +140,10 @@ static vsi_bool setup_op_shapes outputs[RNNCELL_OUTPUT_OUTPUT]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } + return TRUE; +final: + return FALSE; } static vsi_bool op_setup @@ -207,6 +219,7 @@ static vsi_bool op_setup inputs[RNNCELL_INPUT_BIAS_I], &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs, "Create internal tensor failed", final); if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL) { aux_input_gate_fc_outputs = vsi_nn_rnn_create_tp_fc(self, @@ -215,6 +228,7 @@ static vsi_bool op_setup NULL, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(aux_input_gate_fc_outputs, "Create internal tensor failed", final); } } else @@ -225,6 +239,7 @@ static vsi_bool op_setup &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[RNNCELL_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, @@ -233,9 +248,11 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); /* transpose and reshape output */ input_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, 
p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_gate_fc_outputs, "Create internal tensor failed", final); if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL) { /* reshape and transpose input */ @@ -245,6 +262,8 @@ static vsi_bool op_setup input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[RNNCELL_INPUT_AUX_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); + tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, inputs[RNNCELL_INPUT_AUX_INPUT], @@ -252,10 +271,13 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); + /* transpose and reshape output */ aux_input_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(aux_input_gate_fc_outputs, "Create internal tensor failed", final); } } @@ -268,6 +290,7 @@ static vsi_bool op_setup inputs[RNNCELL_INPUT_BIAS_H], &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs, "Create internal tensor failed", final); } else { @@ -277,6 +300,7 @@ static vsi_bool op_setup hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[RNNCELL_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_input_tensor, "Create internal tensor failed", final); tmp = vsi_nn_rnn_create_nn_fc(self, hstate_input_tensor->t, @@ -285,9 +309,12 @@ static vsi_bool op_setup kernel_h, kernel_w, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tmp, "Create internal tensor failed", final); + /* transpose and reshape output */ hstate_gate_fc_outputs = vsi_nn_rnn_process_output_for_nn_fc(self, tmp->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(hstate_gate_fc_outputs, "Create internal tensor failed", final); } input_add_hstate_outputs = vsi_nn_rnn_create_tensor_add(self, @@ -295,14 +322,22 @@ static vsi_bool op_setup hstate_gate_fc_outputs->t, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(input_add_hstate_outputs, "Create internal tensor failed", final); if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL) { + if (aux_input_gate_fc_outputs == NULL || + input_add_hstate_outputs == NULL) + { + return FALSE; + } + gate_fc_outputs = vsi_nn_rnn_create_tensor_add(self, input_add_hstate_outputs->t, aux_input_gate_fc_outputs->t, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(gate_fc_outputs, "Create internal tensor failed", final); } else { @@ -311,6 +346,7 @@ static vsi_bool op_setup /* activation */ curr = vsi_nn_internal_new_node( self, vsi_nn_rnn_get_act_op_type(p->activation), 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.tanh.scale_a = 1.0; curr->node->nn_param.tanh.scale_b = 1.0; curr->inputs[0] = gate_fc_outputs->t; @@ -320,12 +356,15 @@ static vsi_bool op_setup if (outputs[RNNCELL_OUTPUT_H_STATE] != NULL) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = outputs[RNNCELL_OUTPUT_OUTPUT]; curr->outputs[0] = outputs[RNNCELL_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); } return 
TRUE; +final: + return FALSE; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c index 12668f0b5..f97dd1c07 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c @@ -157,6 +157,8 @@ static vsi_status op_optimize uint32_t dim; vx_tensor rois_tmp; + VSI_UNREFERENCED(outputs); + rois_tmp = NULL; if( direction == VSI_NN_OPTIMIZE_FORWARD && inputs[1]->attr.dim_num == 2 ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c index 87a714451..6d607b488 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c @@ -37,7 +37,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_link_list.h" #include "utils/vsi_nn_dtype_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) @@ -49,6 +49,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -59,6 +61,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -71,18 +76,20 @@ static vsi_bool op_setup ) { vsi_nn_internal_node_t* curr = NULL; - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp( node ); curr = vsi_nn_internal_new_node( node, VSI_NN_OP_A_TIMES_B_PLUS_C, node->input.num, node->output.num ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->inputs[1] = inputs[1]; curr->inputs[2] = inputs[2]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(node, curr); + ret = vsi_nn_internal_setup_node(node, curr); +final: return ret; } /* op_setup() */ @@ -94,6 +101,9 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c index 99f8e4056..a6e6c8ead 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c @@ -129,6 +129,8 @@ static vsi_bool op_setup uint32_t i = 0; uint32_t indices_dims = inputs[1]->attr.dim_num; + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c index d8c9842e1..462a2cad9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c @@ -30,10 +30,11 @@ #include "vsi_nn_prv.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" -#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util_prv.h" #define _INPUT_NUM (2) #define _OUTPUT_NUM (1) @@ -75,7 +76,32 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); 
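/* Review note (annotation, not part of the patch): scalar attributes are handed
 * to the kernel selector through the vsi_nn_kernel_param dictionary. The life
 * cycle used here and in the other ops touched by this patch is:
 *
 *     vsi_nn_kernel_param_t * param = vsi_nn_kernel_param_create();
 *     vsi_nn_kernel_param_add_int32( param, "block_size", block_size );
 *     vsi_nn_kernel_param_add_int32( param, "coord_dim",  coord_dim );
 *     vsi_nn_kernel_param_add_int32( param, "idx_num",    idx_num );
 *     n = vsi_nn_kernel_selector( self->graph, "scatter_nd",
 *             inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
 *     vsi_nn_kernel_param_release( &param );
 *
 * The change below additionally detects the case where inputs[1] and
 * outputs[0] share a data type but not the same quantization parameters; it
 * then writes the kernel result into a temporary virtual tensor and appends a
 * vxTensorCopyNode() to convert into outputs[0], mirroring the repeat op
 * earlier in this diff. */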
vsi_nn_kernel_param_add_int32( param, "coord_dim", coord_dim ); vsi_nn_kernel_param_add_int32( param, "idx_num", idx_num ); - n = vsi_nn_kernel_selector( self->graph, "scatter_nd", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + + if (vsi_nn_is_same_data_type(inputs[1], outputs[0]) == FALSE || + vsi_nn_is_same_quant_type(inputs[1], outputs[0])) + { + n = vsi_nn_kernel_selector( self->graph, "scatter_nd", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + } + else + { + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* temp_tensors = NULL; + + VSILOGW("scatter_nd is no_range_change operation! \ + Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!"); + + memcpy( &attr, &outputs[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &inputs[1]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors = vsi_nn_CreateTensor( self->graph, &attr ); + + vsi_nn_kernel_selector( self->graph, "scatter_nd", inputs, _INPUT_NUM, &temp_tensors, _OUTPUT_NUM, param ); + n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t); + + vsi_safe_release_tensor(temp_tensors); + } + if ( n != NULL ) { self->n = (vx_node)n; @@ -134,6 +160,8 @@ static vsi_bool op_setup uint32_t i = 0; vsi_nn_scatter_nd_param * p = &(self->nn_param.scatter_nd); + VSI_UNREFERENCED(inputs); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { if (p->shape == NULL) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c index 63900eb98..e3e19ade7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c @@ -141,6 +141,8 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ uint32_t i = 0; + VSI_UNREFERENCED(self); + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c index 485dcd5ef..7efc8c767 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c @@ -48,68 +48,15 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; - vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; - vsi_size_t* shapes_ptr[_IO_NUM]; - vsi_size_t* shapes_in[_INPUT_NUM]; - vsi_size_t rank_in[_INPUT_NUM]; - uint32_t new_rank = 0; - int32_t i = 0; - vsi_bool ret = FALSE; - vsi_nn_context_t ctx = NULL; if ( NULL == self ) { return VSI_FAILURE; } - ctx = self->graph->ctx; - - for (i = 0; i < _IO_NUM; i++) - { - shapes_ptr[i] = shapes[i]; - } - - for (i = 0; i < _INPUT_NUM; i++) - { - shapes_in[i] = inputs[i]->attr.size; - rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num; - } - - ret = vsi_nn_kernel_optimize_broadcast_shape( - (const vsi_size_t**)shapes_in, rank_in, _INPUT_NUM, - outputs[0]->attr.size, outputs[0]->attr.dim_num, - shapes_ptr, shapes[_INPUT_NUM], &new_rank); - - if ( ret && !ctx->config.support_stream_processor ) - { - for (i = 0; i < _INPUT_NUM; i++) - { - reshape_tensors[i] = vsi_nn_reshape_tensor( self->graph, - inputs[i], shapes[i], new_rank ); - } - - for (i = 0; i < _OUTPUT_NUM; i++) - { - reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( self->graph, - outputs[i], shapes[i + _INPUT_NUM], new_rank ); - } - - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select", - &reshape_tensors[0], _INPUT_NUM, - &reshape_tensors[_INPUT_NUM], _OUTPUT_NUM, NULL ); - - for (i = 0; i < _IO_NUM; i++) - { - vsi_safe_release_tensor( reshape_tensors[i] ); - } - } - else - { - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select", - inputs, _INPUT_NUM, - outputs, _OUTPUT_NUM, NULL ); - } + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, NULL ); if ( self->n ) { @@ -247,6 +194,8 @@ static vsi_bool op_setup vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; + VSI_UNREFERENCED(self); + in0_rank = inputs[0]->attr.dim_num; in1_rank = inputs[1]->attr.dim_num; in2_rank = inputs[2]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c index 500e6761e..dc54ba7ad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c @@ -100,6 +100,14 @@ static vsi_bool op_check IO_TYPE(D_I32, D_I32, D_F32) IO_TYPE(D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_NONE, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_NONE, D_F16) + IO_TYPE(D_F16, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_I32, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_I32, D_NONE, D_BOOL8) END_IO_TYPE_DECL(SEQUENCE_MASK) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c index bb41e98ad..f922b8d16 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c @@ -37,6 +37,7 @@ #include "kernel/vsi_nn_kernel.h" 
#include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _ARG_NUM (3) #define _INPUT_NUM (1) @@ -136,6 +137,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); if (self->input.num > 1) { return VSI_SUCCESS; @@ -153,9 +156,10 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_nn_slice_param * p; + vsi_nn_slice_param * p = NULL; vsi_nn_internal_node_t* curr = NULL; - uint32_t i; + uint32_t i = 0; + vsi_bool ret = FALSE; if (self->nn_param.slice.dims == 0) { @@ -187,6 +191,7 @@ static vsi_bool op_setup } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims; curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims; @@ -199,9 +204,10 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.new_axis_mask = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); - return TRUE; +final: + return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c index c81639929..27431a73f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c @@ -37,6 +37,7 @@ #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_constraint_check.h" #include "vsi_nn_tensor_util_prv.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -45,6 +46,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -123,6 +126,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); if (VSI_NN_OPTIMIZE_BACKWARD == direction) { return VSI_SUCCESS; @@ -174,7 +179,9 @@ static vsi_bool op_setup ) { vsi_nn_internal_node_t* curr = NULL; - if( NULL == self ) + vsi_bool ret = FALSE; + + if ( NULL == self ) { return FALSE; } @@ -202,13 +209,15 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp(self); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_SOFTMAX_INTERNAL, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; curr->node->nn_param.softmax_internal.beta = self->nn_param.softmax.beta; curr->node->nn_param.softmax_internal.axis = self->nn_param.softmax.axis; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); - return TRUE; +final: + return ret; } #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index 0dbe88c87..0d85eb13e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -45,6 +45,8 @@ static vsi_bool _need_split_softmax ) { vsi_bool ret = FALSE; + VSI_UNREFERENCED(self); + if(inputs[0]->attr.dim_num == 2 && inputs[0]->attr.size[1] > MAX_SOFTMAX_BATCH) { ret = TRUE; @@ -250,6 +252,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + 
VSI_UNREFERENCED(outputs); //TODO: Check tensor shapes. return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c index d6e201e5b..71615e740 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c @@ -35,8 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "libnnext/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" +#include "vsi_nn_error.h" #include "vsi_nn_test.h" #include "utils/vsi_nn_constraint_check.h" @@ -103,6 +102,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); if (self->nn_param.space2depth.block_size[0] != self->nn_param.space2depth.block_size[1]) { return vsi_nn_internal_optimize_node(self, direction ); @@ -142,12 +143,13 @@ static vsi_bool op_set_space2depth_internal vsi_nn_op_t type_name ) { - vsi_bool retn = TRUE; + vsi_bool retn = FALSE; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, type_name, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.space2depth_internal.block_size_x = self->nn_param.space2depth.block_size[0]; curr->node->nn_param.space2depth_internal.block_size_y = @@ -156,6 +158,7 @@ static vsi_bool op_set_space2depth_internal curr->outputs[0] = outputs[0]; retn = vsi_nn_internal_setup_node(self, curr); +final: return retn; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c index 9810b2c09..65dc6de93 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c @@ -37,6 +37,7 @@ #include "utils/vsi_nn_link_list.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -45,6 +46,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -178,9 +181,9 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool ret; - uint32_t i, num; - vsi_size_t average; + vsi_bool ret = FALSE; + uint32_t i = 0, num = 0; + vsi_size_t average = 1; vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; uint32_t axis = self->nn_param.split.axis; @@ -189,8 +192,6 @@ static vsi_bool op_setup vsi_nn_split_param * p = NULL; vsi_nn_internal_node_t* curr = NULL; - ret = TRUE; - average = 1; /* compute the output tensor number */ num = (uint32_t)(self->output.num - 1); while ( NULL == outputs[num] ) @@ -237,6 +238,7 @@ static vsi_bool op_setup p->lcl_data->end_dims[j] = (int32_t)end[j]; } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims; curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims; @@ -249,10 +251,12 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.new_axis_mask = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[i]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); } return ret; 
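/* Review note (annotation, not part of the patch): SPLIT is lowered to one
 * STRIDED_SLICE internal node per output. Every dimension other than the
 * split axis keeps its full [0, size) range, while a running offset advances
 * along the axis, roughly:
 *
 *     start[axis] = offset;
 *     end[axis]   = start[axis] + outputs[i]->attr.size[axis];
 *     offset      = end[axis];
 *     // begin/end/stride are then copied into p->lcl_data->*_dims and wired
 *     // into curr->node->nn_param.strided_slice before setup.
 *
 * The added final: label below preserves the FALSE return when
 * vsi_nn_internal_new_node() fails, consistent with the other ops in this
 * patch. */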
+final: + return FALSE; } /* op_setup() */ static vsi_status op_init @@ -309,28 +313,12 @@ static vsi_status op_deinit p = &(self->nn_param.split); - if (p->lcl_data->begin_dims) - { - free(p->lcl_data->begin_dims); - p->lcl_data->begin_dims = NULL; - } - - if (p->lcl_data->end_dims) - { - free(p->lcl_data->end_dims); - p->lcl_data->end_dims = NULL; - } - - if (p->lcl_data->stride_dims) - { - free(p->lcl_data->stride_dims); - p->lcl_data->stride_dims = NULL; - } - - if (p->lcl_data) + if (p && p->lcl_data) { - free(p->lcl_data); - p->lcl_data = NULL; + vsi_nn_safe_free(p->lcl_data->begin_dims); + vsi_nn_safe_free(p->lcl_data->end_dims); + vsi_nn_safe_free(p->lcl_data->stride_dims); + vsi_nn_safe_free(p->lcl_data); } vsi_nn_internal_deinit_node_wksp( self ); @@ -346,6 +334,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c index 3609aad4f..4e0a5e566 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c @@ -35,6 +35,7 @@ #include "kernel/vsi_nn_kernel.h" #include "vsi_nn_tensor_util.h" #include "vsi_nn_internal_node.h" +#include "vsi_nn_error.h" /* Declare number of input and output. @@ -49,6 +50,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -106,6 +109,7 @@ static vsi_bool op_setup vsi_bool shouldSqueeze[VSI_NN_MAX_DIM_NUM] = {FALSE}; uint32_t numDimsSqueezed = 0; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -122,7 +126,7 @@ static vsi_bool op_setup { int32_t rank = self->nn_param.squeeze.axis[i]; - rank = rank < 0 ? rank + inputs[0]->attr.dim_num : rank; + rank = rank < 0 ? 
rank + (int32_t)inputs[0]->attr.dim_num : rank; if ( !shouldSqueeze[rank] ) { @@ -145,13 +149,15 @@ static vsi_bool op_setup vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = outputs[0]->attr.size; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); - return TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_deinit @@ -172,6 +178,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c index 9b59d9920..d59c6f5d1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c @@ -37,7 +37,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_link_list.h" #include "utils/vsi_nn_dtype_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #define _ARG_NUM (1) #define _INPUT_NUM VSI_NN_STACK_MAX_INPUTS @@ -53,6 +53,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -63,6 +65,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -84,7 +89,7 @@ static vsi_bool op_setup vsi_nn_internal_node_t* curr = NULL; vsi_nn_tensor_t *output_rs = NULL; vsi_nn_stack_lcl_data * data = NULL; - vsi_bool ret = TRUE; + vsi_bool ret = FALSE; vx_int8 is_scalar = vsi_nn_GetTensorIsScalar(inputs[0]); vsi_nn_internal_init_node_wksp( node ); @@ -122,10 +127,12 @@ static vsi_bool op_setup if (1 == node->input.num) { curr = vsi_nn_internal_new_node( node, VSI_NN_OP_RESHAPE2, 1, 1); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + ret = vsi_nn_internal_setup_node(node, curr); goto final; } @@ -133,17 +140,13 @@ static vsi_bool op_setup input_shape[1] = block_num; curr = vsi_nn_internal_new_node( node, VSI_NN_OP_CONCAT, node->input.num, node->output.num ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); for (i = 0; i < node->input.num; i++) { vsi_nn_tensor_t *input_rs = NULL; /* Malloc ptr */ data = (vsi_nn_stack_lcl_data *)malloc( sizeof(vsi_nn_stack_lcl_data) ); - if( NULL == data ) - { - VSILOGE( "Create stack local data fail." ); - ret = FALSE; - goto final; - } + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(data, curr, "Create buffer failed", final); memset( data, 0, sizeof(vsi_nn_stack_lcl_data) ); input_rs = vsi_nn_reshape_tensor(node->graph, inputs[i], input_shape, 2); @@ -171,16 +174,18 @@ static vsi_bool op_setup /* Malloc ptr */ data = (vsi_nn_stack_lcl_data *)malloc( sizeof(vsi_nn_stack_lcl_data) ); - if( NULL == data ) - { - VSILOGE( "Create stack local data fail." 
); - ret = FALSE; - goto final; - } + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(data, curr, "Create buffer failed", final); memset( data, 0, sizeof(vsi_nn_stack_lcl_data) ); output_rs = vsi_nn_reshape_tensor(node->graph, outputs[0], output_shape, 2); - data->src_in = output_rs; + if (output_rs == NULL) + { + vsi_nn_internal_release_node(&curr); + VSILOGD("Create reshape tensor failed\n"); + vsi_nn_safe_free(data); + goto final; + } + data->src_in = output_rs; /* Store node, ptr */ vsi_nn_LinkListPushStart( (vsi_nn_link_list_t **)&node->nn_param.stack.lcl_data, @@ -188,10 +193,9 @@ static vsi_bool op_setup curr->outputs[0] = output_rs; curr->node->nn_param.concat.axis = axis; + ret = vsi_nn_internal_setup_node(node, curr); final: - vsi_nn_internal_setup_node(node, curr); - return ret; } /* op_setup() */ @@ -203,6 +207,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index 1cf2891ad..ae43c05c8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -749,6 +749,8 @@ static vsi_status op_optimize vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool is_same_quant_type = FALSE; vsi_bool is_same_shape = TRUE; + vsi_size_t input_elements = 0; + vsi_size_t output_elements = 0; /* Only forward run stride_slice's optimize */ if ( direction == VSI_NN_OPTIMIZE_BACKWARD ) @@ -775,38 +777,49 @@ static vsi_status op_optimize VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - if ( NULL == inputs[0]->t ) - { - vsi_nn_TensorReinit( self->graph, inputs[0] ); - } - - /* Create tensor from view */ - memcpy( start, (vsi_size_t*)start_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); - memcpy( end, (vsi_size_t*)stop_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); - in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, inputs[0]); - if ( NULL == in_view_tensor ) - { - VSILOGE( "Create tensor %d from view fail.", i ); - status = VSI_FAILURE; - goto OnError; - } - self->nn_param.strided_slice.lcl2_data->is_optimized = TRUE; is_same_quant_type = _is_same_quant(inputs, outputs); - if ( NULL != outputs[0]->t || is_same_quant_type == FALSE) + input_elements = vsi_nn_GetElementNum( inputs[0] ); + output_elements = vsi_nn_GetElementNum( outputs[0] ); + if (NULL != outputs[0]->t && NULL == inputs[0]->t && + is_same_quant_type && input_elements == output_elements) { - VSILOGI( "stride slice copy tensor."); - // Copy old tensor values to the new address. 
- status = copy_tensor_to_view( self, in_view_tensor, outputs[0], shape, is_same_shape); - if ( VSI_FAILURE == status ) - { - goto OnError; - } + inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t, + (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, + sizeof(inputs[0]->attr.size[0]) ); } else { - outputs[0]->t = in_view_tensor; + if ( NULL == inputs[0]->t ) + { + vsi_nn_TensorReinit( self->graph, inputs[0] ); + } + /* Create tensor from view */ + memcpy( start, (vsi_size_t*)start_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); + memcpy( end, (vsi_size_t*)stop_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); + in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, inputs[0]); + if ( NULL == in_view_tensor ) + { + VSILOGE( "Create tensor %d from view fail.", i ); + status = VSI_FAILURE; + goto OnError; + } + + if ( NULL != outputs[0]->t || is_same_quant_type == FALSE) + { + VSILOGI( "stride slice copy tensor."); + // Copy old tensor values to the new address. + status = copy_tensor_to_view( self, in_view_tensor, outputs[0], shape, is_same_shape); + if ( VSI_FAILURE == status ) + { + goto OnError; + } + } + else + { + outputs[0]->t = in_view_tensor; + } } OnError: @@ -841,32 +854,32 @@ static vsi_status op_deinit vsi_nn_safe_free( params->end_dims ); vsi_nn_safe_free( params->stride_dims ); - if (lcl2_data->cp_node) + if (lcl2_data && lcl2_data->cp_node) { vxReleaseNode( &lcl2_data->cp_node ); } - if (lcl2_data->src_tensor) + if (lcl2_data && lcl2_data->src_tensor) { vxReleaseTensor( &lcl2_data->src_tensor ); } - if (lcl2_data->dst_tensor && !lcl2_data->is_same_shape) + if (lcl2_data && lcl2_data->dst_tensor && !lcl2_data->is_same_shape) { vxReleaseTensor( &lcl2_data->dst_tensor ); } - if (lcl2_data->begin_dims) + if (lcl2_data && lcl2_data->begin_dims) { free(lcl2_data->begin_dims); } - if (lcl2_data->end_dims) + if (lcl2_data && lcl2_data->end_dims) { free(lcl2_data->end_dims); } - if (lcl2_data->stride_dims) + if (lcl2_data && lcl2_data->stride_dims) { free(lcl2_data->stride_dims); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c index b8b4c1e53..080183652 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c @@ -31,6 +31,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" static vsi_status _create_local_tensor ( @@ -129,6 +130,7 @@ static vsi_status op_compute attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; bias_tensor = vsi_nn_CreateTensor(self->graph, &attr); + CHECK_PTR_FAIL_GOTO( bias_tensor, "Create tensor fail.", final ); param.bias = bias_tensor->t; } @@ -145,6 +147,7 @@ static vsi_status op_compute status = VSI_SUCCESS; } +final: if (bias_tensor != NULL) vsi_nn_ReleaseTensor(&bias_tensor); return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c index 812cea379..61a541c79 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c @@ -63,6 +63,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c index 78f350858..ff15f81de 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c @@ -49,7 +49,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VX_FAILURE; + vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_tensor_add_mean_stddev_norm_param * p = NULL; float eps; @@ -113,6 +113,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(node); + /* TODO: Add code to comput outputs' shape. */ if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c index 3098b6cf8..82f104a58 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c @@ -141,6 +141,8 @@ static vsi_bool op_setup vsi_nn_tensorstackconcat_param *p = NULL; int32_t axis = 0; + VSI_UNREFERENCED(outputs); + if ( NULL == self ) { return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c index 647396fdb..b6fb26ec7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c @@ -41,6 +41,30 @@ Declare number of input and output. */ +static vsi_bool _is_supported_axis(vsi_size_t* multiples, vsi_size_t multiples_num) +{ + vsi_size_t i = 0; + + if ( multiples_num < 4) + { + return TRUE; + } + else if ( multiples_num > 4) + { + return FALSE; + } + + for ( i = 3; i < multiples_num; i++) + { + if (multiples[i] > 1) + { + return FALSE; + } + } + + return TRUE; +} + static vsi_status _tile_op_compute ( const char * kernel_name, @@ -49,18 +73,100 @@ static vsi_status _tile_op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_FAILURE; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t new_rank = 0; + vsi_bool ret = FALSE; + vsi_size_t* multiples = (vsi_size_t*)self->nn_param.tile.multiples; + vsi_nn_tensor_t* temp_tensors[2] = { NULL }; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_nn_tensor_attr_t attr; + + if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + VSILOGW("tile is no_range_change operation! 
\ + Insert DataConvert Operation when the quantization parameters\ + of input and output are inconsistent!"); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - kernel_name, - &inputs[0], 1, - &outputs[0], 1, NULL ); + memcpy( &attr, &outputs[0]->attr, sizeof(attr)); + memcpy( &attr.dtype, &inputs[0]->attr.dtype, sizeof(attr.dtype)); + attr.is_const = FALSE; + attr.vtl = TRUE; + temp_tensors[1] = vsi_nn_CreateTensor( self->graph, &attr ); + } + else + { + temp_tensors[1] = outputs[0]; + } - if( self->n ) + ret = vsi_nn_kernel_optimize_tile_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + multiples, inputs[0]->attr.dim_num, + temp_tensors[1]->attr.size, temp_tensors[1]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret) + { + if (_is_supported_axis(shapes[1], new_rank) == FALSE) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\ + shapes[0], (vsi_size_t)new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, temp_tensors[1],\ + shapes[2], (vsi_size_t)new_rank ); + if (reshape_tensors[0] == NULL || reshape_tensors[1] == NULL) + { + VSILOGE("reshape tensor failed!"); + status = VSI_FAILURE; + goto final; + } + + memcpy( &attr, &reshape_tensors[0]->attr, sizeof(attr)); + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = reshape_tensors[1]->attr.size[0]; + attr.size[1] = reshape_tensors[1]->attr.size[1]; + + temp_tensors[0] = vsi_nn_CreateTensor( self->graph, &attr ); + + self->n = (vx_node)vsi_nn_kernel_selector( + self->graph, kernel_name, &reshape_tensors[0], 1, &temp_tensors[0], 1, NULL); + self->n = (vx_node)vsi_nn_kernel_selector( + self->graph, kernel_name, &temp_tensors[0], 1, &reshape_tensors[1], 1, NULL); + + } + else + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\ + shapes[0], (vsi_size_t)new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, temp_tensors[1],\ + shapes[2], (vsi_size_t)new_rank ); + if (reshape_tensors[0] == NULL || reshape_tensors[1] == NULL) + { + VSILOGE("reshape tensor failed!"); + status = VSI_FAILURE; + goto final; + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, kernel_name,\ + &reshape_tensors[0], 1, &reshape_tensors[1], 1, NULL ); + } + } + + if ( self->n ) { status = VSI_SUCCESS; } +final: + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(temp_tensors[0]); + if (vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE) + { + self->n = vxTensorCopyNode( self->graph->g, temp_tensors[1]->t, outputs[0]->t); + vsi_safe_release_tensor(temp_tensors[1]); + } + return status; } /* _tile_op_compute() */ @@ -71,7 +177,7 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - /*TODO: Check tensor shapes. */ + /*TODO: Check tensor shapes. 
*/ vsi_nn_tile_param * p; BEGIN_IO_TYPE_DECL(TILE, 1, 1) @@ -88,6 +194,8 @@ static vsi_bool op_check IO_TYPE(D_I32, D_I32) IO_TYPE(D_U32, D_U32) IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) END_IO_TYPE_DECL(TILE) if (!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c index d797af2cd..ff8c0e0fd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -36,10 +36,59 @@ #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (2) +vsi_nn_tensor_t* _create_permute_node + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t* input_tensor, + vsi_nn_tensor_t* output_tensor, + uint32_t* perm, + uint32_t dim_num, + vsi_bool use_virtual_tensor + ) +{ + vsi_nn_tensor_t* tensor0 = NULL; + vsi_nn_tensor_t *output = NULL; + + if (output_tensor) + { + output = output_tensor; + } + else + { + uint32_t i = 0; + vsi_nn_tensor_attr_t attr; + memcpy(&attr, &input_tensor->attr, sizeof(attr)); + attr.vtl = use_virtual_tensor; + for ( i = 0; i < dim_num; i++ ) + { + attr.size[i] = input_tensor->attr.size[perm[i]]; + } + tensor0 = vsi_nn_CreateTensor( self->graph, &attr ); + CHECK_PTR_FAIL_GOTO( tensor0, "Create tensor fail.", final ); + output = tensor0; + } + self->n = vxTensorPermuteNode( + self->graph->g, + input_tensor->t, + output->t, + perm, + dim_num + ); + if (self->n == NULL) + { + vsi_safe_release_tensor(tensor0); + } + +final: + return tensor0; +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -49,18 +98,122 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + uint32_t rank_out = 0; + int32_t new_axis0 = 0; + int32_t new_axis1 = 0; + int32_t axis = self->nn_param.topk.axis; + int32_t top_k = self->nn_param.topk.k; + vsi_nn_tensor_t * in_tensor = NULL; + vsi_nn_tensor_t * out0_tensor = NULL; + vsi_nn_tensor_t * out1_tensor = NULL; + vsi_bool ret = FALSE; + + ret = vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis0); + + ret = vsi_nn_kernel_optimize_softmax_shape( + outputs[0]->attr.size, outputs[0]->attr.dim_num, axis, + shapes[1], &rank_out, &new_axis1); param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "top_k", self->nn_param.topk.k ); + vsi_nn_kernel_param_add_int32( param, "top_k", top_k ); + + if (ret) + { + uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t perm_out[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_nn_tensor_t* input_tensor = NULL; + vsi_nn_tensor_t* outputs_tensor[2] = {NULL}; + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[1], rank_in ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[1], shapes[1], rank_in ); + + axis = new_axis0; + + if (axis != 0) + { + uint32_t i = 0; + uint32_t index = 0; + + vsi_nn_tensor_attr_t attr0, attr1; + memcpy(&attr0, &reshape_tensors[1]->attr, sizeof(attr0)); + memcpy(&attr1, 
&reshape_tensors[2]->attr, sizeof(attr1)); + + attr0.vtl = TRUE; + attr1.vtl = TRUE; + attr0.size[index] = (vsi_size_t)top_k; + attr1.size[index] = (vsi_size_t)top_k; + perm_in[index ++] = (uint32_t)axis; + for ( i = 0; i < rank_in; i++ ) + { + if ((int32_t)i == axis) + continue; + attr0.size[index] = shapes[1][i]; + attr1.size[index] = shapes[1][i]; + perm_in[index ++] = i; + } + + perm_out[axis] = 0; + for ( i = 1, index = 0; i < rank_in; i++ ) + { + if ((int32_t)index == axis) + { + index ++; + } + perm_out[index ++] = i; + } + + out0_tensor = vsi_nn_CreateTensor( self->graph, &attr0 ); + CHECK_PTR_FAIL_GOTO( out0_tensor, "Create tensor fail.", final ); + out1_tensor = vsi_nn_CreateTensor( self->graph, &attr1 ); + CHECK_PTR_FAIL_GOTO( out1_tensor, "Create tensor fail.", final ); + + in_tensor = _create_permute_node(self, reshape_tensors[0], NULL, perm_in, rank_in, TRUE); + CHECK_PTR_FAIL_GOTO( in_tensor, "Create internal tensor fail.", final ); + + input_tensor = in_tensor; + outputs_tensor[0] = out0_tensor; + outputs_tensor[1] = out1_tensor; + } + else + { + input_tensor = reshape_tensors[0]; + outputs_tensor[0] = reshape_tensors[1]; + outputs_tensor[1] = reshape_tensors[2]; + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk", + &input_tensor, _INPUT_NUM, + outputs_tensor, _OUTPUT_NUM, param ); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk", - inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + if (axis != 0) + { + _create_permute_node(self, outputs_tensor[0], reshape_tensors[1], perm_out, rank_in, TRUE); + _create_permute_node(self, outputs_tensor[1], reshape_tensors[2], perm_out, rank_in, TRUE); + } + } - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } +final: + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[2] ); + vsi_safe_release_tensor( in_tensor ); + vsi_safe_release_tensor( out0_tensor ); + vsi_safe_release_tensor( out1_tensor ); + return status; } /* op_compute() */ @@ -107,29 +260,38 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ uint32_t i; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { vsi_nn_topk_param * p; p = &(self->nn_param.topk); + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - outputs[0]->attr.size[0] = p->k; - for (i = 1; i < inputs[0]->attr.dim_num; i++) + outputs[0]->attr.size[p->axis] = p->k; + for (i = 0; i < inputs[0]->attr.dim_num; i++) { + if ((int32_t)i == p->axis) + { + continue; + } outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } } - if( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) { vsi_nn_topk_param * p; p = &(self->nn_param.topk); outputs[1]->attr.dim_num = inputs[0]->attr.dim_num; - outputs[1]->attr.size[0] = p->k; - for (i = 1; i < inputs[0]->attr.dim_num; i++) + outputs[1]->attr.size[p->axis] = p->k; + for (i = 0; i < inputs[0]->attr.dim_num; i++) { + if ((int32_t)i == p->axis) + { + continue; + } outputs[1]->attr.size[i] = inputs[0]->attr.size[i]; } } @@ -137,6 +299,17 @@ static vsi_bool op_setup return TRUE; } /* op_setup() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.topk.axis = 0; + + return status; +} /* op_init() */ + #ifdef __cplusplus extern "C" { #endif @@ -144,7 +317,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ TOPK, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c index a6d526633..ece932e6e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c @@ -35,9 +35,9 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" +#include "vsi_nn_error.h" static vsi_bool setup_op_shapes ( @@ -80,6 +80,7 @@ static vsi_bool setup_op_shapes attr.is_const = TRUE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); inputs[RNN_INPUT_H_STATE] = output_tensor->t; } @@ -91,6 +92,7 @@ static vsi_bool setup_op_shapes attr.vtl = use_virtual_tensor; attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); outputs[RNN_OUTPUT_H_STATE] = output_tensor->t; } @@ -112,6 +114,8 @@ static vsi_bool setup_op_shapes } return TRUE; +final: + return FALSE; } static vsi_status op_compute @@ -121,6 +125,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -131,6 +137,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. 
*/ return TRUE; } /* op_check() */ @@ -143,6 +152,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -168,6 +179,8 @@ static vsi_bool op_setup vsi_size_t batch_size = 0; vsi_size_t time_step = 0; uint32_t i = 0; + vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -193,21 +206,28 @@ static vsi_bool op_setup /* transpose to time_major */ output_tensor = vsi_nn_rnn_transpose_time_major(self, inputs[RNN_INPUT_INPUT], NULL, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); input_tensor = output_tensor->t; } /* split input tensor */ split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + CHECK_PTR_FAIL_GOTO( split_output_tensors, "Create buffer fail.", final ); memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); rnncell_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); + CHECK_PTR_FAIL_GOTO( rnncell_reshape_output_tensors, "Create buffer fail.", final ); memset( rnncell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, + (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + status = vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); + CHECK_STATUS_FAIL_GOTO(status, final); last_step_h_state = inputs[RNN_INPUT_H_STATE]; + for( i = 0; i < time_step; i++ ) { vsi_nn_tensor_t* reshape_output = NULL; @@ -217,26 +237,30 @@ static vsi_bool op_setup /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); reshape_output = output_tensor->t; /* rnncell output */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[RNN_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_out0 = output_tensor->t; /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, &outputs[RNN_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); rnncell_out1 = output_tensor->t; curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; if ( reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) { - int32_t k = 0; + size_t k = 0; for (k = 0; k < _cnt_of_array( curr_param->internal_dtype ); k++) { if (curr_param->internal_dtype[k].vx_type == VSI_NN_TYPE_NONE) @@ -274,6 +298,7 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &outputs[RNN_OUTPUT_OUTPUT]->attr.dtype, 
use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create internal tensor fail.", final ); tensor = output_tensor->t; } @@ -281,6 +306,7 @@ static vsi_bool op_setup if (outputs[RNN_OUTPUT_H_STATE] != NULL) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = last_step_h_state; curr->outputs[0] = outputs[RNN_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); @@ -288,13 +314,14 @@ static vsi_bool op_setup /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { curr->inputs[i] = rnncell_reshape_output_tensors[i]; } curr->outputs[0] = tensor; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); if( !curr_param->time_major ) { @@ -303,10 +330,11 @@ static vsi_bool op_setup tensor, outputs[RNN_OUTPUT_OUTPUT], use_virtual_tensor); } +final: vsi_nn_safe_free( split_output_tensors ); vsi_nn_safe_free( rnncell_reshape_output_tensors ); - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c index 7e57e3223..35d84a5f8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c @@ -34,8 +34,8 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (VSI_NN_UNSTACK_MAX_OUTPUTS) @@ -47,6 +47,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -58,6 +60,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ @@ -68,6 +72,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); /*TODO: Check tensor shapes. */ return TRUE; } /* op_check() */ @@ -94,6 +101,7 @@ static vsi_bool op_setup uint32_t i = 0, j = 0; uint32_t rank = inputs[0]->attr.dim_num; int8_t is_scalar = (rank - 1) == 0 ? 
TRUE : FALSE; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp( self ); @@ -172,10 +180,13 @@ static vsi_bool op_setup memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); input_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(input_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_input_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_input_size, curr, "Create internal buffer failed", final); reshape_input_size[0] = block_size; reshape_input_size[1] = tensor_num; reshape_input_size[2] = block_num; @@ -186,23 +197,28 @@ static vsi_bool op_setup curr->outputs[0] = input_tensor->t; vsi_nn_internal_setup_node( self, curr ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, tensor_num ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, tensor_num * sizeof(uint32_t)); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, tensor_num ); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(slices, curr, "Create internal buffer failed", final); curr->node->nn_param.split.axis = 1; curr->node->nn_param.split.slices = slices; curr->node->nn_param.split.slices_num = tensor_num; curr->inputs[0] = input_tensor->t; output_tensors = (vsi_nn_internal_tensor_t**)malloc(tensor_num * sizeof(vsi_nn_internal_tensor_t*)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( output_tensors, curr, "Create tensor fail.", final ); + for (i = 0; i < tensor_num; i++) { slices[i] = 1; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &outputs[i]->attr.dtype, use_virtual_tensor); output_tensors[i] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensors[i], "Create internal tensor failed", final); curr->outputs[i] = output_tensors[i]->t; } - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); for (i = 0; i < tensor_num; i++) { @@ -210,10 +226,12 @@ static vsi_bool op_setup output_size = (vsi_size_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(output_size, curr, "Create internal buffer failed", final); memcpy(output_size, outputs[i]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.reshape2.size = output_size; curr->node->nn_param.reshape2.dim_num = outputs[i]->attr.dim_num; curr->inputs[0] = output_tensors[i]->t; @@ -221,9 +239,10 @@ static vsi_bool op_setup vsi_nn_internal_setup_node( self, curr ); } +final: vsi_nn_safe_free(output_tensors); - return TRUE; + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c index 1923b26a6..36bbdbc34 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c @@ -35,7 +35,6 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "ops/vsi_nn_op_upsample.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include 
"utils/vsi_nn_constraint_check.h" @@ -144,17 +143,20 @@ static vsi_status op_compute vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; uint32_t new_rank = 0; - vsi_bool ret; + vsi_bool ret = FALSE; vsi_nn_kernel_param_t * param = NULL; - int32_t scale_x = (int32_t)self->nn_param.upsample.scale[0]; - int32_t scale_y = (int32_t)self->nn_param.upsample.scale[1]; + int32_t scale_x = 0; + int32_t scale_y = 0; if( NULL == self ) { return VSI_FAILURE; } - param =vsi_nn_kernel_param_create(); + scale_x = (int32_t)self->nn_param.upsample.scale[0]; + scale_y = (int32_t)self->nn_param.upsample.scale[1]; + + param = vsi_nn_kernel_param_create(); ret = vsi_nn_upsample_optimize_shape(self, (vsi_ssize_t*)inputs[0]->attr.size, (vsi_ssize_t*)inputs[1]->attr.size, @@ -164,7 +166,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "scale_x", scale_x ); vsi_nn_kernel_param_add_int32( param, "scale_y", scale_y ); - if( ret ) + if ( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], shapes[0], new_rank ); @@ -180,7 +182,7 @@ static vsi_status op_compute vsi_nn_ReleaseTensor( &reshape_tensors[2] ); } - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c index 6bb917586..4b7dd3f61 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" typedef struct _upsamplescale_local_data_t { int32_t placeholder; @@ -56,8 +57,8 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - int32_t stride = self->nn_param.upsamplescale.stride; - float scale = self->nn_param.upsamplescale.scale; + int32_t stride = 0; + float scale = 0; vsi_nn_kernel_param_t * param = NULL; if( NULL == self ) @@ -65,12 +66,15 @@ static vsi_status op_compute return VSI_FAILURE; } + stride = self->nn_param.upsamplescale.stride; + scale = self->nn_param.upsamplescale.scale; + if (stride == 1 || vsi_nn_abs(scale - 1.0f) == _EPSILON) { return vsi_nn_internal_compute_node( self ); } - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32( param, "stride", stride ); vsi_nn_kernel_param_add_float32( param, "scale", scale ); @@ -82,7 +86,7 @@ static vsi_status op_compute vsi_nn_kernel_param_release( ¶m ); - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } @@ -141,6 +145,9 @@ static vsi_status op_optimize int32_t stride = self->nn_param.upsamplescale.stride; float scale = self->nn_param.upsamplescale.scale; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON) { return vsi_nn_internal_optimize_node( self, direction ); @@ -163,30 +170,34 @@ static vsi_bool op_setup float scale = self->nn_param.upsamplescale.scale; int32_t i = 0; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; vsi_nn_internal_init_node_wksp(self); if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (stride == 1) { 
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LINEAR, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.linear.a = scale; curr->node->nn_param.linear.b = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else if (vsi_nn_abs(scale - 1.0f) == _EPSILON) { curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESIZE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->node->nn_param.resize.type = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR; curr->node->nn_param.resize.align_corners = FALSE; curr->node->nn_param.resize.half_pixel_centers = FALSE; @@ -195,7 +206,7 @@ static vsi_bool op_setup curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); } else { @@ -206,9 +217,12 @@ static vsi_bool op_setup outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + + ret = TRUE; } - return TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_init @@ -216,6 +230,8 @@ static vsi_status op_init vsi_nn_node_t* self ) { + VSI_UNREFERENCED(self); + return VSI_SUCCESS; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c index a8a2a7e0b..f4dcb531e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c @@ -44,6 +44,8 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_compute_node( self ); } /* op_compute() */ @@ -69,6 +71,8 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c b/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c index 7d1b9cf09..6e0ec8d03 100644 --- a/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c +++ b/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c @@ -348,7 +348,7 @@ static vx_status resize_binlinear } } - return VX_SUCCESS; + return VSI_SUCCESS; } #endif @@ -455,14 +455,15 @@ static void _convolve_same float *input, uint32_t input_size, double *kernel, - uint32_t kernel_size, + int32_t kernel_size, float *output ) { - uint32_t pad,pad_input_size; - uint32_t i,k,offset; + uint32_t pad = 0, pad_input_size = 0; + uint32_t i = 0, offset = 0; + int32_t k = 0; float *pad_input = NULL; - double sum; + double sum = 0; uint32_t pad_input_sizef,input_sizef; if(NULL == input || NULL == kernel || NULL == output) @@ -536,6 +537,9 @@ static void set_cols ) { uint32_t w; + + VSI_UNREFERENCED(height); + if(NULL == data || cols == NULL) { return ; @@ -947,6 +951,7 @@ static vsi_nn_con_candidate_t *_get_connection_candidate { con_candidate = (vsi_nn_con_candidate_t *) vsi_nn_LinkListNewNode(sizeof(vsi_nn_con_candidate_t), _init_candidate); + CHECK_PTR_FAIL_GOTO( con_candidate, "null point.", final ); sum++; con_candidate->data.i = i; @@ -963,6 +968,8 @@ static vsi_nn_con_candidate_t *_get_connection_candidate } *candidate_sum = sum; + +final: return con_candidate_list; } @@ -1276,6 +1283,8 @@ static vsi_nn_subset_t *_compute_subset vsi_nn_subset_t *subset_list = NULL, *subset = NULL; uint32_t *deleteIdx = NULL; + 
VSI_UNREFERENCED(all_connection_num); + if(NULL == all_connection || NULL == candidate || NULL == special_k || @@ -1319,6 +1328,8 @@ static vsi_nn_subset_t *_compute_subset { sig_subset= (vsi_nn_subset_t *) vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)subset_list, j); + CHECK_PTR_FAIL_GOTO( sig_subset, "null point.", final ); + if(sig_subset->data.idx[indexA] == partAs[i] || sig_subset->data.idx[indexB] == partBs[i]) { @@ -1338,6 +1349,8 @@ static vsi_nn_subset_t *_compute_subset int32_t ii = partBs[i]; sig_connect = (vsi_nn_connection_t *) vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i); + CHECK_PTR_FAIL_GOTO( sig_connect, "get point fail.", final ); + sig_subset->data.idx[indexB] = (float)ii; sig_subset->data.idx[20 - 1] += 1; sig_subset->data.idx[20 - 2] += @@ -1362,6 +1375,8 @@ static vsi_nn_subset_t *_compute_subset vsi_nn_subset_t *j2_iter = j2_subset; sig_connect = (vsi_nn_connection_t *) vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i); + CHECK_PTR_FAIL_GOTO( sig_connect, "get point fail.", final ); + for(ii=0; ii<(20-2); ii++) { j1_iter->data.idx[ii] += j2_iter->data.idx[ii] + 1; @@ -1380,6 +1395,8 @@ static vsi_nn_subset_t *_compute_subset int32_t ii = partBs[i]; sig_connect = (vsi_nn_connection_t *) vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i); + CHECK_PTR_FAIL_GOTO( sig_connect, "get point fail.", final ); + sum = candidate[ii].score + sig_connect->data.score; j1_subset->data.idx[indexB] = (float)ii; j1_subset->data.idx[20 - 1] += 1; @@ -1413,7 +1430,7 @@ static vsi_nn_subset_t *_compute_subset subset = (vsi_nn_subset_t *) vsi_nn_LinkListNewNode(sizeof(vsi_nn_subset_t), _init_subset); - + CHECK_PTR_FAIL_GOTO( subset, "null point.", final ); memcpy(&subset->data, row, sizeof(float) * 20); vsi_nn_LinkListPushEnd( @@ -1433,6 +1450,7 @@ static vsi_nn_subset_t *_compute_subset memset(deleteIdx, -1, sizeof(uint32_t) * num); subset = subset_list; + CHECK_PTR_FAIL_GOTO( subset, "null point.", final ); for(i=0,j=0; idata.idx[20 - 1]; @@ -1445,28 +1463,13 @@ static vsi_nn_subset_t *_compute_subset } for(i=0; idata.idx[i]); - } - subset = (vsi_nn_subset_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)subset); - n++; - } - #endif - final: if(deleteIdx)free(deleteIdx); return subset_list; @@ -1499,6 +1502,7 @@ static vsi_nn_connection_t **_compute_all_connetion score_mid = (float *)malloc(sizeof(float) * height * width * score_mid_depth); CHECK_PTR_FAIL_GOTO( score_mid, "Create buffer fail.", final ); connection_all = (vsi_nn_connection_t **)malloc(sizeof(vsi_nn_connection_t *) * mapIdx_len); + CHECK_PTR_FAIL_GOTO( connection_all, "Create buffer fail.", final ); special_k = (int32_t *)malloc(sizeof(int32_t) * mapIdx_len); CHECK_PTR_FAIL_GOTO( special_k, "Create buffer fail.", final ); @@ -1836,6 +1840,7 @@ vsi_status vsi_nn_CMUPose_Post_Process _fill_paf_avg(net_out, config, paf_avg); all_peaks = _compute_all_peaks(heatmap_avg, config, &peak_counter, &peak_list_num); + CHECK_PTR_FAIL_GOTO( all_peaks, "Create buffer fail.", final ); #if 0 for(n=0; nnode_num; i++) { node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)i ); //printf("i[%u] op[%s]\n", i, vsi_nn_OpGetName(node->op)); - if(node->op == VSI_NN_OP_PROPOSAL) + if (node && node->op == VSI_NN_OP_PROPOSAL) { memcpy(¶m->iminfo, &node->nn_param.proposal.im_info, sizeof(vsi_nn_proposal_im_info)); tensor = vsi_nn_GetTensor(graph,node->output.tensors[0]); + CHECK_PTR_FAIL_GOTO( tensor, "Get tensor fail.", final ); + param->rois_num = (uint32_t)tensor->attr.size[1]; } } @@ -164,6 
+165,7 @@ static vsi_status _fill_fasterrcnn_param param->classes_num = VSI_NN_FASTERRCNN_CLASSES_NUM; param->classes = FASTER_RCNN_CLASSES; +final: return status; } /* _fill_fasterrcnn_param() */ @@ -572,6 +574,7 @@ static vsi_status _fasterrcnn_post_process { box = (vsi_nn_fasterrcnn_box_t *) vsi_nn_LinkListNewNode(sizeof(vsi_nn_fasterrcnn_box_t), _init_box); + CHECK_PTR_FAIL_GOTO( box, "Create box fail.", final ); box->score = dets[keep[k]*5+4]; box->class_id = i; box->x1 = dets[keep[k]*5+0]; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c b/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c index 85d862d23..27a3c45c7 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_binary_tree.c @@ -34,8 +34,11 @@ static vsi_nn_binary_tree_t * _new_node node = (vsi_nn_binary_tree_t *)malloc( sizeof( vsi_nn_binary_tree_t ) ); + if (node) + { + memset( node, 0, sizeof( vsi_nn_binary_tree_t ) ); + } - memset( node, 0, sizeof( vsi_nn_binary_tree_t ) ); return node; } /* _new_node() */ @@ -181,7 +184,7 @@ void vsi_nn_BinaryTreeRemoveNode vsi_nn_binary_tree_key_t key ) { - if( NULL == root && NULL != *root ) + if ( NULL != root ) { return; } diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 4ce42c95e..d696e8cd5 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -465,6 +465,7 @@ static _op_param_gen_t s_op_gen[] = /* INVERSE_SIGMOID */ NULL, /* GRID_SAMPLE */ NULL, /* LPNORM */ NULL, + /* RESIZE_3D */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); @@ -548,6 +549,10 @@ void vsi_nn_GenGraphCCode node_id = i; } node = vsi_nn_GetNode( graph, node_id ); + if (node == NULL) + { + continue; + } _write_code( "node[%u] = vsi_nn_AppendNode( graph, %#x, NULL );", i, node->op ); for( j = 0; j < node->input.num; j ++ ) @@ -567,7 +572,7 @@ void vsi_nn_GenGraphCCode } } // write node params - if( node->op < _cnt_of_array( s_op_gen ) ) + if( node->op < (vsi_nn_op_t)_cnt_of_array( s_op_gen ) ) { if( NULL != s_op_gen[node->op] ) { diff --git a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c index 95f5cc7fb..22ab7bb47 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c @@ -77,6 +77,8 @@ static const char* _get_qtype_name(vsi_nn_qnt_type_e type) case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: return "ASYM"; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: return "SYM"; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: return "SYMM PC"; + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: return "FP8"; + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: return "FP8 PC"; default: VSILOGE("Unknown quant type: %d\n", type); break; @@ -162,7 +164,9 @@ vsi_bool validate_op_io_types { vsi_bool matched = FALSE; - if(self && self->attr.enable_op_constraint_check) { + VSI_UNREFERENCED(name); + + if(self && self->attr.enable_op_constraint_check && op_constraint_reg) { uint32_t i = 0; int32_t j = 0; int32_t reg_tensor_num = op_constraint_reg->reg_input_num + op_constraint_reg->reg_output_num; @@ -218,14 +222,20 @@ char* generate_op_io_types_desc char* desc = NULL; for(i = 0; i < inputs_num; i++) { - if(inputs[i]) { + if (inputs[i] && + _get_qtype_name(inputs[i]->attr.dtype.qnt_type) && + _get_dtype_name(inputs[i]->attr.dtype.vx_type)) + { total_sz += 
snprintf(NULL, 0, "%s %s, ", _get_qtype_name(inputs[i]->attr.dtype.qnt_type), _get_dtype_name(inputs[i]->attr.dtype.vx_type)); } } for(i = 0; i < outputs_num; i++) { - if(outputs[i]) { + if (outputs[i] && + _get_qtype_name(outputs[i]->attr.dtype.qnt_type) && + _get_dtype_name(outputs[i]->attr.dtype.vx_type)) + { total_sz += snprintf(NULL, 0, "%s %s, ", _get_qtype_name(outputs[i]->attr.dtype.qnt_type), _get_dtype_name(outputs[i]->attr.dtype.vx_type)); @@ -234,17 +244,24 @@ char* generate_op_io_types_desc total_sz += 1; /* terminator */ desc = (char*)malloc(sizeof(char) * total_sz); + CHECK_PTR_FAIL_GOTO( desc, "Create buffer fail.", final ); memset(desc, 0x00, sizeof(char) * total_sz); for(i = 0; i < inputs_num; i++) { - if(inputs[i] && total_sz >= used_sz) { + if (inputs[i] && total_sz >= used_sz && + _get_qtype_name(inputs[i]->attr.dtype.qnt_type) && + _get_dtype_name(inputs[i]->attr.dtype.vx_type)) + { used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ", _get_qtype_name(inputs[i]->attr.dtype.qnt_type), _get_dtype_name(inputs[i]->attr.dtype.vx_type)); } } for(i = 0; i < outputs_num; i++) { - if(outputs[i] && total_sz >= used_sz) { + if (outputs[i] && total_sz >= used_sz && + _get_qtype_name(outputs[i]->attr.dtype.qnt_type) && + _get_dtype_name(outputs[i]->attr.dtype.vx_type)) + { used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ", _get_qtype_name(outputs[i]->attr.dtype.qnt_type), _get_dtype_name(outputs[i]->attr.dtype.vx_type)); @@ -255,6 +272,7 @@ char* generate_op_io_types_desc desc[used_sz - 2] = '\0'; } +final: return desc; } diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c b/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c index f64464962..dfabeed95 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c @@ -4,17 +4,22 @@ #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) void * vsi_nn_dlopen( const char *file, int mode ) { + VSI_UNREFERENCED(file); + VSI_UNREFERENCED(mode); return NULL; } int vsi_nn_dlclose( void *handle ) { + VSI_UNREFERENCED(handle); return -1; } __declspec(noinline) void* vsi_nn_dlsym( void *handle, const char *name ) { + VSI_UNREFERENCED(handle); + VSI_UNREFERENCED(name); return NULL; } diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index 18575b716..ac4aa2ab1 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -116,6 +116,92 @@ static VSI_INLINE_API void _convert_float_to_bfloat16 } } /* _convert_float_to_bfloat16 */ +static VSI_INLINE_API vsi_bool _convert_quant_float8_e4m3_to_float + ( + const uint8_t * buffer, + size_t size, + const float scale, + float * out_buffer + ) +{ + uint32_t i = 0; + if( !buffer || !out_buffer ) + { + return FALSE; + } + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = fp8_e4m3_to_fp32( (uint8_t)buffer[i], scale ); + } + + return TRUE; +} /* _convert_quant_float8_e4m3_to_float */ + +static VSI_INLINE_API vsi_bool _convert_float_to_quant_float8_e4m3 + ( + const float * buffer, + size_t size, + const float scale, + uint8_t * out_buffer + ) +{ + uint32_t i = 0; + if( !buffer || !out_buffer ) + { + return FALSE; + } + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = fp32_to_fp8_e4m3( buffer[i], scale ); + } + + return TRUE; +} /* _convert_float_to_quant_float8_e4m3 */ + +static VSI_INLINE_API vsi_bool _convert_quant_float8_e5m2_to_float + ( + const uint8_t * buffer, + size_t size, + const float scale, + float * 
out_buffer + ) +{ + uint32_t i = 0; + + if( !buffer || !out_buffer ) + { + return FALSE; + } + + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = fp8_e5m2_to_fp32( (uint8_t)buffer[i], scale ); + } + + return TRUE; +} /* _convert_quant_float8_e5m2_to_float */ + +static VSI_INLINE_API vsi_bool _convert_float_to_quant_float8_e5m2 + ( + const float * buffer, + size_t size, + const float scale, + uint8_t * out_buffer + ) +{ + uint32_t i = 0; + if( !buffer || !out_buffer ) + { + return FALSE; + } + for( i = 0; i < size; i ++ ) + { + out_buffer[i] = fp32_to_fp8_e5m2( buffer[i], scale ); + } + + return TRUE; +} /* _convert_float_to_quant_float8_e5m2 */ + #define DEF_DTYPE_CONVERT_QUANTIZE( SRC_NAME, SRC_DTYPE, ROUND, MIN, MAX ) \ vsi_bool vsi_nn_dtype_convert_quantize_##SRC_NAME##_to_float \ ( \ @@ -177,6 +263,15 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel int8_t * out_buffer ) { + VSI_UNREFERENCED(size); + VSI_UNREFERENCED(shape); + VSI_UNREFERENCED(rank); + VSI_UNREFERENCED(scale); + VSI_UNREFERENCED(scale_size); + VSI_UNREFERENCED(zero_point); + VSI_UNREFERENCED(zero_point_size); + VSI_UNREFERENCED(channel_dim); + if( !buffer || !out_buffer ) { return FALSE; @@ -195,6 +290,15 @@ vsi_bool vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float float * out_buffer ) { + VSI_UNREFERENCED(size); + VSI_UNREFERENCED(shape); + VSI_UNREFERENCED(rank); + VSI_UNREFERENCED(scale); + VSI_UNREFERENCED(scale_size); + VSI_UNREFERENCED(zero_point); + VSI_UNREFERENCED(zero_point_size); + VSI_UNREFERENCED(channel_dim); + if( !buffer || !out_buffer ) { return FALSE; @@ -270,6 +374,12 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm case I8: return vsi_nn_dtype_convert_float_to_quantize_symm8( buffer, size, scale, zero_point, (int8_t*)out_buffer ); + case FP8_E4M3: + return _convert_float_to_quant_float8_e4m3( + buffer, size, scale, (uint8_t*)out_buffer ); + case FP8_E5M2: + return _convert_float_to_quant_float8_e5m2( + buffer, size, scale, (uint8_t*)out_buffer ); case I16: return vsi_nn_dtype_convert_float_to_quantize_symm16( buffer, size, scale, zero_point, (int16_t*)out_buffer ); @@ -423,6 +533,12 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float case U8: return vsi_nn_dtype_convert_quantize_asymm8_to_float( (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); + case FP8_E4M3: + return _convert_quant_float8_e4m3_to_float( + (const uint8_t *)buffer, size, scale, out_buffer ); + case FP8_E5M2: + return _convert_quant_float8_e5m2_to_float( + (const uint8_t *)buffer, size, scale, out_buffer ); case U16: return vsi_nn_dtype_convert_quantize_asymm16_to_float( (const uint16_t*)buffer, size, scale, zero_point, out_buffer); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c index 6547f463a..07249e7c4 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c @@ -408,12 +408,15 @@ vsi_bool vsi_nn_QuantCheck VSILOGE("input_fl[%d] + weight_fl[%d] != bias_fl[%d]", input->attr.dtype.fl, weight->attr.dtype.fl, - bias->attr.dtype.fl); + bias ? 
bias->attr.dtype.fl : 0); } break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: - if (weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: + if (weight->attr.dtype.qnt_type == + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC || + weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { ret = vsi_nn_QuantAffinePerchannelCheck(input, weight, bias); if(ret == FALSE) @@ -429,7 +432,7 @@ vsi_bool vsi_nn_QuantCheck VSILOGE("input_scale[%.12lf] * weight_scale[%.12lf] != bias_scale[%.12lf]", input->attr.dtype.scale, weight->attr.dtype.scale, - bias->attr.dtype.scale); + bias ? bias->attr.dtype.scale : 0); } } break; @@ -468,6 +471,7 @@ vsi_bool vsi_nn_DtypeCompare break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: { const float diff = (float)1e-5; if (dtype0->zero_point != dtype1->zero_point) @@ -484,6 +488,7 @@ vsi_bool vsi_nn_DtypeCompare } case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: { const float diff = (float)1e-5; int32_t i = 0; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c b/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c index b576fc1e6..8a8288d86 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_hashmap.c @@ -47,7 +47,11 @@ static _binary_tree_t * _new_node node = (_binary_tree_t *)malloc( sizeof( _binary_tree_t ) ); - memset( node, 0, sizeof( _binary_tree_t ) ); + if (node) + { + memset( node, 0, sizeof( _binary_tree_t ) ); + } + return node; } /* _new_node() */ @@ -395,6 +399,7 @@ void vsi_nn_hashmap_add { iter = (vsi_nn_hashmap_item_t *)vsi_nn_LinkListNewNode( sizeof( vsi_nn_hashmap_item_t ), NULL ); + VSI_ASSERT( iter ); key_size = strlen( hash_key ) + 1; iter->hash_key = (char*)malloc( sizeof(char) * key_size ); VSI_ASSERT( iter->hash_key ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_link_list.c b/src/tim/vx/internal/src/utils/vsi_nn_link_list.c index 053e6e9b5..a2401aaf3 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_link_list.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_link_list.c @@ -27,6 +27,7 @@ #include "vsi_nn_prv.h" #include "utils/vsi_nn_link_list.h" #include "vsi_nn_types.h" +#include "vsi_nn_error.h" static vsi_nn_link_list_t * _walk_to_start ( @@ -239,6 +240,7 @@ vsi_nn_link_list_t * vsi_nn_LinkListNewNode ) { vsi_nn_link_list_t *node = (vsi_nn_link_list_t *)malloc(sz); + CHECK_PTR_FAIL_GOTO( node, "Create node fail.", final ); memset(node, 0, sz); if(init) @@ -246,6 +248,7 @@ vsi_nn_link_list_t * vsi_nn_LinkListNewNode init(node); } +final: return node; } /* vsi_nn_LinkListNewNode() */ diff --git a/src/tim/vx/internal/src/utils/vsi_nn_math.c b/src/tim/vx/internal/src/utils/vsi_nn_math.c index b2aae0586..260646da9 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_math.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_math.c @@ -360,8 +360,11 @@ struct r123array4x32 _philox4x32round(struct r123array4x32 ctr, struct r123array uint32_t hi1; uint32_t lo0 = mulhilo32(PHILOX_M4x32_0, ctr.v[0], &hi0); uint32_t lo1 = mulhilo32(PHILOX_M4x32_1, ctr.v[2], &hi1); - struct r123array4x32 out = {{hi1^ctr.v[1]^key.v[0], lo1, - hi0^ctr.v[3]^key.v[1], lo0}}; + struct r123array4x32 out = { { 0, 0, 0, 0 } }; + out.v[0] = hi1^ctr.v[1]^key.v[0]; + out.v[1] = lo1; + out.v[2] = hi0^ctr.v[3]^key.v[1]; + out.v[3] = lo0; return out; 
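+    /* The four output words above are now filled in by explicit assignments after
+     * zero-initialization instead of a compound initializer; the computed values
+     * (hi1^ctr.v[1]^key.v[0], lo1, hi0^ctr.v[3]^key.v[1], lo0) are unchanged. */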
} diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index e6a766feb..82d1aaaf1 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -306,7 +306,7 @@ vsi_size_t vsi_nn_GetStrideSizeBySize type_bits = vsi_nn_TypeGetBits( type); stride[0] = type_bits / BITS_PER_BYTE; total_bytes = stride[0]; - if( type_bits < BITS_PER_BYTE ) + if( type_bits < BITS_PER_BYTE && type_bits != 0 ) { total_bytes = 1; if( size[0] % (BITS_PER_BYTE / type_bits) == 0 ) @@ -375,6 +375,8 @@ float vsi_nn_DataAsFloat32 val = (float)((int8_t*)data)[0]; break; case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_FLOAT8_E4M3: + case VSI_NN_TYPE_FLOAT8_E5M2: val = (float)data[0]; break; case VSI_NN_TYPE_INT16: @@ -600,6 +602,8 @@ void vsi_nn_ComputePadWithPadType vsi_size_t * out_pad ) { + VSI_UNREFERENCED(in_dim_num); + VSI_UNREFERENCED(rounding); vsi_nn_compute_padding(in_shape, ksize, stride, NULL, pad_type, out_pad); } /* vsi_nn_ComputePadWithPadType() */ @@ -651,6 +655,8 @@ void vsi_nn_ComputePadWithPadTypeForConv1D vsi_size_t * out_pad ) { + VSI_UNREFERENCED(in_dim_num); + VSI_UNREFERENCED(rounding); vsi_nn_compute_padding_conv1d(in_shape, ksize, stride, NULL, pad_type, out_pad); } /* vsi_nn_ComputePadWithPadTypeForConv1D() */ @@ -708,9 +714,10 @@ vsi_bool vsi_nn_CreateTensorGroup vsi_size_t end[VSI_NN_MAX_DIM_NUM]; vsi_nn_tensor_attr_t attr; - if( NULL == graph || NULL == in_tensor + if ( NULL == graph || NULL == in_tensor || NULL == out_tensors || 0 == group_number - || 0 == in_tensor->attr.size[axis] ) + || axis >= VSI_NN_MAX_DIM_NUM || + 0 == in_tensor->attr.size[axis] ) { VSILOGW( "Create tensor group fail." ); return FALSE; @@ -733,13 +740,14 @@ vsi_bool vsi_nn_CreateTensorGroup end[2] = in_tensor->attr.size[2]; end[3] = in_tensor->attr.size[3]; end[axis] = 0; - for( i = 0; i < group_number; i ++ ) { start[axis] = end[axis]; end[axis] += sz; #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT - if ( attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC ) + if (attr.dtype.qnt_type == + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC || + attr.dtype.qnt_type == VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { attr.dtype.scales = in_tensor->attr.dtype.scales + sz * i; attr.dtype.scale_dim = (int32_t)sz; @@ -835,6 +843,7 @@ int32_t vsi_nn_Mkdir int32_t mode ) { + VSI_UNREFERENCED(mode); if(NULL == path) { return -1; @@ -906,6 +915,10 @@ uint8_t * vsi_nn_MallocAlignedBuffer sz = sizeof(aligned_header) + mem_size + align_start_size + align_block_size + END_GUARD_SIZE; raw_addr = (uint8_t *)malloc( sz * sizeof( uint8_t ) ); + if (raw_addr == NULL) + { + return NULL; + } memset(raw_addr, 0, sizeof( uint8_t ) * sz); p = raw_addr + sizeof(aligned_header); @@ -1175,6 +1188,7 @@ vsi_bool vsi_nn_is_same_quant_type( break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: { const float diff = (float)1e-5; if (src_dtype->zero_point != dst_dtype->zero_point) @@ -1190,6 +1204,7 @@ vsi_bool vsi_nn_is_same_quant_type( } case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: { const float diff = (float)1e-5; int32_t i = 0; @@ -1340,6 +1355,7 @@ float vsi_nn_get_tensor_scale break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: scale = tensor->attr.dtype.scale; break; default: @@ -1359,6 +1375,7 @@ 
int32_t vsi_nn_get_tensor_zero_point switch (tensor->attr.dtype.qnt_type) { case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: zero_point = 0; break; case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: @@ -1408,6 +1425,14 @@ void vsi_nn_get_tensor_clamp_min_max *clampMin = - zero_point; *clampMax = 65535 - zero_point; } + else if (vx_type == VSI_NN_TYPE_FLOAT8_E4M3) { + *clampMin = -448; + *clampMax = 448; + } + else if (vx_type == VSI_NN_TYPE_FLOAT8_E5M2) { + *clampMin = -57344; + *clampMax = 57344; + } else { uint32_t f32_min = 0xff800000; diff --git a/src/tim/vx/internal/src/vip/virtual_device.cpp b/src/tim/vx/internal/src/vip/virtual_device.cpp index 88a146a83..2efa849cc 100644 --- a/src/tim/vx/internal/src/vip/virtual_device.cpp +++ b/src/tim/vx/internal/src/vip/virtual_device.cpp @@ -30,7 +30,7 @@ namespace vip { Device::Device(uint32_t id) { id_ = id; graphqueue_ = std::make_unique (); - worker_ = std::make_unique ();; + worker_ = std::make_unique (); ThreadInit(); } @@ -63,6 +63,9 @@ bool Device::ThreadExit() { bool Device::GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data) { bool status = false; + idle_mtx_.lock(); + submit_num_++; + idle_mtx_.unlock(); status = graphqueue_->Submit(graph, func, data); return status; } @@ -72,8 +75,10 @@ bool Device::GraphRemove(const vsi_nn_graph_t* graph) { } void Device::WaitThreadIdle() { - ThreadExit(); - ThreadInit(); + std::unique_lock lock(idle_mtx_); + while (submit_num_ > 0) { + cv_.wait(lock); + } } Worker::Worker() { @@ -108,6 +113,11 @@ void Device::HandleQueue() { break; } worker_->Handle(item); // run graph + + idle_mtx_.lock(); + submit_num_--; + idle_mtx_.unlock(); + cv_.notify_one(); } } diff --git a/src/tim/vx/internal/src/vip/virtual_device_private.h b/src/tim/vx/internal/src/vip/virtual_device_private.h index ed4c6bb68..b0e39a0cc 100644 --- a/src/tim/vx/internal/src/vip/virtual_device_private.h +++ b/src/tim/vx/internal/src/vip/virtual_device_private.h @@ -28,8 +28,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -99,6 +99,9 @@ class Device { std::array threads_; std::unique_ptr graphqueue_; std::unique_ptr worker_; + std::condition_variable cv_; + std::mutex idle_mtx_; + int submit_num_ = 0; }; } // namespace vip diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index 7d7636fd1..99a5e7938 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -151,6 +151,13 @@ static vsi_status vsi_nn_initOptions options->enable_stream_processor = atoi(env_s); } + env_s = NULL; + options->enable_rgb88_planar_nhwc = 0; + if (vsi_nn_getEnv("VSI_NN_FORCE_RGB888_OUT_NHWC", &env_s) && env_s) + { + options->enable_rgb88_planar_nhwc = atoi(env_s); + } + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/vsi_nn_daemon.c b/src/tim/vx/internal/src/vsi_nn_daemon.c index a5b279712..4887368ef 100644 --- a/src/tim/vx/internal/src/vsi_nn_daemon.c +++ b/src/tim/vx/internal/src/vsi_nn_daemon.c @@ -28,11 +28,13 @@ _INITIALIZER( daemon_start ) { + //VSILOGD("OVXLIB init ... "); vsi_nn_kernel_backend_init(); } /* _daemon_start() */ _DEINITIALIZER( daemon_shutdown ) { + //VSILOGD("OVXLIB shutdown ... 
"); vsi_nn_kernel_backend_deinit(); } /* vsi_nn_daemen_shutdown() */ diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index bbfdabcba..c9eed9cd8 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -194,10 +194,10 @@ static vsi_status update_max_node_io vsi_nn_node_id_t *node_list ) { - uint32_t i,max_io; - vsi_status status; + uint32_t i = 0,max_io = 0; + vsi_status status = VSI_FAILURE; vsi_nn_node_id_t node_id; - vsi_nn_node_t *node; + vsi_nn_node_t *node = NULL; status = VSI_SUCCESS; max_io = VSI_NN_MAX_IO_NUM; /* default max node io */ @@ -205,11 +205,12 @@ static vsi_status update_max_node_io { node_id = node_list[i]; node = vsi_nn_GetNode( graph, node_id ); - if(node->input.num > max_io) + + if (node && node->input.num > max_io) { max_io = node->input.num; } - if(node->output.num > max_io) + if (node && node->output.num > max_io) { max_io = node->output.num; } @@ -250,6 +251,8 @@ static vsi_status optimize_node_backward /* Get inputs, outputs. */ node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + vsi_nn_GetTensors( graph, node->input.tensors, node->input.num, inputs ); vsi_nn_GetTensors( graph, node->output.tensors, @@ -301,6 +304,8 @@ static vsi_status optimize_node_forward /* Get inputs, outputs. */ node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + vsi_nn_GetTensors( graph, node->input.tensors, node->input.num, inputs ); vsi_nn_GetTensors( graph, node->output.tensors, @@ -353,6 +358,8 @@ static vsi_status compute_node /* Get inputs, outputs. */ node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + vsi_nn_GetTensors( graph, node->input.tensors, node->input.num, inputs ); vsi_nn_GetTensors( graph, node->output.tensors, @@ -458,6 +465,8 @@ static vsi_status setup_node /* Get inputs, outputs. */ node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + vsi_nn_GetTensors( graph, node->input.tensors, node->input.num, inputs ); vsi_nn_GetTensors( graph, node->output.tensors, @@ -525,6 +534,8 @@ static vsi_status set_graph_precision memset( outputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); /* Get inputs, outputs. 
*/ node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + vsi_nn_GetTensors( graph, node->input.tensors, node->input.num, inputs ); vsi_nn_GetTensors( graph, node->output.tensors, @@ -560,6 +571,9 @@ vsi_nn_graph_t * vsi_nn_CreateGraph vsi_nn_graph_t * graph; graph = NULL; + VSI_UNREFERENCED(max_tensor_num); + VSI_UNREFERENCED(max_node_num); + VSILOGI( "%s", vsi_nn_GetVersion() ); if( NULL == ctx ) @@ -1002,6 +1016,70 @@ vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle return _add_tensor(graph, id, attr, data); } +vsi_nn_tensor_id_t vsi_nn_AddTensorFromView +( + vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t parent_id, + vsi_size_t* start, + vsi_size_t* end +) +{ + uint32_t i = 0; + vx_tensor view_vxt = NULL; + vsi_nn_tensor_t* parent_tensor = NULL; + vsi_nn_tensor_t* new_tensor =NULL; + vsi_nn_tensor_id_t id = VSI_NN_TENSOR_ID_NA; + vsi_nn_tensor_attr_t attr; + + memset(&attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); + parent_tensor = vsi_nn_GetTensor(graph, parent_id); + if (NULL == parent_tensor) + { + VSILOGE("Create view tensor failed, parent tensor is invalid."); + id = VSI_NN_TENSOR_ID_NA; + goto final; + } + + /* new tensor's all attribuites are inherited from parent tensor except 'size' */ + attr = parent_tensor->attr; + for (i = 0; i < attr.dim_num; i++) + { + attr.size[i] = end[i] - start[i]; + } + id = _add_tensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL); + if (VSI_NN_TENSOR_ID_NA == id) + { + VSILOGE("Create view tensor failed, new tensor could not be created."); + goto final; + } + + new_tensor = vsi_nn_GetTensor(graph, id); + if (new_tensor && new_tensor->t) + { + vxReleaseTensor(&(new_tensor->t)); + } + else + { + VSILOGE("Create view tensor failed, new tensor or vxTensor is NULL."); + id = VSI_NN_TENSOR_ID_NA; + goto final; + } + + view_vxt = vsi_nn_CreateViewTensor(graph, start, end, parent_tensor); + if ( NULL != view_vxt) + { + new_tensor->t = view_vxt; + } + else + { + VSILOGE("Create view tensor failed, view vxTensor could not be created."); + id = VSI_NN_TENSOR_ID_NA; + goto final; + } +final: + return id; +} + vsi_nn_tensor_id_t vsi_nn_AttachTensorToGraph ( vsi_nn_graph_t * graph, @@ -1184,6 +1262,8 @@ vsi_nn_node_t * vsi_nn_AddExternalNode vsi_nn_node_id_t id; vsi_nn_op_proc_t * node_proc; + VSI_UNREFERENCED(node_id); + node_proc = (vsi_nn_op_proc_t*)proc; if( NULL == graph ) @@ -1210,12 +1290,25 @@ vsi_nn_node_t * vsi_nn_AddExternalNode node->output.num = node_proc->output_num; node->output.tensors = (vsi_nn_tensor_id_t *) malloc( node_proc->output_num * sizeof( vsi_nn_tensor_id_t ) ); + if ( NULL == node->output.tensors ) + { + VSILOGE("Create output tensor id %s. fail", vsi_nn_OpGetName(op)); + vsi_nn_safe_free(node); + return NULL; + } vsi_nn_InitTensorsId( node->output.tensors, node_proc->output_num ); /* init input struct */ node->input.num = node_proc->input_num; node->input.tensors = (vsi_nn_tensor_id_t *) malloc( node_proc->input_num * sizeof( vsi_nn_tensor_id_t ) ); + if ( NULL == node->input.tensors ) + { + VSILOGE("Create input tensor id %s. 
fail", vsi_nn_OpGetName(op)); + vsi_nn_safe_free(node->output.tensors); + vsi_nn_safe_free(node); + return NULL; + } vsi_nn_InitTensorsId( node->input.tensors, node_proc->input_num ); node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; node->attr.enable_op_constraint_check = TRUE; @@ -1259,11 +1352,16 @@ vsi_bool vsi_nn_SetGraphInputs vsi_bool ret; ret = FALSE; - if( NULL == graph || tensor_num == 0 ) + if( NULL == graph ) { return ret; } + if ( tensor_num == 0 ) + { + return TRUE; + } + graph->input.tensors = (vsi_nn_tensor_id_t *)malloc( tensor_num * sizeof( vsi_nn_tensor_id_t ) ); @@ -1317,10 +1415,10 @@ vsi_nn_node_id_t * vsi_nn_SortGraphNode vsi_nn_graph_t * graph ) { - uint32_t i,j; - uint32_t count; - vsi_bool dirty; - vsi_bool all_tensor_processed; + uint32_t i = 0,j = 0; + uint32_t count = 1; + vsi_bool dirty = TRUE; + vsi_bool all_tensor_processed = FALSE; vsi_bool * tensors = NULL; vsi_nn_node_id_t * nodes = NULL; vsi_nn_node_id_t * sorted_nodes = NULL; @@ -1344,21 +1442,18 @@ vsi_nn_node_id_t * vsi_nn_SortGraphNode /* Init variables. */ tensors = (vsi_bool *)malloc( graph->tensor_num * sizeof( vsi_bool ) ); - - if( NULL == tensors ) - { - goto _SortGraphNodeFinally; - } + CHECK_PTR_FAIL_GOTO( tensors, "Create buffer fail.", final ); + memset(tensors, 0, graph->tensor_num * sizeof( vsi_bool )); sorted_nodes = (vsi_nn_node_id_t *)malloc( graph->node_num * sizeof( vsi_nn_node_id_t ) ); + CHECK_PTR_FAIL_GOTO( sorted_nodes, "Create buffer fail.", final ); + memset(sorted_nodes, 0, graph->node_num * sizeof( vsi_nn_node_id_t )); + nodes = (vsi_nn_node_id_t *)malloc( graph->node_num * sizeof( vsi_nn_node_id_t ) ); - - if( NULL == sorted_nodes || NULL == nodes) - { - goto _SortGraphNodeFinally; - } + CHECK_PTR_FAIL_GOTO( nodes, "Create buffer fail.", final ); + memset(sorted_nodes, 0, graph->node_num * sizeof( vsi_nn_node_id_t )); for( i = 0; i < graph->tensor_num; i++ ) { @@ -1396,6 +1491,8 @@ vsi_nn_node_id_t * vsi_nn_SortGraphNode { node_id = nodes[i]; node = vsi_nn_GetNode( graph, node_id ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + all_tensor_processed = TRUE; for( j = 0; j < node->input.num; j ++ ) { @@ -1439,17 +1536,17 @@ vsi_nn_node_id_t * vsi_nn_SortGraphNode } } while( count > 0 ); - if( count != 0 ) +final: + + /* Release memory. */ + vsi_nn_safe_free( tensors ); + vsi_nn_safe_free( nodes ); + + if ( count != 0 ) { - free( sorted_nodes ); - sorted_nodes = NULL; + vsi_nn_safe_free( sorted_nodes ); } -_SortGraphNodeFinally: - - /* Release memory. 
*/ - free( tensors ); - free( nodes ); return sorted_nodes; } /* vsi_nn_SortGraphNode() */ @@ -1479,7 +1576,8 @@ uint32_t vsi_nn_GetNodesByUids for( j = 0; j < graph->node_num; j++ ) { node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)j ); - if( node_uids[i] == node->uid ) + + if ( node && node_uids[i] == node->uid ) { nodes[sz] = (vsi_nn_node_id_t)j; sz ++; @@ -1496,6 +1594,7 @@ uint32_t vsi_nn_GetNodesByUids } sz = graph->node_num; } + return sz; } /* vsi_nn_GetNodesByUids() */ @@ -1536,6 +1635,8 @@ void vsi_nn_DumpGraphNodeOutputsEx vsi_nn_node_t * node; vsi_nn_tensor_t * tensor; + VSI_UNREFERENCED(data_fmt); + if(vsi_nn_CheckFilePath(path) == FALSE) { return ; @@ -1576,6 +1677,7 @@ void vsi_nn_DumpGraphNodeOutputsEx for( i = 0; i < node_num; i++ ) { node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)i ); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); if( node->internal_node_wksp ) /* dump internal nodes if any */ { @@ -1611,7 +1713,9 @@ void vsi_nn_DumpGraphNodeOutputsEx } } } - free( nodes ); + +final: + vsi_nn_safe_free( nodes ); } /* vsi_nn_DumpGraphNodeOutputsEx */ void vsi_nn_PrintGraph @@ -1728,6 +1832,7 @@ void vsi_nn_DumpGraphToJson /* tensor only 1 input node */ in_node = vsi_nn_GetNode(graph, table[0].node); + CHECK_PTR_FAIL_GOTO( in_node, "Get node fail.", final ); if(j == node->input.num - 1) { fprintf(fp, "\"@uid_%u:out%u\" ", in_node->uid, table[0].index); @@ -1847,6 +1952,7 @@ void vsi_nn_DumpGraphToJson fprintf(fp, "\t}\n}\n"); +final: vsi_nn_ReleaseTensorRelevance(graph, tensor_ref); fclose(fp); } /* vsi_nn_DumpGraphToJson() */ @@ -1959,7 +2065,8 @@ vsi_status vsi_nn_setup_binary_graph_inputs_outputs { vsi_nn_node_t* node = vsi_nn_GetNode(graph, i); uint32_t numParams = 0; - if (node->op == VSI_NN_OP_NBG) + + if (node && node->op == VSI_NN_OP_NBG) { status = vxQueryNode( node->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams)); @@ -1968,13 +2075,14 @@ vsi_status vsi_nn_setup_binary_graph_inputs_outputs vx_parameter param = 0; vx_enum type = 0; param = vxGetParameterByIndex(node->n, j); - status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (type == VX_TYPE_SCALAR) - { - num_of_graph_real_inputs++; - } if (param != NULL) { + status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + num_of_graph_real_inputs++; + } + vxReleaseParameter(¶m); param = NULL; } @@ -1997,44 +2105,50 @@ vsi_status vsi_nn_setup_binary_graph_inputs_outputs for (k = 0; k < graph->node_num; k++) { vsi_nn_node_t* node = vsi_nn_GetNode(graph, k); - if (node->op == VSI_NN_OP_NBG) + + if (node && node->op == VSI_NN_OP_NBG) { vx_parameter param = 0; vx_reference ref = 0; vx_enum type = 0; uint32_t scalar_index = j; param = vxGetParameterByIndex(node->n, scalar_index); - status = vxQueryParameter(param, - VX_PARAMETER_TYPE, - &type, - sizeof(vx_enum)); + if (param != NULL) { - vxReleaseParameter(¶m); - param = NULL; - } - if (type != VX_TYPE_SCALAR) - { - break; - } - for (p = scalar_index; p < scalar_index+4; p++) - { - param = vxGetParameterByIndex(node->n, p); status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (type == VX_TYPE_SCALAR) + vxReleaseParameter(¶m); + param = NULL; + + if (type != VX_TYPE_SCALAR) { - vxQueryParameter(param, - VX_PARAMETER_REF, - &ref, - sizeof(vx_reference)); - graph_inputs[j++] = ref; - vxReleaseReference(&ref); + break; } + } + + for (p = scalar_index; p < scalar_index+4; p++) + { + param = vxGetParameterByIndex(node->n, p); + if (param != NULL) { + 
status = vxQueryParameter(param, + VX_PARAMETER_TYPE, + &type, + sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + vxQueryParameter(param, + VX_PARAMETER_REF, + &ref, + sizeof(vx_reference)); + graph_inputs[j++] = ref; + vxReleaseReference(&ref); + } + vxReleaseParameter(&param); } } @@ -2146,6 +2260,8 @@ void vsi_nn_get_tensor_consumers for(i = 0; i < graph->node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + for(j = 0; j < node->input.num; j++) { if(node->input.tensors[j] == tensor_id) @@ -2159,6 +2275,8 @@ void vsi_nn_get_tensor_consumers } } } + +final: if(count != NULL) { *count = nodes_count; @@ -2177,6 +2295,8 @@ void vsi_nn_get_tensor_provider for(i = 0; i < graph->node_num; i++) { cur_node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( cur_node, "Get node fail.", final ); + for(j = 0; j < cur_node->output.num; j++) { if(cur_node->output.tensors[j] == tensor_id) @@ -2186,6 +2306,9 @@ void vsi_nn_get_tensor_provider } } } + +final: + return; } /* vsi_nn_get_tensor_provider() */ vsi_status vsi_nn_SetGraphPreloadSize @@ -2198,6 +2321,10 @@ vsi_status vsi_nn_SetGraphPreloadSize vsi_status status; status = VSI_FAILURE; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(attr); + VSI_UNREFERENCED(size); + #if(defined(VX_PRELOAD_CONST_TENSOR_SUPPORT) && VX_PRELOAD_CONST_TENSOR_SUPPORT) if(graph && graph->g) { @@ -2259,6 +2386,8 @@ vsi_status vsi_nn_SetGraphPriority ) { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(priority); #ifdef VX_GRAPH_PREEMPTION_SUPPORT if(graph && graph->g) { diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index 05b2d2fc1..aafc89038 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -27,7 +27,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" -#include "vsi_nn_test.h" +#include "vsi_nn_error.h" static vsi_bool _is_asymm_int8_norm_tensor @@ -88,6 +88,8 @@ static vsi_status _add_forward_node uint32_t i = 0; uint32_t j = 0; + VSI_UNREFERENCED(graph); + /* Reconnect node tensors */ for(i = 0; i < nodes_count; i++) { @@ -117,8 +119,10 @@ static vsi_status _add_backward_node { uint32_t i = 0; + VSI_UNREFERENCED(graph); + /* Reconnect node output tensors */ - for(i = 0; i < (int32_t)last_node->output.num; i++) + for(i = 0; i < last_node->output.num; i++) { if(last_node->output.tensors[i] == output) { @@ -188,10 +192,13 @@ static void _get_graph_input_asymm_int8_norm_tensor for(i = 0; i < graph->node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + for(j = 0; j < node->input.num; j++) { vsi_nn_tensor_id_t id = node->input.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + if (_is_asymm_int8_norm_tensor(tensor)) { if(tensor_ids != NULL) @@ -211,6 +218,7 @@ static void _get_graph_input_asymm_int8_norm_tensor } } +final: if(count != NULL) { *count = tensor_count; @@ -236,10 +244,13 @@ static void _get_graph_output_asymm_int8_norm_tensor for(i = 0; i < graph->node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + for(j = 0; j < node->output.num; j++) { vsi_nn_tensor_id_t id = node->output.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + if (_is_asymm_int8_norm_tensor(tensor)) { if(tensor_ids != NULL) @@ -251,6 +262,7 @@ static void
_get_graph_output_asymm_int8_norm_tensor } } +final: if(count != NULL) { *count = tensor_count; @@ -280,11 +292,16 @@ static vsi_status _add_graph_dataconvert_for_int8 if(input_count != 0) { input_ids = (vsi_nn_tensor_id_t *)malloc(sizeof(vsi_nn_tensor_id_t) * input_count); + CHECK_PTR_FAIL_GOTO( input_ids, "Create tensor id fail.", final ); + memset(input_ids, 0, sizeof(vsi_nn_tensor_id_t) * input_count); + _get_graph_input_asymm_int8_norm_tensor(graph, NULL, input_ids, &input_valid_count); if ( input_valid_count > 0 ) { input_nodes = (vsi_nn_node_t***)malloc(sizeof(vsi_nn_node_t**) * input_valid_count); + CHECK_PTR_FAIL_GOTO( input_nodes, "Create node fail.", final ); + memset(input_nodes, 0, sizeof(vsi_nn_node_t**) * input_valid_count); } for ( i = 0; i < input_valid_count; i++) @@ -295,6 +312,9 @@ static vsi_status _add_graph_dataconvert_for_int8 if(nodes_count > 0) { input_nodes[i] = (vsi_nn_node_t**)malloc(sizeof(vsi_nn_node_t*)*nodes_count); + CHECK_PTR_FAIL_GOTO( input_nodes[i], "Create node fail.", final ); + memset(input_nodes[i], 0, sizeof(vsi_nn_node_t*) * nodes_count); + vsi_nn_get_tensor_consumers(graph, input_ids[i], input_nodes[i], NULL); *dirty = TRUE; @@ -307,9 +327,14 @@ static vsi_status _add_graph_dataconvert_for_int8 if(output_count > 0) { output_ids = (vsi_nn_tensor_id_t*)malloc(sizeof(vsi_nn_tensor_id_t) * output_count); + CHECK_PTR_FAIL_GOTO( output_ids, "Create tensor id fail.", final ); + memset(output_ids, 0, sizeof(vsi_nn_tensor_id_t) * output_count); + _get_graph_output_asymm_int8_norm_tensor(graph, NULL, output_ids); output_nodes = (vsi_nn_node_t**)malloc(sizeof(vsi_nn_node_t*) * output_count); + CHECK_PTR_FAIL_GOTO( output_nodes, "Create node fail.", final ); + memset(output_nodes, 0, sizeof(vsi_nn_node_t*) * output_count); for ( i = 0; i < output_count; i++) { @@ -325,33 +350,25 @@ static vsi_status _add_graph_dataconvert_for_int8 uint32_t nodes_count = 0; vsi_nn_get_tensor_consumers(graph, input_ids[i], NULL, &nodes_count); - if(nodes_count != 0) + if (nodes_count > 0) { vsi_nn_tensor_id_t id = input_ids[i]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - vsi_nn_tensor_id_t output; + vsi_nn_tensor_id_t output = 0; - memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); - attr.dtype.vx_type = VSI_NN_TYPE_UINT8; - attr.dtype.zero_point += 128; - attr.vtl = TRUE; - output = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); - - _add_dataconvert_node(graph, dataconvert_idx ++, VSI_NN_OPTIMIZE_FORWARD, - input_nodes[i], nodes_count, id, output); - } - if (input_nodes[i] != NULL) - { - free(input_nodes[i]); - input_nodes[i] = NULL; + if (tensor) + { + memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.dtype.zero_point += 128; + attr.vtl = TRUE; + output = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); + + _add_dataconvert_node(graph, dataconvert_idx ++, VSI_NN_OPTIMIZE_FORWARD, + input_nodes[i], nodes_count, id, output); + } } } - - if(input_nodes) - { - free(input_nodes); - input_nodes = NULL; - } } if ( output_count > 0 ) @@ -360,35 +377,36 @@ static vsi_status _add_graph_dataconvert_for_int8 { vsi_nn_tensor_id_t id = output_ids[i]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - vsi_nn_tensor_id_t input; - - memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); - attr.dtype.vx_type = VSI_NN_TYPE_UINT8; - attr.dtype.zero_point += 128; - attr.vtl = TRUE; - input = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); + 
vsi_nn_tensor_id_t input = 0; - _add_dataconvert_node(graph, dataconvert_idx ++, VSI_NN_OPTIMIZE_BACKWARD, - &output_nodes[i], 1, input, id); + if (tensor) + { + memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.dtype.zero_point += 128; + attr.vtl = TRUE; + input = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); + + _add_dataconvert_node(graph, dataconvert_idx ++, VSI_NN_OPTIMIZE_BACKWARD, + &output_nodes[i], 1, input, id); + } } + } - if(output_nodes) +final: + for ( i = 0; i < input_valid_count; i++) + { + if (input_nodes) { - free(output_nodes); - output_nodes = NULL; + vsi_nn_safe_free(input_nodes[i]); } } + vsi_nn_safe_free(input_nodes); - if (input_ids) - { - free(input_ids); - input_ids = NULL; - } - if (output_ids) - { - free(output_ids); - output_ids = NULL; - } + vsi_nn_safe_free(output_nodes); + + vsi_nn_safe_free(input_ids); + vsi_nn_safe_free(output_ids); return status; } /* _add_graph_dataconvert_for_int8() */ @@ -402,7 +420,7 @@ static vsi_status _add_graph_data_convert vsi_status status = VSI_FAILURE; status = _add_graph_dataconvert_for_int8(graph, dirty); - TEST_CHECK_STATUS(status, final); + CHECK_STATUS_FAIL_GOTO(status, final); final: return status; @@ -510,7 +528,7 @@ vsi_status vsi_nn_CopyDataToRawTensor } else { - status = vsi_nn_copy_tensor_patch(tensor, &attr, data, VX_WRITE_ONLY); + status = vsi_nn_copy_tensor_patch(tensor, &attr, data, VX_WRITE_ONLY, NULL, NULL); } _try_set_const_raw_tensor(tensor, attr); @@ -537,11 +555,11 @@ static vx_tensor _create_const_raw_tensor params.num_of_dims = attr.dim_num; for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_vxsize[i] = -1 == attr.size[i] ? -1 : (vx_size)attr.size[i]; + size_vxsize[i] = (vsi_size_t)-1 == attr.size[i] ? (vx_size)-1 : (vx_size)attr.size[i]; } for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_u32[i] = -1 == attr.size[i] ? -1 : (vx_uint32)attr.size[i]; + size_u32[i] = (vsi_size_t)-1 == attr.size[i] ? (vx_uint32)-1 : (vx_uint32)attr.size[i]; } #ifdef VSI_40BIT_VA_SUPPORT params.sizes = size_vxsize; @@ -558,14 +576,19 @@ static vx_tensor _create_const_raw_tensor params.quant_data.dfp.fixed_point_pos = (uint8_t)attr.dtype.fl; break; case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: params.quant_data.affine.scale = attr.dtype.scale; params.quant_data.affine.zeroPoint = (int32_t)attr.dtype.zero_point; break; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT // This is a hack that driver doesn't support const scale scales = (float *)malloc(sizeof(float) * attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); zeroPoints = (int32_t *)malloc(sizeof(attr.dtype.zero_points[0]) * attr.dtype.zero_points_dim); + CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final ); memcpy(scales, attr.dtype.scales, attr.dtype.scale_dim * sizeof(float)); memcpy(zeroPoints, attr.dtype.zero_points, attr.dtype.zero_points_dim * sizeof(attr.dtype.zero_points[0])); params.quant_data.affinePerChannel.channelDim = attr.dtype.channel_dim; @@ -575,7 +598,8 @@ static vx_tensor _create_const_raw_tensor params.quant_data.affinePerChannel.zeroPointCount = attr.dtype.zero_points_dim; break; #else - VSILOGE( "can't support qnt_type VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC." 
); + VSILOGE( "can't support qnt_type VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC" + "or VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8." ); #endif default: break; @@ -622,7 +646,7 @@ static vx_tensor _create_const_raw_tensor vx_size stride_size_vxsize[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(attr.size); i++) { - size[i] = -1 == attr.size[i] ? -1 : (vx_size)attr.size[i]; + size[i] = (vsi_size_t)-1 == attr.size[i] ? (vx_size)-1 : (vx_size)attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { @@ -630,6 +654,7 @@ static vx_tensor _create_const_raw_tensor } addr = vxCreateTensorAddressing(graph->ctx->c, size, stride_size_vxsize, (vx_size)attr.dim_num); + CHECK_PTR_FAIL_GOTO( addr, "Create tensor address fail.", final ); } #else { @@ -637,14 +662,16 @@ static vx_tensor _create_const_raw_tensor uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(attr.size); i++) { - size_32bit[i] = -1 == attr.size[i] ? -1 : (uint32_t)attr.size[i]; + size_32bit[i] = (vsi_size_t)-1 == attr.size[i] ? (uint32_t)-1 : (uint32_t)attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { - stride_size_32bit[i] = -1 == stride_size[i] ? -1 : (uint32_t)stride_size[i]; + stride_size_32bit[i] = (vsi_size_t)-1 == stride_size[i] ? \ + (uint32_t)-1 : (uint32_t)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, size_32bit, stride_size_32bit, (vx_uint8)attr.dim_num); + CHECK_PTR_FAIL_GOTO( addr, "Create tensor address fail.", final ); } #endif #ifdef VX_13_NN_COMPATIBLITY @@ -687,18 +714,12 @@ static vx_tensor _create_const_raw_tensor } final: - if( NULL == tensor ) + if ( NULL == tensor ) { VSILOGE( "Create vx tensor fail." ); } - if( scales ) - { - free( scales ); - } - if (zeroPoints) - { - free( zeroPoints ); - } + vsi_nn_safe_free(scales); + vsi_nn_safe_free(zeroPoints); return tensor; } /* _create_const_raw_tensor() */ @@ -745,20 +766,23 @@ static void _convert_const_I8toU8 { uint8_t * data = NULL; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - vsi_nn_tensor_attr_t *attr = &tensor->attr; + vsi_nn_tensor_attr_t *attr = NULL; vsi_size_t sz = 0; vsi_size_t i = 0; + CHECK_PTR_FAIL_GOTO( tensor, "Get tensor fail.", final ); + attr = &tensor->attr; + sz = vsi_nn_GetElementNum( tensor ); data = vsi_nn_ConvertTensorToData( graph, tensor ); - if( NULL == data ) + if ( NULL == data ) { VSILOGE( "Convert data fail." 
); return ; } - for( i = 0; i < sz; i++ ) + for ( i = 0; i < sz; i++ ) { data[i] = data[i] ^ 0x80; } @@ -769,6 +793,7 @@ static void _convert_const_I8toU8 if ( tensor->t ) vxReleaseTensor(&tensor->t); tensor->t = vsi_nn_CreateRawTensorFromData(graph, data, attr); +final: vsi_nn_safe_free( data ); }/* _convert_const_I8toU8() */ @@ -777,7 +802,7 @@ static vsi_status _convert_graph_const_tensor vsi_nn_graph_t* graph ) { - vsi_status status = VSI_SUCCESS; + vsi_status status = VSI_FAILURE; uint32_t node_num = graph->node_num; vsi_nn_node_t* node = NULL; uint32_t i = 0; @@ -786,6 +811,8 @@ static vsi_status _convert_graph_const_tensor for(i = 0; i < node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + for(j = 0; j < node->input.num; j++) { vsi_nn_tensor_id_t id = node->input.tensors[j]; @@ -797,7 +824,9 @@ static vsi_status _convert_graph_const_tensor } } } + status = VSI_SUCCESS; +final: return status; } /* _convert_graph_const_tensor() */ @@ -829,23 +858,26 @@ static vsi_status _convert_graph_virtual_tensor for(i = 0; i < node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + for(j = 0; j < node->input.num; j++) { - vsi_nn_tensor_id_t id = node->input.tensors[j]; - vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + vsi_nn_tensor_id_t id = node->input.tensors[j]; + vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - status = _convert_virtual_tensor_attr(tensor); + status = _convert_virtual_tensor_attr(tensor); } for(j = 0; j < node->output.num; j++) { - vsi_nn_tensor_id_t id = node->output.tensors[j]; - vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); + vsi_nn_tensor_id_t id = node->output.tensors[j]; + vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - status = _convert_virtual_tensor_attr(tensor); + status = _convert_virtual_tensor_attr(tensor); } } +final: return status; } /* _convert_graph_virtual_tensor() */ @@ -857,13 +889,13 @@ static vsi_status _graph_optimization_convert_int8_to_uint8 { vsi_status status = VSI_FAILURE; status = _convert_graph_virtual_tensor(graph); - TEST_CHECK_STATUS(status, final); + CHECK_STATUS_FAIL_GOTO(status, final); status = _convert_graph_const_tensor(graph); - TEST_CHECK_STATUS(status, final); + CHECK_STATUS_FAIL_GOTO(status, final); status = _add_graph_data_convert(graph, dirty); - TEST_CHECK_STATUS(status, final); + CHECK_STATUS_FAIL_GOTO(status, final); final: return status; @@ -875,13 +907,15 @@ vsi_status vsi_nn_OptimizeGraph vsi_bool *dirty ) { - vsi_status status = VSI_SUCCESS; + vsi_status status = VSI_FAILURE; uint32_t i = 0; vsi_bool nbg_flag = FALSE; vsi_nn_node_t* node = NULL; for(i = 0; i < graph->node_num; i++) { node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO( node, "Get node fail.", final ); + if(node->op == VSI_NN_OP_NBG) { nbg_flag = TRUE; @@ -889,10 +923,12 @@ vsi_status vsi_nn_OptimizeGraph } } + status = VSI_SUCCESS; + if (!nbg_flag && graph->ctx->options.enable_asymi8_to_u8) { status = _graph_optimization_convert_int8_to_uint8(graph, dirty); - TEST_CHECK_STATUS(status, final); + CHECK_STATUS_FAIL_GOTO(status, final); } final: diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index 24265a11b..ff5b1cce0 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -41,9 +41,9 @@ /********************************************************** * MACROS 
**********************************************************/ -#define LINKLIST_APPEND( _HEAD, _ITEM ) do { \ +#define LINKLIST_APPEND( _HEAD, _ITEM ) { \ vsi_nn_LinkListPushEnd((vsi_nn_link_list_t **)&(_HEAD), \ - (vsi_nn_link_list_t *)(_ITEM) ); } while( 0 ) + (vsi_nn_link_list_t *)(_ITEM) ); } #define WKSP(_NODE_PTR) ((vsi_nn_internal_node_wksp_t *) \ ((_NODE_PTR)->internal_node_wksp)) @@ -214,6 +214,7 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor { case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: scale = input_attr->dtype.scale; break; @@ -235,6 +236,7 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor { case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: attr.dtype.scale = weight_attr->dtype.scale * scale; attr.dtype.zero_point = 0; attr.dtype.qnt_type = weight_attr->dtype.qnt_type; @@ -702,22 +704,48 @@ vsi_status vsi_nn_internal_optimize_node { vsi_status status = VSI_SUCCESS; vsi_nn_internal_node_t* curr = NULL; + int32_t n = 0; curr = WKSP(node)->nodes; - while( NULL != curr ) + n = (int32_t)vsi_nn_LinkListGetNodeNumber((vsi_nn_link_list_t *)WKSP(node)); + + if (direction == VSI_NN_OPTIMIZE_BACKWARD) { - VSILOGD("Optimize node uid[%u] sub_uid[%u] op[%s]", - node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); + int32_t i = 0; - status = vsi_nn_OpOptimize( curr->node->op, curr->node, - curr->inputs, curr->outputs, direction ); - if( VSI_SUCCESS != status ) + for ( i = n - 1; i >= 0; i-- ) { - VSILOGE("op_optimize fail %d", curr->node->op); - break; + curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)WKSP(node), i); + VSILOGD("Optimize backward for node uid[%u] sub_uid[%u] op[%s]", + node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); + + status = vsi_nn_OpOptimize( curr->node->op, curr->node, + curr->inputs, curr->outputs, direction ); + if ( VSI_SUCCESS != status ) + { + VSILOGE("op_optimize backward fail %d", curr->node->op); + break; + } + } + } + else + { + while( NULL != curr ) + { + VSILOGD("Optimize forward for node uid[%u] sub_uid[%u] op[%s]", + node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); - curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)curr ); + status = vsi_nn_OpOptimize( curr->node->op, curr->node, + curr->inputs, curr->outputs, direction ); + if( VSI_SUCCESS != status ) + { + VSILOGE("op_optimize forward fail %d", curr->node->op); + break; + } + + curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)curr ); + } } return status; diff --git a/src/tim/vx/internal/src/vsi_nn_kernel_prv.h b/src/tim/vx/internal/src/vsi_nn_kernel_prv.h index fa01a5e37..76b1cc01f 100644 --- a/src/tim/vx/internal/src/vsi_nn_kernel_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_kernel_prv.h @@ -55,6 +55,12 @@ vsi_bool vsi_nn_is_sp_supported_broadcast vsi_nn_tensor_t* output ); +vsi_bool vsi_nn_kernel_optimize_element_shape_with_max_rank + ( + const vsi_size_t* shape_x, const vsi_size_t rank_x, + vsi_size_t* out_shape_x, vsi_size_t* out_rank_x, vsi_size_t max_rank + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/src/vsi_nn_node.c b/src/tim/vx/internal/src/vsi_nn_node.c index f13e80b67..4ffd68769 100644 --- a/src/tim/vx/internal/src/vsi_nn_node.c +++ b/src/tim/vx/internal/src/vsi_nn_node.c @@ -48,7 +48,7 @@ vsi_nn_node_t * vsi_nn_NewNode if(NULL == graph || FALSE == 
vsi_nn_OpIsValid(op)) { VSILOGE("Create node %s. fail", vsi_nn_OpGetName(op)); - return NULL; + goto final; } node = (vsi_nn_node_t *)malloc( sizeof( vsi_nn_node_t ) ); @@ -73,23 +73,41 @@ vsi_nn_node_t * vsi_nn_NewNode node->output.num = (uint32_t)output_num; node->output.tensors = (vsi_nn_tensor_id_t *) malloc( output_num * sizeof( vsi_nn_tensor_id_t ) ); + if (NULL == node->output.tensors) + { + goto final; + } vsi_nn_InitTensorsId( node->output.tensors, (uint32_t)output_num ); /* init input struct */ node->input.num = (uint32_t)input_num; node->input.tensors = (vsi_nn_tensor_id_t *) malloc( input_num * sizeof( vsi_nn_tensor_id_t ) ); + if (NULL == node->input.tensors) + { + goto final; + } vsi_nn_InitTensorsId( node->input.tensors, (uint32_t)input_num ); node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; node->attr.enable_op_constraint_check = TRUE; } else { - return NULL; + goto final; } node->uid = VSI_NN_NODE_UID_NA; + return node; +final: + if (node) + { + vsi_nn_safe_free(node->output.tensors); + vsi_nn_safe_free(node->input.tensors); + } + vsi_nn_safe_free(node); + + return NULL; } /* vsi_nn_NewNode() */ /* @@ -214,6 +232,8 @@ vsi_status vsi_nn_update_node_attr { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(node); + #if(defined(VX_PRELOAD_CONST_TENSOR_SUPPORT) && VX_PRELOAD_CONST_TENSOR_SUPPORT) if(node) { diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index b3e2ef191..b7f8b706e 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -205,13 +205,14 @@ static _node_template s_template[] = /* MAXUNPOOL */ NULL, /* REVERSESEQUENCE */ NULL, /* LPNORM */ NULL, + /* RESIZE_3D */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); void vsi_nn_apply_node_attr_template ( vsi_nn_node_t * node ) { - if( node->op >= _cnt_of_array( s_template ) ) + if( node->op >= (vsi_nn_op_t)_cnt_of_array( s_template ) ) { VSILOGW( "Unsupport operation id %d.", node->op ); return; diff --git a/src/tim/vx/internal/src/vsi_nn_ops.c b/src/tim/vx/internal/src/vsi_nn_ops.c index 8ca7df26e..b706240c6 100644 --- a/src/tim/vx/internal/src/vsi_nn_ops.c +++ b/src/tim/vx/internal/src/vsi_nn_ops.c @@ -298,6 +298,9 @@ void vsi_nn_OpGetIoNum ) { const vsi_nn_op_proc_t * proc; + + VSI_UNREFERENCED(node); + proc = vsi_nn_OpGetProc( op ); if( NULL != proc ) { diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index 63c80f112..265d9221d 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -170,26 +170,46 @@ static void _set_preproc_node_rect_params static void _set_preproc_node_norm_params ( vsi_nn_node_t* node, - vsi_nn_preprocess_mean_and_scale_t* mean_and_scale, - vsi_nn_tensor_attr_t* attr + vsi_nn_preprocess_type_e type, + void* mean_and_scale ) { int32_t i = 0; if(mean_and_scale != NULL) { - for(i = 0; i < mean_and_scale->channel_len; i++) + if (type == VSI_NN_PREPROCESS_MEAN_AND_SCALE) { - node->nn_param.pre_process.norm.mean[i] = mean_and_scale->channel_mean[i]; + vsi_nn_preprocess_mean_and_scale_t* means_and_single_scale = + (vsi_nn_preprocess_mean_and_scale_t*)mean_and_scale; + node->nn_param.pre_process.norm2.scale[0] = means_and_single_scale->scale; + node->nn_param.pre_process.norm2.scale[1] = means_and_single_scale->scale; + node->nn_param.pre_process.norm2.scale[2] = 
means_and_single_scale->scale; + for(i = 0; i < means_and_single_scale->channel_len; i++) + { + node->nn_param.pre_process.norm.mean[i] = means_and_single_scale->channel_mean[i]; + } + } + else if (type == VSI_NN_PREPROCESS_MEANS_AND_SCALES) + { + vsi_nn_preprocess_means_and_scales_t* means_and_scales = + (vsi_nn_preprocess_means_and_scales_t*)mean_and_scale; + for (i = 0; i < means_and_scales->scale_len; i++) + { + node->nn_param.pre_process.norm2.scale[i] = means_and_scales->scale[i]; + } + for(i = 0; i < means_and_scales->channel_len; i++) + { + node->nn_param.pre_process.norm.mean[i] = means_and_scales->channel_mean[i]; + } + } - node->nn_param.pre_process.norm.scale = mean_and_scale->scale; } else { - for(i = 0; i < (int32_t)attr->dim_num - 1; i++) + for(i = 0; i < 3; i++) { node->nn_param.pre_process.norm.mean[i] = 0; + node->nn_param.pre_process.norm2.scale[i] = 1.0f; } - node->nn_param.pre_process.norm.scale = 1.0f; } } /* _set_preproc_node_norm_params() */ @@ -268,7 +288,7 @@ static void _set_preproc_node_input_attr if(*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY) { - if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC && input_size != NULL) { input_attr->size[0] = input_size->w; input_attr->size[1] = input_size->h; @@ -453,7 +473,7 @@ vsi_status vsi_nn_add_single_preproc_node vsi_nn_node_t* node = NULL; vsi_nn_preprocess_image_size_t* input_size = NULL; vsi_nn_preprocess_crop_t* crop = NULL; - vsi_nn_preprocess_mean_and_scale_t* mean_and_scale = NULL; + void* mean_and_scale = NULL; vsi_nn_preprocess_permute_t* permute = NULL; vsi_nn_preprocess_image_resize_t* image_resize = NULL; vsi_nn_preprocess_dtype_convert_t* data_convert = NULL; @@ -462,6 +482,7 @@ vsi_status vsi_nn_add_single_preproc_node vsi_nn_tensor_id_t preproc_inputs[3] = {0}; vsi_nn_tensor_id_t preproc_output; vsi_nn_tensor_t* org_norm_tensor = NULL; + vsi_nn_preprocess_type_e mean_and_scale_type = VSI_NN_PREPROCESS_MEAN_AND_SCALE; uint32_t node_input_num = 1; int32_t reverse_channel = 0; uint32_t i = 0; @@ -501,6 +522,11 @@ vsi_status vsi_nn_add_single_preproc_node else if(preprocess[idx].type == VSI_NN_PREPROCESS_IMAGE_SIZE) input_size = (vsi_nn_preprocess_image_size_t*)preprocess[idx].param; + else if(preprocess[idx].type == VSI_NN_PREPROCESS_MEANS_AND_SCALES) + { + mean_and_scale = (vsi_nn_preprocess_means_and_scales_t*)preprocess[idx].param; + mean_and_scale_type = VSI_NN_PREPROCESS_MEANS_AND_SCALES; + } else { VSILOGE("preprocess[%d] type is not support, please have a check!", idx); @@ -509,13 +535,20 @@ vsi_status vsi_nn_add_single_preproc_node } } - if(source_layout == NULL) + if (source_layout == NULL) { VSILOGE("Preprocess source layout need to be set!"); status = VSI_FAILURE; TEST_CHECK_STATUS(status, final); } + if (source_format == NULL) + { + VSILOGE("Preprocess source format needs to be set!"); + status = VSI_FAILURE; + TEST_CHECK_STATUS(status, final); + } + /* Add preprocess node */ if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || @@ -530,6 +563,7 @@ vsi_status vsi_nn_add_single_preproc_node } node = vsi_nn_AddNode(graph, VSI_NN_OP_PRE_PROCESS, node_input_num, 1, NULL); + TEST_CHECK_PTR(node, final); node->uid = (uint32_t)(VSI_NN_PREPROC_NODE_UID_BASE) + input_idx; /* Set preprocess node parameters */ @@ -537,7 +571,7 @@ vsi_status vsi_nn_add_single_preproc_node TEST_CHECK_STATUS(status, final);
_set_preproc_node_rect_params(node, crop, input_size, source_format); - _set_preproc_node_norm_params(node, mean_and_scale, &org_norm_tensor->attr); + _set_preproc_node_norm_params(node, mean_and_scale_type, mean_and_scale); if(permute != NULL) { @@ -698,7 +732,17 @@ vsi_status vsi_nn_add_single_postproc_node } /* Reconnect node tensors */ + if (NULL == node->input.tensors) + { + status = VSI_FAILURE; + goto final; + } node->input.tensors[0] = postproc_input; + if (NULL == node->output.tensors) + { + status = VSI_FAILURE; + goto final; + } node->output.tensors[0] = postproc_output; for(i = 0; i < last_node->output.num; i++) { @@ -800,7 +844,7 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam uint32_t num_of_graph_outputs; uint32_t num_of_graph_real_outputs; vx_reference* graph_outputs = NULL; - vsi_nn_tensor_t* tensor; + vsi_nn_tensor_t* tensor = NULL; vsi_nn_node_t** nodes = NULL; vsi_nn_node_t* node = NULL; vsi_nn_node_id_t* processed_node_id_list = NULL; @@ -866,11 +910,13 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam } } } + vsi_nn_safe_free(nodes); } } graph_inputs = (vx_reference*)malloc(num_of_graph_real_inputs * sizeof(vx_reference)); TEST_CHECK_PTR( graph_inputs, final ); + memset(graph_inputs, 0, num_of_graph_inputs * sizeof(vx_reference)); memset(processed_node_id_list, 0, num_of_graph_inputs * sizeof(vsi_nn_node_id_t)); processed_idx = 0; for (i = 0, j=0; i < num_of_graph_inputs; i++) @@ -879,6 +925,7 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam vsi_bool enabled = FALSE; uint32_t nodes_count = 0; tensor = vsi_nn_GetTensor(graph, graph->input.tensors[i]); + TEST_CHECK_PTR( tensor, final ); vsi_nn_get_tensor_consumers(graph, graph->input.tensors[i], NULL, &nodes_count); if (nodes_count != 0) { @@ -937,19 +984,22 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam vx_enum data_type = 0; param = vxGetParameterByIndex(prenode, p); - vxQueryParameter(param, - VX_PARAMETER_TYPE, - &type, - sizeof(vx_enum)); - vxQueryParameter(param, - VX_PARAMETER_DIRECTION, - &direction, - sizeof(vx_enum)); - if (direction != VX_INPUT) continue; - vxQueryParameter(param, - VX_PARAMETER_REF, - &ref, - sizeof(vx_reference)); + if (param) + { + vxQueryParameter(param, + VX_PARAMETER_TYPE, + &type, + sizeof(vx_enum)); + vxQueryParameter(param, + VX_PARAMETER_DIRECTION, + &direction, + sizeof(vx_enum)); + if (direction != VX_INPUT) continue; + vxQueryParameter(param, + VX_PARAMETER_REF, + &ref, + sizeof(vx_reference)); + } if (type == VX_TYPE_TENSOR) { graph_inputs[j++] = ref; @@ -986,6 +1036,7 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam } } } + vsi_nn_safe_free(nodes); } } num_of_graph_outputs = graph->output.num; @@ -1003,6 +1054,8 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam } graph_outputs = (vx_reference*)malloc(num_of_graph_real_outputs * sizeof(vx_reference)); TEST_CHECK_PTR( graph_outputs, final ); + memset(graph_outputs, 0, num_of_graph_real_outputs * sizeof(vx_reference)); + for (i = 0, j = 0; i < num_of_graph_outputs; i++) { tensor = vsi_nn_GetTensor(graph, graph->output.tensors[i]); @@ -1063,7 +1116,7 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph uint32_t i, j; uint32_t numParams = 0; int32_t scalar_value[4] = {0}; - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; uint32_t input_idx = enabled_crop_input_idx; scalar_value[0] = (int32_t)((crop_w << 15) / dst_w); scalar_value[1] = (int32_t)((crop_h << 15) / dst_h); @@ -1073,7 +1126,7 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph for (i = 0; i < graph->node_num; i++) { 
vsi_nn_node_t* node = vsi_nn_GetNode(graph, i); - if (node->op == VSI_NN_OP_NBG) + if (node && node->op == VSI_NN_OP_NBG) { vx_parameter param = 0; vx_enum type = 0; @@ -1081,16 +1134,19 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph uint32_t scalar_idx = 0; uint32_t scalar_value_idx = 0; int32_t temp_value = 0; - status = vxQueryNode(node->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams)); + status |= vxQueryNode(node->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams)); for (j = 0; j < numParams; j++) { param = vxGetParameterByIndex(node->n, j); - status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (type == VX_TYPE_SCALAR) + + if (param) { - scalar_idx = j; - break; + status |= vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + scalar_idx = j; + break; + } } } while (input_idx > 0) { @@ -1099,12 +1155,15 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph for (j = tensor_idx; j < numParams; j++) { param = vxGetParameterByIndex(node->n, j); - status = vxQueryParameter( - param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (type == VX_TYPE_SCALAR) + if (param) { - scalar_idx = j; - break; + status |= vxQueryParameter( + param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + scalar_idx = j; + break; + } } } input_idx--; @@ -1113,12 +1172,15 @@ vsi_status vsi_nn_UpdateCropParamsForBinaryGraph { temp_value = scalar_value[scalar_value_idx++]; param = vxGetParameterByIndex(node->n, j); - status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); - if (type == VX_TYPE_SCALAR) + if (param) { - status = vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); - status = vxWriteScalarValue((vx_scalar)ref, &temp_value); - status = vxSetParameterByIndex(node->n, j, ref); + status |= vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + status |= vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); + status |= vxWriteScalarValue((vx_scalar)ref, &temp_value); + status |= vxSetParameterByIndex(node->n, j, ref); + } } } diff --git a/src/tim/vx/internal/src/vsi_nn_rnn.c b/src/tim/vx/internal/src/vsi_nn_rnn.c index 2a3baabaa..545f7dcb6 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn.c @@ -31,6 +31,7 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_rnn_prv.h" #include "vsi_nn_internal_node.h" +#include "vsi_nn_error.h" /********************************************************** * MACROS @@ -54,6 +55,12 @@ static vsi_status internal_buffer_init vsi_size_t data_size = 0; uint8_t* data = NULL; + if( NULL == tensor ) + { + VSILOGE("input tensor is NULL."); + return status; + } + if( TRUE == tensor->attr.vtl ) { VSILOGE("Internal tensors cannot be dumpped."); @@ -72,7 +79,7 @@ static vsi_status internal_buffer_init stride = vsi_nn_TypeGetBytes( tensor->attr.dtype.vx_type ); data = (uint8_t *)malloc(data_size); - if( NULL == buffer ) + if ( NULL == data ) { VSILOGE("Out of memoery."); goto error; } @@ -136,6 +143,11 @@ static vsi_status internal_buffer_copy_to_tensor } tensor = vsi_nn_GetTensor( graph, tensorid ); + if ( NULL == tensor ) + { + VSILOGE("tensor is NULL."); + return status; + } request_data_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type ); if( request_data_size != buffer->data_size ) { @@ -167,6 +179,7 @@ static vsi_status internal_buffer_copy_from_tensor } tensor =
vsi_nn_GetTensor( graph, tensorid ); + CHECK_PTR_FAIL_GOTO( tensor, "Get tensor fail.", final ); request_data_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type ); if( request_data_size != buffer->data_size ) { @@ -181,6 +194,7 @@ static vsi_status internal_buffer_copy_from_tensor status = VSI_SUCCESS; } +final: vsi_nn_safe_free( data ); return status; @@ -366,6 +380,8 @@ vsi_status vsi_nn_rnn_InitWksp memcpy( &cur_conn->connection, &connections[i], sizeof( connections[i] ) ); output_tensor = vsi_nn_GetTensor( graph, cur_conn->connection.output ); + CHECK_PTR_FAIL_GOTO( output_tensor, "Get tensor fail.", OnError ); + for( j = 0; j < VSI_NN_MAX_RNN_CONNECTION_INPUTS; j++ ) { if( VSI_NN_TENSOR_ID_NA == cur_conn->connection.inputs[j] ) @@ -374,6 +390,8 @@ vsi_status vsi_nn_rnn_InitWksp } /* make sure input tensors have the same size and dtype with output tensor */ input_tensor = vsi_nn_GetTensor( graph, cur_conn->connection.inputs[j] ); + CHECK_PTR_FAIL_GOTO( input_tensor, "Get tensor fail.", OnError ); + if( output_tensor->attr.dim_num != input_tensor->attr.dim_num || output_tensor->attr.dtype.vx_type != input_tensor->attr.dtype.vx_type || 0 != memcmp(output_tensor->attr.size, input_tensor->attr.size, @@ -399,6 +417,8 @@ vsi_status vsi_nn_rnn_InitWksp if( cur_conn->connection_inputs_count == 1 ) { input_tensor = vsi_nn_GetTensor( graph, cur_conn->connection.inputs[0] ); + CHECK_PTR_FAIL_GOTO( input_tensor, "Get tensor fail.", OnError ); + if( output_tensor && output_tensor->attr.is_created_from_handle && input_tensor && input_tensor->attr.is_created_from_handle ) { @@ -421,7 +441,7 @@ vsi_status vsi_nn_rnn_InitWksp OnError: vsi_nn_safe_free( cur_conn ); - return status; + return VSI_FAILURE; } /* vsi_nn_rnn_InitWksp() */ vsi_status vsi_nn_rnn_ResetBuffers diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c index 9466d3d60..44ab53eee 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c @@ -33,6 +33,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_util.h" #include "vsi_nn_rnn_helper.h" +#include "vsi_nn_error.h" vsi_bool vsi_nn_rnn_find_best_kernel_size ( @@ -121,9 +122,12 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_in_size, tmp_inode, "Create internal buffer failed", final); reshape_in_size[3] = input->attr.size[1]; reshape_in_size[2] = input->attr.size[0] / (kernel_h * kernel_w); @@ -145,13 +149,17 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc c = tensor1->t->attr.size[1]; reshape_size[2] = tensor1->t->attr.size[3]; - reshape_size[1] = -1; + reshape_size[1] = (vsi_size_t)-1; reshape_size[0] = tensor1->t->attr.size[0]; tensor0 = vsi_nn_rnn_create_reshape(self, tensor1->t, NULL, reshape_size, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor0, "Create internal tensor failed", final); tensor2 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + 
CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 3 * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create internal buffer failed", final); permute_in_perm[0] = 2; permute_in_perm[1] = 1; permute_in_perm[2] = 0; @@ -174,6 +182,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc tensor1 = NULL; } +final: return tensor1; } @@ -196,6 +205,9 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc vsi_nn_tensor_t* tensor = input; vsi_bool ret = FALSE; + VSI_UNREFERENCED(kernel_h); + VSI_UNREFERENCED(kernel_w); + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); @@ -208,13 +220,18 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc c = tensor->attr.size[1]; reshape_size[2] = tensor->attr.size[3]; - reshape_size[1] = -1; + reshape_size[1] = (vsi_size_t)-1; reshape_size[0] = tensor->attr.size[0]; tensor0 = vsi_nn_rnn_create_reshape(self, tensor, NULL, reshape_size, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor0, "Create internal tensor failed", final); tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 3 * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create internal buffer failed", final); permute_in_perm[0] = 2; permute_in_perm[1] = 1; @@ -231,13 +248,17 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc reshape_size[1] = c; reshape_size[0] = tensor1->t->attr.size[0]; tensor0 = vsi_nn_rnn_create_reshape(self, tensor1->t, NULL, reshape_size, 4, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor0, "Create internal tensor failed", final); tensor = tensor0->t; } tensor2 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); reshape_in_size = (vsi_size_t *)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_in_size, tmp_inode, "Create internal buffer failed", final); reshape_in_size[1] = tensor->attr.size[3]; reshape_in_size[0] = tensor->attr.size[2]; @@ -252,6 +273,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc tensor2 = NULL; } +final: return tensor2; } @@ -272,6 +294,10 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 uint32_t* permute_in_perm = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; vsi_nn_tensor_t* tensor = input; + vsi_bool ret = FALSE; + + VSI_UNREFERENCED(kernel_h); + VSI_UNREFERENCED(kernel_w); memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); @@ -285,13 +311,17 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 c = tensor->attr.size[1]; reshape_size[2] = tensor->attr.size[3]; - reshape_size[1] = -1; + reshape_size[1] = (vsi_size_t)-1; reshape_size[0] = tensor->attr.size[0]; tensor0 = 
vsi_nn_rnn_create_reshape(self, tensor, NULL, reshape_size, 3, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor0, "Create internal tensor failed", final); tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 3 * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create internal buffer failed", final); permute_in_perm[0] = 2; permute_in_perm[1] = 1; @@ -308,12 +338,15 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 reshape_size[1] = c; reshape_size[0] = tensor1->t->attr.size[0]; tensor0 = vsi_nn_rnn_create_reshape(self, tensor1->t, NULL, reshape_size, 4, use_virtual_tensor); + CHECK_PTR_FAIL_GOTO(tensor0, "Create internal tensor failed", final); tensor = tensor0->t; } tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_in_size, tmp_inode, "Create internal buffer failed", final); reshape_in_size[1] = tensor->attr.size[3]; reshape_in_size[0] = tensor->attr.size[2]; @@ -322,9 +355,10 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 tmp_inode->node->nn_param.reshape2.dim_num = 2; tmp_inode->inputs[0] = tensor; tmp_inode->outputs[0] = output; - vsi_nn_internal_setup_node(self, tmp_inode); + ret = vsi_nn_internal_setup_node(self, tmp_inode); - return TRUE; +final: + return ret; } vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc @@ -351,12 +385,16 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc /* create zero bias for NN/TP */ tensor1 = vsi_nn_internal_create_zero_bias_tensor( self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); + CHECK_PTR_FAIL_GOTO( tensor1, "Create tensor fail.", final ); + tensor = tensor1->t; } vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.fcl.axis = 0; tmp_inode->node->nn_param.fcl.weights = (uint32_t)weight->attr.size[1]; @@ -370,6 +408,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc tensor2 = NULL; } +final: return tensor2; } @@ -400,15 +439,19 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc /* create zero bias for NN/TP */ tensor1 = vsi_nn_internal_create_zero_bias_tensor( self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); + CHECK_PTR_FAIL_GOTO( tensor1, "Create tensor fail.", final ); tensor = tensor1->t; } vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); reshaped_weight_tensor = vsi_nn_rnn_prepare_weight_for_nn_fc(self, weight, kernel_h, kernel_w); + CHECK_PTR_FAIL_GOTO(reshaped_weight_tensor, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.conv2d.ksize[0] 
= kernel_w; tmp_inode->node->nn_param.conv2d.ksize[1] = kernel_h; tmp_inode->node->nn_param.conv2d.stride[0] = 1; @@ -432,6 +475,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc tensor2 = NULL; } +final: return tensor2; } @@ -459,6 +503,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_prepare_weight_for_nn_fc memcpy( &attr.dtype, &weight->attr.dtype, sizeof(attr.dtype)); memcpy( &attr.size, &reshaped_weight_shape, sizeof(attr.size)); reshaped_weight_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(reshaped_weight_tensor, "Create internal tensor failed", final); vsi_nn_ReshapeTensor( self->graph, weight, reshaped_weight_tensor->t, reshaped_weight_shape, 4 ); @@ -468,6 +513,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_prepare_weight_for_nn_fc vsi_nn_SetTensorAttr(reshaped_weight_tensor->t, VSI_NN_TENSOR_ATTR_CONST); } +final: return reshaped_weight_tensor; } @@ -499,15 +545,20 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc_relu /* create zero bias for NN/TP */ tensor1 = vsi_nn_internal_create_zero_bias_tensor( self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); + + CHECK_PTR_FAIL_GOTO( tensor1, "Create tensor fail.", final ); tensor = tensor1->t; } vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor2 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor2, "Create internal tensor failed", final); reshaped_weight_tensor = vsi_nn_rnn_prepare_weight_for_nn_fc(self, weight, kernel_h, kernel_w); + CHECK_PTR_FAIL_GOTO(reshaped_weight_tensor, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV_RELU, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.conv2d.ksize[0] = kernel_w; tmp_inode->node->nn_param.conv2d.ksize[1] = kernel_h; tmp_inode->node->nn_param.conv2d.stride[0] = 1; @@ -536,6 +587,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc_relu tensor2 = NULL; } +final: return tensor2; } @@ -556,8 +608,10 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_add memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_ADD, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->inputs[0] = input1; tmp_inode->inputs[1] = input2; @@ -567,6 +621,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_add { tensor1 = NULL; } + +final: return tensor1; } @@ -612,8 +668,10 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_activation memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, vsi_nn_rnn_get_act_op_type(act_type), 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->inputs[0] = input; tmp_inode->node->nn_param.tanh.scale_a = 1.0f; @@ -625,6 +683,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_activation tensor1 = NULL; } +final: return tensor1; } @@ -649,11 +708,14 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = 
vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PERMUTE, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, curr, "Create internal buffer failed", final); permute_in_perm[0] = 0; permute_in_perm[1] = 2; permute_in_perm[2] = 1; @@ -676,10 +738,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major output_tensor = NULL; } +final: return output_tensor; } -void vsi_nn_rnn_split_input_tensor +vsi_status vsi_nn_rnn_split_input_tensor ( vsi_nn_node_t * self, vsi_nn_tensor_t * input, @@ -688,6 +751,7 @@ void vsi_nn_rnn_split_input_tensor vsi_bool use_virtual_tensor ) { + vsi_status status = VSI_FAILURE; uint32_t* slices = NULL; vsi_nn_internal_node_t* curr = NULL; vsi_nn_tensor_attr_t attr; @@ -696,7 +760,9 @@ void vsi_nn_rnn_split_input_tensor memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, time_step ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, time_step * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(slices, curr, "Create internal buffer failed", final); curr->node->nn_param.split.axis = 2; /* timestep axis */ curr->node->nn_param.split.slices_num = time_step; curr->inputs[0] = input; @@ -707,13 +773,18 @@ void vsi_nn_rnn_split_input_tensor slices[i] = 1; vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(output_tensor, curr, "Create internal tensor failed", final); curr->outputs[i] = output_tensor->t; output[i] = output_tensor->t; } vsi_nn_internal_setup_node( self, curr ); + + status = VSI_SUCCESS; +final: + return status; } -void vsi_nn_rnn_data_check_aligned +vsi_status vsi_nn_rnn_data_check_aligned ( vsi_nn_node_t * self, vsi_nn_tensor_t ** input, @@ -721,6 +792,7 @@ void vsi_nn_rnn_data_check_aligned vsi_bool use_virtual_tensor ) { + vsi_status status = VSI_FAILURE; vsi_nn_internal_node_t* curr = NULL; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; @@ -736,8 +808,10 @@ void vsi_nn_rnn_data_check_aligned { vsi_nn_internal_init_tensor_attr(&attr, &input[i]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); curr->inputs[0] = input[i]; curr->outputs[0] = output_tensor->t; vsi_nn_internal_setup_node( self, curr ); @@ -747,6 +821,10 @@ void vsi_nn_rnn_data_check_aligned ofst += tensor_size; } + + status = VSI_SUCCESS; +final: + return status; } vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output @@ -767,11 +845,14 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output /* reshape for split output */ vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + 
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_split_size = (vsi_size_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - reshape_split_size[0] = -1; + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_split_size, curr, "Create internal buffer failed", final); + reshape_split_size[0] = (vsi_size_t)-1; reshape_split_size[1] = batch_size; curr->node->nn_param.reshape2.size = reshape_split_size; @@ -784,6 +865,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output output_tensor = NULL; } +final: return output_tensor; } @@ -806,11 +888,14 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_cell_output /* reshape output to 3-dims */ vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_grucell_output_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - reshape_grucell_output_size[0] = -1; + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_grucell_output_size, curr, "Create internal buffer failed", final); + reshape_grucell_output_size[0] = (vsi_size_t)-1; reshape_grucell_output_size[1] = batch_size; reshape_grucell_output_size[2] = 1; @@ -824,6 +909,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_cell_output output_tensor = NULL; } +final: return output_tensor; } @@ -845,8 +931,10 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_binary_operator memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(output_tensor, "Create internal tensor failed", final); tmp_inode = vsi_nn_internal_new_node(self, op, 0, 0 ); + CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final); tmp_inode->node->nn_param.multiply.scale = 1.0f; tmp_inode->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; tmp_inode->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; @@ -859,6 +947,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_binary_operator output_tensor = NULL; } +final: return output_tensor; } @@ -876,9 +965,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_concat_impl vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* tmp_tensor = NULL; vsi_nn_internal_node_t* inode = NULL; - int tensor_count = 1; + int32_t tensor_count = 1; vsi_bool ret = FALSE; + VSI_UNREFERENCED(axis); + va_start(args, tensor); FOREACH_ARGS(args, next, vsi_nn_tensor_t*) @@ -893,8 +984,10 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_concat_impl attr.vtl = use_virtual_tensor; attr.is_const = FALSE; tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO(tmp_tensor, "Create internal tensor failed", final); inode = vsi_nn_internal_new_node(self, VSI_NN_OP_CONCAT, tensor_count, 1); + CHECK_PTR_FAIL_GOTO(inode, "Create internal node failed", final); inode->inputs[0] = tensor; tensor_count = 0; va_start(args, tensor); @@ -912,6 +1005,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_concat_impl tmp_tensor = NULL; } +final: return tmp_tensor; } @@ -938,9 +1032,11 @@ vsi_nn_internal_tensor_t** vsi_nn_create_split } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, slices_num ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal 
node failed", final); if(!slices) { slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, slices_num * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(slices, curr, "Create internal buffer failed", final); num_per_output = (uint32_t)(tensor->attr.size[axis] / slices_num); for( i = 0; i < slices_num; i++ ) { @@ -949,6 +1045,7 @@ vsi_nn_internal_tensor_t** vsi_nn_create_split } output_tensors = (vsi_nn_internal_tensor_t**)vsi_nn_internal_new_node_param(curr, slices_num * sizeof(vsi_nn_internal_tensor_t*)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(output_tensors, curr, "Create internal buffer failed", final); curr->node->nn_param.split.axis = axis; curr->node->nn_param.split.slices_num = slices_num; curr->node->nn_param.split.slices = slices; @@ -959,10 +1056,12 @@ vsi_nn_internal_tensor_t** vsi_nn_create_split for( i = 0; i < slices_num; i++ ) { output_tensors[i] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(output_tensors[i], curr, "Create internal tensor failed", final); curr->outputs[i] = output_tensors[i]->t; } vsi_nn_internal_setup_node( self, curr ); +final: return output_tensors; } @@ -982,7 +1081,9 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape vsi_bool ret = FALSE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(vsi_size_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_in_size, curr, "Create internal buffer failed", final); memcpy(reshape_in_size, size, dim_num * sizeof(vsi_size_t)); curr->node->nn_param.reshape2.size = reshape_in_size; curr->node->nn_param.reshape2.dim_num = (uint32_t)dim_num; @@ -999,6 +1100,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &input_tensor->attr.dtype, use_virtual_tensor); tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(tensor0, curr, "Create internal tensor failed", final); curr->outputs[0] = tensor0->t; } ret = vsi_nn_internal_setup_node(self, curr); @@ -1007,7 +1109,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape tensor0 = NULL; } - +final: return tensor0; } @@ -1027,8 +1129,10 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute vsi_bool ret = FALSE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, curr, "Create internal buffer failed", final); for (i = 0; i < dim_num; i++) { @@ -1047,6 +1151,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute vsi_nn_tensor_attr_t attr; vsi_nn_internal_init_tensor_attr(&attr, &input_tensor->attr.dtype, use_virtual_tensor); tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(tensor0, curr, "Create internal tensor failed", final); curr->outputs[0] = tensor0->t; } ret = vsi_nn_internal_setup_node(self, curr); @@ -1055,6 +1160,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute tensor0 = NULL; } +final: return tensor0; } @@ -1072,6 +1178,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_copy vsi_bool ret = FALSE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", 
final); curr->inputs[0] = input_tensor; if(!dtype) { @@ -1087,6 +1194,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_copy vsi_nn_tensor_attr_t attr; vsi_nn_internal_init_tensor_attr(&attr, dtype, use_virtual_tensor); tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(tensor0, curr, "Create internal tensor failed", final); curr->outputs[0] = tensor0->t; } ret = vsi_nn_internal_setup_node(self, curr); @@ -1095,5 +1203,6 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_copy tensor0 = NULL; } +final: return tensor0; } \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 0710a624e..5f7cb47c5 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -40,6 +40,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_dtype_util_prv.h" #include "utils/vsi_nn_tensor_op.h" +#include "vsi_nn_error.h" static vsi_bool _try_set_const_tensor ( @@ -119,6 +120,8 @@ static void print_tensor ext_attr[count] = 0; break; case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: count = snprintf( &ext_attr[0], _EXT_ATTR_BUF_SZ, "ASM zp=%3d, scale=%.6f", tensor->attr.dtype.zero_point, tensor->attr.dtype.scale ); @@ -126,6 +129,7 @@ static void print_tensor break; #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: count = snprintf( &ext_attr[0], _EXT_ATTR_BUF_SZ, "SYM PERCHANNEL axis=%d, count=%d", tensor->attr.dtype.channel_dim, tensor->attr.dtype.scale_dim ); @@ -258,15 +262,15 @@ static vsi_bool _auto_cal_shape vsi_size_t * dim_num ) { - vsi_bool ret; + vsi_bool ret; vsi_ssize_t neg_idx; - vsi_size_t i; - vsi_size_t total_size; + vsi_size_t i = 0; + vsi_size_t total_size = 1; ret = TRUE; neg_idx = -1; total_size = vsi_nn_ShapeProduct( input_shape, input_dim ); - if (-1 == *dim_num) + if ((vsi_size_t)-1 == *dim_num) { *dim_num = 1; shape[0] = total_size; @@ -283,7 +287,7 @@ { VSILOGE( "Wrong shape '%"VSI_SSIZE_T_SPECIFIER"' ", (vsi_ssize_t)shape[i] ); ret = FALSE; - break; + goto final; } shape[i] = input_shape[i]; } @@ -297,17 +301,16 @@ { VSILOGE( "Wrong shape '%"VSI_SSIZE_T_SPECIFIER"' ", (vsi_ssize_t)shape[i] ); ret = FALSE; - break; + goto final; } } - if( FALSE == ret ) - { - shape[neg_idx] = -1; - } - else if(neg_idx != -1) + + if (-1 != neg_idx) { - shape[neg_idx] = (uint32_t)total_size; + shape[neg_idx] = (vsi_size_t)total_size; } + +final: return ret; } /* _auto_cal_shape() */ @@ -328,15 +331,21 @@ static vsi_bool _init_tensor size_t i = 0; ret = TRUE; + if (tensor->attr.dim_num > VSI_NN_MAX_DIM_NUM) + { + VSILOGE( "tensor rank greater than %d.", VSI_NN_MAX_DIM_NUM ); + return FALSE; + } + memset( &params, 0, sizeof( vx_tensor_create_params_t ) ); params.num_of_dims = tensor->attr.dim_num; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + for(i = 0; i < tensor->attr.dim_num; i++) { - size_vxsize[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; + size_vxsize[i] = (vsi_size_t)-1 == tensor->attr.size[i] ? (vx_size)-1 : (vx_size)tensor->attr.size[i]; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + for(i = 0; i < tensor->attr.dim_num; i++) { - size_u32[i] = -1 == tensor->attr.size[i] ? -1 : (vx_uint32)tensor->attr.size[i]; + size_u32[i] = (vsi_size_t)-1 == tensor->attr.size[i] ?
(vx_uint32)-1 : (vx_uint32)tensor->attr.size[i]; } #ifdef VSI_40BIT_VA_SUPPORT params.sizes = size_vxsize; @@ -354,11 +363,13 @@ static vsi_bool _init_tensor break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE; params.quant_data.affine.scale = tensor->attr.dtype.scale; params.quant_data.affine.zeroPoint = (int32_t)tensor->attr.dtype.zero_point; break; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT #ifdef VX_QUANT_AFFINE_SCALE_PER_CHANNEL params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_CHANNEL; @@ -367,6 +378,7 @@ static vsi_bool _init_tensor #endif // This is a hack that driver doesn't support const scales scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); params.quant_data.affinePerChannel.channelDim = tensor->attr.dtype.channel_dim; params.quant_data.affinePerChannel.scaleCount = tensor->attr.dtype.scale_dim; @@ -378,6 +390,7 @@ static vsi_bool _init_tensor // it's symmetric quantized tensor. Fake a zp information filled with zero to meet low-level's // requirement null_zp = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( null_zp, "Create buffer fail.", final ); memset(null_zp, 0, sizeof(int32_t) * tensor->attr.dtype.scale_dim); params.quant_data.affinePerChannel.zeroPoint = null_zp; params.quant_data.affinePerChannel.zeroPointCount= tensor->attr.dtype.scale_dim; @@ -395,10 +408,12 @@ static vsi_bool _init_tensor #endif // This is a hack that driver doesn't support const scales scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim); + CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final ); memcpy(zeroPoints, tensor->attr.dtype.zero_points, tensor->attr.dtype.zero_points_dim * sizeof(int32_t)); @@ -472,14 +487,17 @@ static vsi_bool _init_tensor vx_size stride_size_vxsize[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(tensor->attr.size); i++) { - size_vxsize2[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; + size_vxsize2[i] = (vsi_size_t)-1 == tensor->attr.size[i] ? \ + (vx_size)-1 : (vx_size)tensor->attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { - stride_size_vxsize[i] = -1 == stride_size[i] ? -1 : (vx_size)stride_size[i]; + stride_size_vxsize[i] = (vsi_size_t)-1 == stride_size[i] ? \ + (vx_size)-1 : (vx_size)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, size_vxsize2, stride_size_vxsize, (vx_size)tensor->attr.dim_num); + CHECK_PTR_FAIL_GOTO( addr, "Create tensor address fail.", final ); } #else { @@ -487,14 +505,17 @@ static vsi_bool _init_tensor uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(tensor->attr.size); i++) { - size_32bit[i] = -1 == tensor->attr.size[i] ? -1 : (uint32_t)tensor->attr.size[i]; + size_32bit[i] = (vsi_size_t)-1 == tensor->attr.size[i] ? 
\ + (uint32_t)-1 : (uint32_t)tensor->attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { - stride_size_32bit[i] = -1 == stride_size[i] ? -1 : (uint32_t)stride_size[i]; + stride_size_32bit[i] = (vsi_size_t)-1 == stride_size[i] ? \ + (uint32_t)-1 : (uint32_t)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, size_32bit, stride_size_32bit, (uint8_t)tensor->attr.dim_num); + CHECK_PTR_FAIL_GOTO( addr, "Create tensor address fail.", final ); } #endif #ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL @@ -970,6 +991,9 @@ vsi_size_t vsi_nn_CopyTensorToBuffer vsi_size_t sz; vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM]; vsi_status status; + + VSI_UNREFERENCED(graph); + if( NULL == tensor || NULL == buffer ) { return 0; @@ -977,7 +1001,7 @@ vsi_size_t vsi_nn_CopyTensorToBuffer sz = 0; status = VSI_FAILURE; - status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, buffer, VX_READ_ONLY); + status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, buffer, VX_READ_ONLY, NULL, NULL); if(VSI_SUCCESS == status) { sz = vsi_nn_GetStrideSize( &tensor->attr, stride_size ); @@ -996,7 +1020,7 @@ float * vsi_nn_ConvertTensorToFloat32Data vsi_size_t elements; vsi_size_t i; vsi_size_t stride; - float *data; + float *data = NULL; if(NULL == graph || NULL == tensor) { @@ -1008,7 +1032,7 @@ float * vsi_nn_ConvertTensorToFloat32Data data = NULL; data = (float *)malloc(elements * sizeof(float)); - + CHECK_PTR_FAIL_GOTO( data, "Create buffer fail.", final ); if( tensor->attr.is_created_from_handle ) { #ifdef VSI_INVALIDATE_HANDLE_SUPPORT @@ -1031,7 +1055,14 @@ float * vsi_nn_ConvertTensorToFloat32Data else { tensor_data = vsi_nn_ConvertTensorToData(graph, tensor); + if ( tensor_data == NULL ) + { + VSILOGE("tensor_data is NULL."); + vsi_nn_safe_free(data); + return NULL; + } } + for(i = 0; i < elements; i++) { status = dtype_to_float32(&tensor_data[stride * i], &data[i], &tensor->attr.dtype); @@ -1043,6 +1074,7 @@ float * vsi_nn_ConvertTensorToFloat32Data } } +final: if( !tensor->attr.is_created_from_handle ) { vsi_nn_safe_free( tensor_data ); @@ -1061,6 +1093,9 @@ uint8_t * vsi_nn_ConvertTensorToData vsi_size_t buf_sz; vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM]; vsi_status status; + + VSI_UNREFERENCED(graph); + if( NULL == tensor ) { return NULL; @@ -1074,6 +1109,12 @@ uint8_t * vsi_nn_ConvertTensorToData if( buf_sz > 0 ) { data = (uint8_t *)malloc( buf_sz ); + if (data == NULL) + { + VSILOGE("Create buffer fail"); + + return NULL; + } } if( data && tensor->attr.is_created_from_handle ) { @@ -1100,13 +1141,14 @@ uint8_t * vsi_nn_ConvertTensorToData { if( NULL != data ) { - status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, data, VX_READ_ONLY); + status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, data, VX_READ_ONLY, NULL, NULL); } if(VSI_SUCCESS != status) { VSILOGE("Read tensor data fail"); free(data); data = NULL; + return NULL; } } if(tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT4 || @@ -1114,12 +1156,16 @@ uint8_t * vsi_nn_ConvertTensorToData { vsi_size_t dest_size = vsi_nn_GetElementNum(tensor); new_data = (uint8_t*)malloc(dest_size); - status = vsi_nn_Unpack4bitData(tensor, data, new_data, tensor->attr.dtype.vx_type); - if(data) + if (new_data == NULL) { - free(data); - data = NULL; + VSILOGE("Create buffer fail"); + vsi_nn_safe_free(data); + + return NULL; } + + status = vsi_nn_Unpack4bitData(tensor, data, new_data, tensor->attr.dtype.vx_type); + vsi_nn_safe_free(data); return new_data; } else @@ -1149,6 +1195,9 @@ uint8_t * vsi_nn_ConvertRawTensorToData vsi_size_t buf_sz; 
vsi_status status; vsi_nn_tensor_attr_t attr; + + VSI_UNREFERENCED(addr); + if( NULL == tensor || NULL == context ) { return NULL; @@ -1175,7 +1224,7 @@ uint8_t * vsi_nn_ConvertRawTensorToData { return data; } - status = vsi_nn_copy_tensor_patch(tensor, &attr, data, VX_READ_ONLY); + status = vsi_nn_copy_tensor_patch(tensor, &attr, data, VX_READ_ONLY, NULL, NULL); if( VSI_SUCCESS != status ) { VSILOGE("Read tensor data fail"); @@ -1205,6 +1254,8 @@ uint8_t * vsi_nn_ConvertRawTensorToData2 vsi_size_t buf_sz; vsi_status status; + VSI_UNREFERENCED(addr); + if( NULL == tensor || NULL == context ) { return NULL; @@ -1229,6 +1280,7 @@ uint8_t * vsi_nn_ConvertRawTensorToData2 break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: status = vxQueryTensor(tensor, VX_TENSOR_ZERO_POINT, &(attr->dtype.zero_point), sizeof(int32_t)); status = vxQueryTensor(tensor, VX_TENSOR_SCALE, @@ -1250,7 +1302,7 @@ uint8_t * vsi_nn_ConvertRawTensorToData2 { return data; } - status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_READ_ONLY); + status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_READ_ONLY, NULL, NULL); if( VSI_SUCCESS != status ) { VSILOGE("Read tensor data fail"); @@ -1407,7 +1459,8 @@ void vsi_nn_SaveDataToText write_data = vsi_nn_DataAsFloat32( &data[stride * i], type ); if( type == VSI_NN_TYPE_UINT8 || type == VSI_NN_TYPE_INT8 || - type == VSI_NN_TYPE_UINT4 || type == VSI_NN_TYPE_INT4 ) + type == VSI_NN_TYPE_UINT4 || type == VSI_NN_TYPE_INT4 || + type == VSI_NN_TYPE_FLOAT8_E4M3 || type == VSI_NN_TYPE_FLOAT8_E5M2 ) { count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%d%s", (int32_t)write_data, seperator ); @@ -1549,6 +1602,10 @@ vsi_status vsi_nn_CopyDataToTensor ) { vsi_status status = VSI_FAILURE; + uint8_t* new_data = NULL; + + VSI_UNREFERENCED(graph); + if( NULL == data || NULL == tensor ) { return status; @@ -1581,24 +1638,22 @@ vsi_status vsi_nn_CopyDataToTensor if( tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT4 || tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT4 ) { - uint8_t* new_data = NULL; vsi_size_t dest_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type); new_data = (uint8_t*)malloc( dest_size ); + CHECK_PTR_FAIL_GOTO( new_data, "Create buffer fail.", final ); status = vsi_nn_Pack4bitData(tensor, (uint8_t*)data, new_data); - status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, new_data, VX_WRITE_ONLY ); - if( new_data ) - { - free( new_data ); - new_data = NULL; - } + status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, new_data, VX_WRITE_ONLY, NULL, NULL ); } else { - status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, data, VX_WRITE_ONLY ); + status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, data, VX_WRITE_ONLY, NULL, NULL ); } } +final: + vsi_nn_safe_free(new_data); + return status; } /* vsi_nn_CopyDataToTensor() */ @@ -1780,6 +1835,12 @@ vsi_nn_tensor_t *vsi_nn_reshape_tensor { return NULL; } + + if (dim_num > VSI_NN_MAX_DIM_NUM) + { + VSILOGE( "tensor rank greater than %d.", VSI_NN_MAX_DIM_NUM ); + return NULL; + } /* New a ovxlib tensor struct */ memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); memcpy(&attr, &input->attr, sizeof(vsi_nn_tensor_attr_t)); @@ -1813,6 +1874,13 @@ vsi_bool vsi_nn_ReshapeTensor { vsi_bool ret; vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = {0}; + + if (dim_num > VSI_NN_MAX_DIM_NUM) + { + VSILOGE( "tensor rank greater than %d.", VSI_NN_MAX_DIM_NUM ); + return FALSE; + } + 
memcpy(new_shape, shape, sizeof(vsi_size_t) * dim_num); ret = TRUE; @@ -1913,6 +1981,12 @@ vx_tensor vsi_nn_safe_reshape_tensor vsi_size_t size_of_shape_element ) { + if (sizes > VSI_NN_MAX_DIM_NUM) + { + VSILOGE( "tensor rank greater than %d.", VSI_NN_MAX_DIM_NUM ); + return NULL; + } + if(sizeof(vx_size) == size_of_shape_element) { vx_size* num_of_dims_vxsize = (vx_size*)num_of_dims; @@ -1924,7 +1998,8 @@ vx_tensor vsi_nn_safe_reshape_tensor vsi_size_t i = 0; for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - new_shape_int32[i] = -1 == num_of_dims_vxsize[i] ? -1 : (int32_t)num_of_dims_vxsize[i]; + new_shape_int32[i] = (vx_size)-1 == num_of_dims_vxsize[i] ? \ + (int32_t)-1 : (int32_t)num_of_dims_vxsize[i]; } return vxReshapeTensor( tensor, new_shape_int32, (uint32_t)sizes ); } @@ -1939,7 +2014,7 @@ vx_tensor vsi_nn_safe_reshape_tensor vsi_size_t i = 0; for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - new_shape_vxsize[i] = -1 == num_of_dims_int32[i] ? -1 : (vx_size)num_of_dims_int32[i]; + new_shape_vxsize[i] = -1 == num_of_dims_int32[i] ? (vx_size)-1 : (vx_size)num_of_dims_int32[i]; } return vxReshapeTensor( tensor, new_shape_vxsize, (vx_size)sizes ); } @@ -1970,7 +2045,7 @@ void vsi_nn_PermuteTensor uint32_t i; vsi_status status; - if( NULL == tensor || NULL == perm || 0 == dim_num ) + if( NULL == tensor || NULL == perm || 0 == dim_num || dim_num > VSI_NN_MAX_DIM_NUM ) { VSILOGE( "Wrong perm parameters." ); return; @@ -2231,8 +2306,10 @@ void vsi_nn_ReleaseTensorRelevance ) { uint32_t i; - if(NULL == tensor_ref || NULL == graph) + if (NULL == tensor_ref || NULL == graph) { + vsi_nn_safe_free(tensor_ref); + return ; } @@ -2250,11 +2327,7 @@ void vsi_nn_ReleaseTensorRelevance } } - if(tensor_ref) - { - free(tensor_ref); - tensor_ref = NULL; - } + vsi_nn_safe_free(tensor_ref); } /* vsi_nn_ReleaseTensorRelevance() */ vsi_nn_tensor_rel_t *vsi_nn_CreateTensorRelevance @@ -2286,6 +2359,11 @@ vsi_nn_tensor_rel_t *vsi_nn_CreateTensorRelevance for(j = 0; j < graph->node_num; j++) { node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)j ); + if (node == NULL) + { + continue; + } + for(k = 0; k < node->output.num; k++) { if(node->output.tensors[k] == i) @@ -2423,6 +2501,7 @@ vsi_status vsi_nn_vxGetTensorAttr break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: status = vxQueryTensor(tensor, VX_TENSOR_ZERO_POINT, &(attr->dtype.zero_point), sizeof(int32_t)); TEST_CHECK_STATUS( status, final ); @@ -2468,7 +2547,7 @@ uint8_t *vsi_nn_vxCopyTensorToData } } - status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_READ_ONLY); + status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_READ_ONLY, NULL, NULL); if(VSI_SUCCESS != status) { VSILOGE("Copy tensor to data fail"); @@ -2498,7 +2577,7 @@ vsi_status vsi_nn_vxCopyDataToTensor memset(stride_size, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); vsi_nn_GetStrideSize(attr, stride_size); - status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_WRITE_ONLY); + status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_WRITE_ONLY, NULL, NULL); if(VSI_SUCCESS != status) { VSILOGE("Copy data to tensor fail"); @@ -2619,24 +2698,39 @@ vsi_status vsi_nn_copy_tensor_patch vx_tensor tensor, vsi_nn_tensor_attr_t *attr, void * user_ptr, - vsi_enum usage + vsi_enum usage, + vsi_size_t* start, + vsi_size_t* end ) { - vsi_size_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM]; + vsi_size_t tmp_start[VSI_NN_MAX_DIM_NUM],tmp_end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM]; vsi_status 
status = VSI_FAILURE; - uint32_t i; + if(NULL == tensor || NULL == user_ptr) { VSILOGE("Invalid parameter"); return status; } vsi_nn_GetStrideSize(attr, stride); - memset(start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + if (NULL == start) + { + memset(tmp_start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + } + else + { + memcpy(tmp_start, start, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + } + + if (NULL == end) { - end[i] = attr->size[i]; + memcpy(tmp_end, attr->size, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); } - status = vsi_nn_copy_tensor_veiw_patch(tensor, attr, user_ptr, start, end, stride, usage, 0); + else + { + memcpy(tmp_end, end, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + } + + status = vsi_nn_copy_tensor_veiw_patch(tensor, attr, user_ptr, tmp_start, tmp_end, stride, usage, 0); return status; } /* vsi_nn_copy_tensor_patch() */ @@ -2673,7 +2767,9 @@ void vsi_nn_reshuffle_weight_data int32_t item_size = vsi_nn_TypeGetBytes(weights->attr.dtype.vx_type); weight_data = vsi_nn_ConvertTensorToData(graph, weights); + CHECK_PTR_FAIL_GOTO( weight_data, "Create weight_data fail.", final ); buffer = (uint8_t*)malloc(item_size * slice_size * weight_size_c * weight_size_b); + CHECK_PTR_FAIL_GOTO( buffer, "Create buffer fail.", final ); memset(buffer, 0x00, item_size * slice_size * weight_size_c * weight_size_b); memcpy(buffer, weight_data, item_size * slice_size * weight_size_c * weight_size_b); #if 0 // transpose whnc to whcn if need @@ -2717,6 +2813,8 @@ } } vsi_nn_CopyDataToTensor( graph, weights, weight_data ); + +final: vsi_nn_Free( buffer ); vsi_nn_safe_free( weight_data ); } @@ -2806,6 +2904,7 @@ vsi_status vsi_nn_SwapHandle ) { vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(is_new_ptr_malloc_by_ovxlib); if (!tensor) { return VSI_FAILURE; } @@ -3021,15 +3120,17 @@ static vsi_bool _init_dummy_tensor size_t i = 0; ret = TRUE; + VSI_UNREFERENCED(graph); + memset( &params, 0, sizeof( vx_tensor_create_params_t ) ); params.num_of_dims = tensor->attr.dim_num; for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_vxsize[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; + size_vxsize[i] = (vsi_size_t)-1 == tensor->attr.size[i] ? (vx_size)-1 : (vx_size)tensor->attr.size[i]; } for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - size_u32[i] = -1 == tensor->attr.size[i] ? -1 : (vx_uint32)tensor->attr.size[i]; + size_u32[i] = (vsi_size_t)-1 == tensor->attr.size[i] ?
(vx_uint32)-1 : (vx_uint32)tensor->attr.size[i]; } #ifdef VSI_40BIT_VA_SUPPORT params.sizes = size_vxsize; @@ -3047,11 +3148,13 @@ static vsi_bool _init_dummy_tensor break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + case VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8: params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE; params.quant_data.affine.scale = tensor->attr.dtype.scale; params.quant_data.affine.zeroPoint = (int32_t)tensor->attr.dtype.zero_point; break; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8: #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT #ifdef VX_QUANT_AFFINE_SCALE_PER_CHANNEL params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_CHANNEL; @@ -3060,6 +3163,7 @@ static vsi_bool _init_dummy_tensor #endif // This is a hack that driver doesn't support const scales scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); params.quant_data.affinePerChannel.channelDim = tensor->attr.dtype.channel_dim; params.quant_data.affinePerChannel.scaleCount = tensor->attr.dtype.scale_dim; @@ -3071,6 +3175,7 @@ static vsi_bool _init_dummy_tensor // it's symmetric quantized tensor. Fake a zp information filled with zero to meet low-level's // requirement null_zp = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.scale_dim); + CHECK_PTR_FAIL_GOTO( null_zp, "Create buffer fail.", final ); memset(null_zp, 0, sizeof(int32_t) * tensor->attr.dtype.scale_dim); params.quant_data.affinePerChannel.zeroPoint = null_zp; params.quant_data.affinePerChannel.zeroPointCount= tensor->attr.dtype.scale_dim; @@ -3092,6 +3197,7 @@ static vsi_bool _init_dummy_tensor tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim); + CHECK_PTR_FAIL_GOTO( zeroPoints, "Create buffer fail.", final ); memcpy(zeroPoints, tensor->attr.dtype.zero_points, tensor->attr.dtype.zero_points_dim * sizeof(int32_t)); diff --git a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h index d46138fa2..1937569fc 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h @@ -97,6 +97,16 @@ vsi_bool vsi_nn_is_stream_process_supported_types size_t input_num ); +vsi_bool vsi_nn_is_same_data_type( + vsi_nn_tensor_t * src, + vsi_nn_tensor_t * dst + ); + +vsi_bool vsi_nn_is_same_quant_type( + vsi_nn_tensor_t * src, + vsi_nn_tensor_t * dst + ); + #ifdef __cplusplus } #endif
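Note on the reworked vsi_nn_copy_tensor_patch above: the two new start/end parameters bound the copied region per dimension, and every call site touched by this patch passes NULL for both, which keeps the previous whole-tensor behaviour (start defaults to all zeros, end defaults to attr->size). Below is a minimal usage sketch, assuming the caller already holds a vx_tensor together with its vsi_nn_tensor_attr_t and the ovxlib tensor headers are included; the helper name and the choice of clipped dimension are illustrative, not part of this change.

/* Hypothetical helper (illustration only): copy just the first `rows` entries
 * of dimension 1 into caller-provided memory via the new start/end window. */
static vsi_status copy_first_rows
    (
    vx_tensor tensor,
    vsi_nn_tensor_attr_t * attr,
    void * buffer,
    vsi_size_t rows
    )
{
    vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 };
    vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 };
    vsi_size_t i = 0;

    /* Start at the origin and cover the full extent of every dimension,
     * mirroring what the NULL/NULL default path does internally. */
    for( i = 0; i < VSI_NN_MAX_DIM_NUM; i++ )
    {
        end[i] = attr->size[i];
    }
    /* Clip the second dimension to the requested number of rows. */
    if( attr->dim_num > 1 && rows < end[1] )
    {
        end[1] = rows;
    }

    return vsi_nn_copy_tensor_patch( tensor, attr, buffer, VX_READ_ONLY, start, end );
}

Because the implementation copies VSI_NN_MAX_DIM_NUM entries from both arrays before handing them to vsi_nn_copy_tensor_veiw_patch, the start and end buffers must be VSI_NN_MAX_DIM_NUM long even when the tensor has fewer dimensions.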