Initial commit

2025-06-09 18:06:36 +02:00 · 2025-06-09 18:06:36 +02:00 · ce3dd83b9f
commit ce3dd83b9f
1470 changed files with 1054449 additions and 0 deletions
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/CMakeLists.txt
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/CMakeLists.txt
@ -0,0 +1,41 @@
+cmake_minimum_required (VERSION 3.14)
+
+project(CMSISDSPSVM)
+
+include(configLib)
+include(configDsp)
+
+
+add_library(CMSISDSPSVM STATIC)
+
+target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_init_f32.c) 
+target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_init_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_predict_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_predict_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_init_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_init_f32.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_predict_f32.c)  
+target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_predict_f32.c)
+
+
+
+configLib(CMSISDSPSVM ${ROOT})
+configDsp(CMSISDSPSVM ${ROOT})
+
+### Includes
+target_include_directories(CMSISDSPSVM PUBLIC "${DSP}/Include")
+
+if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
+target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_init_f16.c) 
+target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_init_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_linear_predict_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_rbf_predict_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_init_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_init_f16.c)
+target_sources(CMSISDSPSVM PRIVATE arm_svm_polynomial_predict_f16.c)  
+target_sources(CMSISDSPSVM PRIVATE arm_svm_sigmoid_predict_f16.c)
+endif()
+
+
+
+
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c
@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        BayesFunctions.c
+ * Description:  Combination of all SVM function source files.
+ *
+ * $Date:        16. March 2020
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_svm_linear_init_f32.c"
+#include "arm_svm_linear_predict_f32.c"
+#include "arm_svm_polynomial_init_f32.c"
+#include "arm_svm_polynomial_predict_f32.c"
+#include "arm_svm_rbf_init_f32.c"
+#include "arm_svm_rbf_predict_f32.c"
+#include "arm_svm_sigmoid_init_f32.c"
+#include "arm_svm_sigmoid_predict_f32.c"
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c
@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        BayesFunctions.c
+ * Description:  Combination of all SVM function source files.
+ *
+ * $Date:        16. March 2020
+ * $Revision:    V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_svm_linear_init_f16.c"
+#include "arm_svm_linear_predict_f16.c"
+#include "arm_svm_polynomial_init_f16.c"
+#include "arm_svm_polynomial_predict_f16.c"
+#include "arm_svm_rbf_init_f16.c"
+#include "arm_svm_rbf_predict_f16.c"
+#include "arm_svm_sigmoid_init_f16.c"
+#include "arm_svm_sigmoid_predict_f16.c"
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c
@ -0,0 +1,98 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_linear_init_f16.c
+ * Description:  SVM Linear Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @defgroup groupSVM SVM Functions
+ *
+ */
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup linearsvm Linear SVM
+
+  Linear SVM classifier
+ */
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief        SVM linear instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in]    S                      Parameters for the SVM function
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @return none.
+ *
+ */
+
+
+void arm_svm_linear_init_f16(arm_svm_linear_instance_f16 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float16_t intercept,
+  const float16_t *dualCoefficients,
+  const float16_t *supportVectors,
+  const int32_t *classes)
+{
+   S->nbOfSupportVectors = nbOfSupportVectors;
+   S->vectorDimension = vectorDimension;
+   S->intercept = intercept;
+   S->dualCoefficients = dualCoefficients;
+   S->supportVectors = supportVectors;
+   S->classes = classes;
+}
+
+
+
+/**
+ * @} end of linearsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c
@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_linear_init_f32.c
+ * Description:  SVM Linear Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @defgroup groupSVM SVM Functions
+ *
+ */
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup linearsvm Linear SVM
+
+  Linear SVM classifier
+ */
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief        SVM linear instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in]    S                      Parameters for the SVM function
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @return none.
+ *
+ */
+
+
+void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t *classes)
+{
+   S->nbOfSupportVectors = nbOfSupportVectors;
+   S->vectorDimension = vectorDimension;
+   S->intercept = intercept;
+   S->dualCoefficients = dualCoefficients;
+   S->supportVectors = supportVectors;
+   S->classes = classes;
+}
+
+
+
+/**
+ * @} end of linearsvm group
+ */
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c
@ -0,0 +1,314 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_linear_predict_f16.c
+ * Description:  SVM Linear Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM linear prediction
+ * @param[in]    S          Pointer to an instance of the linear SVM structure.
+ * @param[in]    in         Pointer to input vector
+ * @param[out]   pResult    Decision value
+ * @return none.
+ *
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_svm_linear_predict_f16(
+    const arm_svm_linear_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+        /* inlined Matrix x Vector function interleaved with dot prod */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float16_t *pSupport = S->supportVectors;
+    const float16_t *pSrcA = pSupport;
+    const float16_t *pInA0;
+    const float16_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float16_t *pDualCoef = S->dualCoefficients;
+    _Float16       sum = S->intercept;
+    row = numRows;
+
+    /*
+     * compute 4 rows in parrallel
+     */
+    while (row >= 4) 
+    {
+        const float16_t *pInA2, *pInA3;
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1, acc2, acc3;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        acc2 = vdupq_n_f16(0.0f);
+        acc3 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 8;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 8;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        acc0 = vmulq_n_f16(acc0,*pDualCoef++);
+        acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
+        acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
+        acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);
+
+        sum += (_Float16)vecAddAcrossF16Mve(acc0);
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parallel
+     */
+    if (row >= 2) {
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        acc0 = vmulq_n_f16(acc0,*pDualCoef++);
+        acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
+
+        sum += (_Float16)vecAddAcrossF16Mve(acc0);
+
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f16x8_t         vecIn, acc0;
+        float16_t const *pSrcA0Vec, *pInVec;
+        float16_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);
+
+    }
+
+    *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_linear_predict_f16(
+    const arm_svm_linear_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+    _Float16 sum=S->intercept;
+    _Float16 dot=0;
+    uint32_t i,j;
+    const float16_t *pSupport = S->supportVectors;
+
+    for(i=0; i < S->nbOfSupportVectors; i++)
+    {
+        dot=0;
+        for(j=0; j < S->vectorDimension; j++)
+        {
+            dot = (_Float16)dot + (_Float16)in[j]* (_Float16)*pSupport++;
+        }
+        sum += (_Float16)S->dualCoefficients[i] * (_Float16)dot;
+    }
+    *pResult=S->classes[STEP(sum)];
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of linearsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c
@ -0,0 +1,461 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_linear_predict_f32.c
+ * Description:  SVM Linear Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM linear prediction
+ * @param[in]    S          Pointer to an instance of the linear SVM structure.
+ * @param[in]    in         Pointer to input vector
+ * @param[out]   pResult    Decision value
+ * @return none.
+ *
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_svm_linear_predict_f32(
+    const arm_svm_linear_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+        /* inlined Matrix x Vector function interleaved with dot prod */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float32_t *pSupport = S->supportVectors;
+    const float32_t *pSrcA = pSupport;
+    const float32_t *pInA0;
+    const float32_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float32_t *pDualCoef = S->dualCoefficients;
+    float32_t       sum = S->intercept;
+    row = numRows;
+
+    /*
+     * compute 4 rows in parrallel
+     */
+    while (row >= 4) 
+    {
+        const float32_t *pInA2, *pInA3;
+        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f32x4_t         vecIn, acc0, acc1, acc2, acc3;
+        float32_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+        acc1 = vdupq_n_f32(0.0f);
+        acc2 = vdupq_n_f32(0.0f);
+        acc3 = vdupq_n_f32(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 4;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 4;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+
+        acc0 = vmulq_n_f32(acc0,*pDualCoef++);
+        acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
+        acc0 = vfmaq_n_f32(acc0,acc2,*pDualCoef++);
+        acc0 = vfmaq_n_f32(acc0,acc3,*pDualCoef++);
+
+        sum += vecAddAcrossF32Mve(acc0);
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parallel
+     */
+    if (row >= 2) {
+        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f32x4_t         vecIn, acc0, acc1;
+        float32_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+        acc1 = vdupq_n_f32(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        acc0 = vmulq_n_f32(acc0,*pDualCoef++);
+        acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
+
+        sum += vecAddAcrossF32Mve(acc0);
+
+
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f32x4_t         vecIn, acc0;
+        float32_t const *pSrcA0Vec, *pInVec;
+        float32_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
+
+    }
+
+    *pResult = S->classes[STEP(sum)];
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+void arm_svm_linear_predict_f32(
+    const arm_svm_linear_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+    float32_t sum = S->intercept;
+   
+    float32_t dot;
+    float32x4_t dotV; 
+
+    float32x4_t accuma,accumb,accumc,accumd,accum;
+    float32x2_t accum2;
+    float32x4_t vec1;
+
+    float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+    uint32_t blkCnt;   
+    uint32_t vectorBlkCnt;   
+
+    const float32_t *pIn = in;
+
+    const float32_t *pSupport = S->supportVectors;
+
+    const float32_t *pSupporta = S->supportVectors;
+    const float32_t *pSupportb;
+    const float32_t *pSupportc;
+    const float32_t *pSupportd;
+
+    pSupportb = pSupporta + S->vectorDimension;
+    pSupportc = pSupportb + S->vectorDimension;
+    pSupportd = pSupportc + S->vectorDimension;
+
+    const float32_t *pDualCoefs = S->dualCoefficients;
+
+    vectorBlkCnt = S->nbOfSupportVectors >> 2;
+
+    while (vectorBlkCnt > 0U)
+    {
+        accuma = vdupq_n_f32(0);
+        accumb = vdupq_n_f32(0);
+        accumc = vdupq_n_f32(0);
+        accumd = vdupq_n_f32(0);
+
+        pIn = in;
+
+        blkCnt = S->vectorDimension >> 2;
+        while (blkCnt > 0U)
+        {
+        
+            vec1 = vld1q_f32(pIn);
+            vec2a = vld1q_f32(pSupporta);
+            vec2b = vld1q_f32(pSupportb);
+            vec2c = vld1q_f32(pSupportc);
+            vec2d = vld1q_f32(pSupportd);
+
+            pIn += 4;
+            pSupporta += 4;
+            pSupportb += 4;
+            pSupportc += 4;
+            pSupportd += 4;
+
+            accuma = vmlaq_f32(accuma, vec1,vec2a);
+            accumb = vmlaq_f32(accumb, vec1,vec2b);
+            accumc = vmlaq_f32(accumc, vec1,vec2c);
+            accumd = vmlaq_f32(accumd, vec1,vec2d);
+
+            blkCnt -- ;
+        }
+        accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+        accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+        accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+        accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+        blkCnt = S->vectorDimension & 3;
+        while (blkCnt > 0U)
+        {
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
+
+            pIn++;
+
+            blkCnt -- ;
+        }
+
+        vec1 = vld1q_f32(pDualCoefs);
+        pDualCoefs += 4; 
+
+        accum = vmulq_f32(vec1,dotV);
+        accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+        pSupporta += 3*S->vectorDimension;
+        pSupportb += 3*S->vectorDimension;
+        pSupportc += 3*S->vectorDimension;
+        pSupportd += 3*S->vectorDimension;
+
+        vectorBlkCnt -- ;
+    }
+
+    pSupport = pSupporta;
+    vectorBlkCnt = S->nbOfSupportVectors & 3;
+    while (vectorBlkCnt > 0U)
+    {
+        accum = vdupq_n_f32(0);
+        dot = 0.0f;
+        pIn = in;
+
+        blkCnt = S->vectorDimension >> 2;
+        while (blkCnt > 0U)
+        {
+        
+            vec1 = vld1q_f32(pIn);
+            vec2 = vld1q_f32(pSupport);
+            pIn += 4;
+            pSupport += 4;
+
+            accum = vmlaq_f32(accum, vec1,vec2);
+
+            blkCnt -- ;
+        }
+        accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+        blkCnt = S->vectorDimension & 3;
+        while (blkCnt > 0U)
+        {
+            dot = dot + *pIn++ * *pSupport++;
+
+            blkCnt -- ;
+        }
+
+        sum += *pDualCoefs++ * dot;
+        vectorBlkCnt -- ;
+    }
+
+    *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_linear_predict_f32(
+    const arm_svm_linear_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+    float32_t sum=S->intercept;
+    float32_t dot=0;
+    uint32_t i,j;
+    const float32_t *pSupport = S->supportVectors;
+
+    for(i=0; i < S->nbOfSupportVectors; i++)
+    {
+        dot=0;
+        for(j=0; j < S->vectorDimension; j++)
+        {
+            dot = dot + in[j]* *pSupport++;
+        }
+        sum += S->dualCoefficients[i] * dot;
+    }
+    *pResult=S->classes[STEP(sum)];
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of linearsvm group
+ */
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c
@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_polynomial_init_f16.c
+ * Description:  SVM Polynomial Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup polysvm Polynomial SVM
+
+  Polynomial SVM classifier
+ */
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief        SVM polynomial instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in]    S                      points to an instance of the polynomial SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @param[in]    degree                 Polynomial degree
+ * @param[in]    coef0                  coeff0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+
+void arm_svm_polynomial_init_f16(arm_svm_polynomial_instance_f16 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float16_t intercept,
+  const float16_t *dualCoefficients,
+  const float16_t *supportVectors,
+  const int32_t *classes,
+  int32_t      degree,
+  float16_t coef0,
+  float16_t gamma
+  )
+{
+   S->nbOfSupportVectors = nbOfSupportVectors;
+   S->vectorDimension = vectorDimension;
+   S->intercept = intercept;
+   S->dualCoefficients = dualCoefficients;
+   S->supportVectors = supportVectors;
+   S->classes = classes;
+   S->degree = degree;
+   S->coef0 = coef0;
+   S->gamma = gamma;
+}
+
+
+
+/**
+ * @} end of polysvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c
@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_polynomial_init_f32.c
+ * Description:  SVM Polynomial Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup polysvm Polynomial SVM
+
+  Polynomial SVM classifier
+ */
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief        SVM polynomial instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in]    S                      points to an instance of the polynomial SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @param[in]    degree                 Polynomial degree
+ * @param[in]    coef0                  coeff0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+
+void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t *classes,
+  int32_t      degree,
+  float32_t coef0,
+  float32_t gamma
+  )
+{
+   S->nbOfSupportVectors = nbOfSupportVectors;
+   S->vectorDimension = vectorDimension;
+   S->intercept = intercept;
+   S->dualCoefficients = dualCoefficients;
+   S->supportVectors = supportVectors;
+   S->classes = classes;
+   S->degree = degree;
+   S->coef0 = coef0;
+   S->gamma = gamma;
+}
+
+
+
+/**
+ * @} end of polysvm group
+ */
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
@ -0,0 +1,369 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_polynomial_predict_f16.c
+ * Description:  SVM Polynomial Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+#if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE)
+
+/*
+
+_Float16 is not supported in g++ so we avoid putting _Float16 definitions
+in the public headers.
+
+This function should at some point be moved in FastMath.
+
+*/
+__STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb)
+{
+    float16_t r = x;
+    nb --;
+    while(nb > 0)
+    {
+        r = (_Float16)r * (_Float16)x;
+        nb--;
+    }
+    return(r);
+}
+#endif
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+/**
+ * @brief SVM polynomial prediction
+ * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
+ * @param[in]    in         Pointer to input vector
+ * @param[out]   pResult    Decision value
+ * @return none.
+ *
+ */
+void arm_svm_polynomial_predict_f16(
+    const arm_svm_polynomial_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+        /* inlined Matrix x Vector function interleaved with dot prod */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float16_t *pSupport = S->supportVectors;
+    const float16_t *pSrcA = pSupport;
+    const float16_t *pInA0;
+    const float16_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float16_t *pDualCoef = S->dualCoefficients;
+    _Float16       sum = S->intercept;
+    f16x8_t         vSum = vdupq_n_f16(0.0f);
+
+    row = numRows;
+
+    /*
+     * compute 4 rows in parrallel
+     */
+    while (row >= 4) {
+        const float16_t *pInA2, *pInA3;
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1, acc2, acc3;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        acc2 = vdupq_n_f16(0.0f);
+        acc3 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 8;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 8;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+
+        vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
+                             arm_vec_exponent_f16
+                             (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), 
+                                S->degree),vctp16q(4));
+        
+        pDualCoef += 4;
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parrallel
+     */
+    if (row >= 2) {
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+
+        vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
+                             arm_vec_exponent_f16
+                             (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree), 
+                             vctp16q(2));
+        
+        pDualCoef += 2;
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f16x8_t         vecIn, acc0;
+        float16_t const *pSrcA0Vec, *pInVec;
+        float16_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
+                             arm_vec_exponent_f16
+                             (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree), 
+                             vctp16q(1));
+    }
+    sum += (_Float16)vecAddAcrossF16Mve(vSum);
+
+    
+    *pResult = S->classes[STEP(sum)];
+}
+
+#else
+
+
+/**
+ * @brief SVM polynomial prediction
+ * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
+ * @param[in]    in         Pointer to input vector
+ * @param[out]   pResult    Decision value
+ * @return none.
+ *
+ */
+void arm_svm_polynomial_predict_f16(
+    const arm_svm_polynomial_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+    _Float16 sum=S->intercept;
+    _Float16 dot=0;
+    uint32_t i,j;
+    const float16_t *pSupport = S->supportVectors;
+
+    for(i=0; i < S->nbOfSupportVectors; i++)
+    {
+        dot=0;
+        for(j=0; j < S->vectorDimension; j++)
+        {
+            dot = (_Float16)dot + (_Float16)in[j]* (_Float16)*pSupport++;
+        }
+        sum += (_Float16)S->dualCoefficients[i] * (_Float16)arm_exponent_f16((_Float16)S->gamma * (_Float16)dot + (_Float16)S->coef0, S->degree);
+    }
+
+    *pResult=S->classes[STEP(sum)];
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of polysvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c
@ -0,0 +1,490 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_polynomial_predict_f32.c
+ * Description:  SVM Polynomial Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include "arm_vec_math.h"
+#endif
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM polynomial prediction
+ * @param[in]    S          Pointer to an instance of the polynomial SVM structure.
+ * @param[in]    in         Pointer to input vector
+ * @param[out]   pResult    Decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+void arm_svm_polynomial_predict_f32(
+    const arm_svm_polynomial_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+        /* inlined Matrix x Vector function interleaved with dot prod */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float32_t *pSupport = S->supportVectors;
+    const float32_t *pSrcA = pSupport;
+    const float32_t *pInA0;
+    const float32_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float32_t *pDualCoef = S->dualCoefficients;
+    float32_t       sum = S->intercept;
+    f32x4_t         vSum = vdupq_n_f32(0.0f);
+
+    row = numRows;
+
+    /*
+     * compute 4 rows in parrallel
+     */
+    while (row >= 4) {
+        const float32_t *pInA2, *pInA3;
+        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f32x4_t         vecIn, acc0, acc1, acc2, acc3;
+        float32_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+        acc1 = vdupq_n_f32(0.0f);
+        acc2 = vdupq_n_f32(0.0f);
+        acc3 = vdupq_n_f32(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 4;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 4;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f32x4_t         vtmp = vuninitializedq_f32();
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
+
+        vSum = vfmaq_f32(vSum, vld1q(pDualCoef),
+                             arm_vec_exponent_f32
+                             (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree));
+        
+        pDualCoef += 4;
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parrallel
+     */
+    if (row >= 2) {
+        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f32x4_t         vecIn, acc0, acc1;
+        float32_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+        acc1 = vdupq_n_f32(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f32x4_t         vtmp = vuninitializedq_f32();
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+
+        vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
+                             arm_vec_exponent_f32
+                             (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree), 
+                             vctp32q(2));
+        
+        pDualCoef += 2;
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f32x4_t         vecIn, acc0;
+        float32_t const *pSrcA0Vec, *pInVec;
+        float32_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f32x4_t         vtmp = vuninitializedq_f32();
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+        vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
+                             arm_vec_exponent_f32
+                             (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree), 
+                             vctp32q(1));
+    }
+    sum += vecAddAcrossF32Mve(vSum);
+
+    
+    *pResult = S->classes[STEP(sum)];
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+void arm_svm_polynomial_predict_f32(
+    const arm_svm_polynomial_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+    float32_t sum = S->intercept;
+   
+    float32_t dot;
+    float32x4_t dotV; 
+
+    float32x4_t accuma,accumb,accumc,accumd,accum;
+    float32x2_t accum2;
+    float32x4_t vec1;
+    float32x4_t coef0 = vdupq_n_f32(S->coef0);
+
+    float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+    uint32_t blkCnt;   
+    uint32_t vectorBlkCnt;   
+
+    const float32_t *pIn = in;
+
+    const float32_t *pSupport = S->supportVectors;
+
+    const float32_t *pSupporta = S->supportVectors;
+    const float32_t *pSupportb;
+    const float32_t *pSupportc;
+    const float32_t *pSupportd;
+
+    pSupportb = pSupporta + S->vectorDimension;
+    pSupportc = pSupportb + S->vectorDimension;
+    pSupportd = pSupportc + S->vectorDimension;
+
+    const float32_t *pDualCoefs = S->dualCoefficients;
+
+    vectorBlkCnt = S->nbOfSupportVectors >> 2;
+    while (vectorBlkCnt > 0U)
+    {
+        accuma = vdupq_n_f32(0);
+        accumb = vdupq_n_f32(0);
+        accumc = vdupq_n_f32(0);
+        accumd = vdupq_n_f32(0);
+
+        pIn = in;
+
+        blkCnt = S->vectorDimension >> 2;
+        while (blkCnt > 0U)
+        {
+        
+            vec1 = vld1q_f32(pIn);
+            vec2a = vld1q_f32(pSupporta);
+            vec2b = vld1q_f32(pSupportb);
+            vec2c = vld1q_f32(pSupportc);
+            vec2d = vld1q_f32(pSupportd);
+
+            pIn += 4;
+            pSupporta += 4;
+            pSupportb += 4;
+            pSupportc += 4;
+            pSupportd += 4;
+
+            accuma = vmlaq_f32(accuma, vec1,vec2a);
+            accumb = vmlaq_f32(accumb, vec1,vec2b);
+            accumc = vmlaq_f32(accumc, vec1,vec2c);
+            accumd = vmlaq_f32(accumd, vec1,vec2d);
+
+            blkCnt -- ;
+        }
+        accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+        accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+        accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+        accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+        blkCnt = S->vectorDimension & 3;
+        while (blkCnt > 0U)
+        {
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
+
+            pIn++;
+
+            blkCnt -- ;
+        }
+
+        vec1 = vld1q_f32(pDualCoefs);
+        pDualCoefs += 4; 
+
+        // To vectorize later
+        dotV = vmulq_n_f32(dotV, S->gamma);
+        dotV = vaddq_f32(dotV, coef0);
+
+        dotV = arm_vec_exponent_f32(dotV,S->degree);
+
+        accum = vmulq_f32(vec1,dotV);
+        accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+        pSupporta += 3*S->vectorDimension;
+        pSupportb += 3*S->vectorDimension;
+        pSupportc += 3*S->vectorDimension;
+        pSupportd += 3*S->vectorDimension;
+
+        vectorBlkCnt -- ;
+    }
+
+    pSupport = pSupporta;
+    vectorBlkCnt = S->nbOfSupportVectors & 3;
+
+    while (vectorBlkCnt > 0U)
+    {
+        accum = vdupq_n_f32(0);
+        dot = 0.0f;
+        pIn = in;
+
+        blkCnt = S->vectorDimension >> 2;
+        while (blkCnt > 0U)
+        {
+        
+            vec1 = vld1q_f32(pIn);
+            vec2 = vld1q_f32(pSupport);
+            pIn += 4;
+            pSupport += 4;
+
+            accum = vmlaq_f32(accum, vec1,vec2);
+
+            blkCnt -- ;
+        }
+        accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+        blkCnt = S->vectorDimension & 3;
+        while (blkCnt > 0U)
+        {
+            dot = dot + *pIn++ * *pSupport++;
+
+            blkCnt -- ;
+        }
+
+        sum += *pDualCoefs++ * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
+        vectorBlkCnt -- ;
+    }
+
+    *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_polynomial_predict_f32(
+    const arm_svm_polynomial_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+    float32_t sum=S->intercept;
+    float32_t dot=0;
+    uint32_t i,j;
+    const float32_t *pSupport = S->supportVectors;
+
+    for(i=0; i < S->nbOfSupportVectors; i++)
+    {
+        dot=0;
+        for(j=0; j < S->vectorDimension; j++)
+        {
+            dot = dot + in[j]* *pSupport++;
+        }
+        sum += S->dualCoefficients[i] * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
+    }
+
+    *pResult=S->classes[STEP(sum)];
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of polysvm group
+ */
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c
@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_rbf_init_f16.c
+ * Description:  SVM Radial Basis Function Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup rbfsvm RBF SVM
+
+  RBF SVM classifier
+ */
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief        SVM radial basis function instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in]    S                      points to an instance of the polynomial SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_rbf_init_f16(arm_svm_rbf_instance_f16 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float16_t intercept,
+  const float16_t *dualCoefficients,
+  const float16_t *supportVectors,
+  const int32_t *classes,
+  float16_t gamma
+  )
+{
+   S->nbOfSupportVectors = nbOfSupportVectors;
+   S->vectorDimension = vectorDimension;
+   S->intercept = intercept;
+   S->dualCoefficients = dualCoefficients;
+   S->supportVectors = supportVectors;
+   S->classes = classes;
+   S->gamma = gamma;
+}
+
+
+
+/**
+ * @} end of rbfsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c
@ -0,0 +1,91 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_rbf_init_f32.c
+ * Description:  SVM Radial Basis Function Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup rbfsvm RBF SVM
+
+  RBF SVM classifier
+ */
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief        SVM radial basis function instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in]    S                      points to an instance of the polynomial SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t *classes,
+  float32_t gamma
+  )
+{
+   S->nbOfSupportVectors = nbOfSupportVectors;
+   S->vectorDimension = vectorDimension;
+   S->intercept = intercept;
+   S->dualCoefficients = dualCoefficients;
+   S->supportVectors = supportVectors;
+   S->classes = classes;
+   S->gamma = gamma;
+}
+
+
+
+/**
+ * @} end of rbfsvm group
+ */
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c
@ -0,0 +1,352 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_rbf_predict_f16.c
+ * Description:  SVM Radial Basis Function Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM rbf prediction
+ * @param[in]    S         Pointer to an instance of the rbf SVM structure.
+ * @param[in]    in        Pointer to input vector
+ * @param[out]   pResult   decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+void arm_svm_rbf_predict_f16(
+    const arm_svm_rbf_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+        /* inlined Matrix x Vector function interleaved with dot prod */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float16_t *pSupport = S->supportVectors;
+    const float16_t *pSrcA = pSupport;
+    const float16_t *pInA0;
+    const float16_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float16_t *pDualCoef = S->dualCoefficients;
+    _Float16       sum = S->intercept;
+    f16x8_t         vSum = vdupq_n_f16(0.0f16);
+
+    row = numRows;
+
+    /*
+     * compute 4 rows in parrallel
+     */
+    while (row >= 4) {
+        const float16_t *pInA2, *pInA3;
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1, acc2, acc3;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f16);
+        acc1 = vdupq_n_f16(0.0f16);
+        acc2 = vdupq_n_f16(0.0f16);
+        acc3 = vdupq_n_f16(0.0f16);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+            f16x8_t         vecDif;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc2 = vfmaq(acc2, vecDif, vecDif);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc3 = vfmaq(acc3, vecDif, vecDif);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+            f16x8_t         vecDif;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+            vecA = vldrhq_z_f16(pSrcA2Vec, p0);;
+            vecDif = vsubq(vecIn, vecA);
+            acc2 = vfmaq(acc2, vecDif, vecDif);
+            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc3 = vfmaq(acc3, vecDif, vecDif);
+        }
+        /*
+         * Sum the partial parts
+         */
+
+        //sum += *pDualCoef++ * expf(-S->gamma * vecReduceF16Mve(acc0));
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+
+        vSum =
+            vfmaq_m_f16(vSum, vld1q(pDualCoef),
+                      vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)),vctp16q(4));
+        pDualCoef += 4;
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parrallel
+     */
+    if (row >= 2) {
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f16);
+        acc1 = vdupq_n_f16(0.0f16);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+            f16x8_t         vecDif;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);;
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA, vecDif;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+
+        vSum =
+            vfmaq_m_f16(vSum, vld1q(pDualCoef),
+                        vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)), vctp16q(2));
+        pDualCoef += 2;
+
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f16x8_t         vecIn, acc0;
+        float16_t const *pSrcA0Vec, *pInVec;
+        float16_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA, vecDif;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA, vecDif;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+
+        vSum =
+            vfmaq_m_f16(vSum, vld1q(pDualCoef),
+                        vexpq_f16(vmulq_n_f16(vtmp, -(_Float16)S->gamma)), vctp16q(1));
+
+    }
+
+
+    sum += (_Float16)vecAddAcrossF16Mve(vSum);
+    *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_rbf_predict_f16(
+    const arm_svm_rbf_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+    _Float16 sum=S->intercept;
+    _Float16 dot=00.f16;
+    uint32_t i,j;
+    const float16_t *pSupport = S->supportVectors;
+
+    for(i=0; i < S->nbOfSupportVectors; i++)
+    {
+        dot=0.0f16;
+        for(j=0; j < S->vectorDimension; j++)
+        {
+            dot = dot + SQ((_Float16)in[j] - (_Float16) *pSupport);
+            pSupport++;
+        }
+        sum += (_Float16)S->dualCoefficients[i] * (_Float16)expf((float32_t)(-(_Float16)S->gamma * (_Float16)dot));
+    }
+    *pResult=S->classes[STEP(sum)];
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of rbfsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c
@ -0,0 +1,523 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_rbf_predict_f32.c
+ * Description:  SVM Radial Basis Function Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM rbf prediction
+ * @param[in]    S         Pointer to an instance of the rbf SVM structure.
+ * @param[in]    in        Pointer to input vector
+ * @param[out]   pResult   decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+void arm_svm_rbf_predict_f32(
+    const arm_svm_rbf_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+        /* inlined Matrix x Vector function interleaved with dot prod */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float32_t *pSupport = S->supportVectors;
+    const float32_t *pSrcA = pSupport;
+    const float32_t *pInA0;
+    const float32_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float32_t *pDualCoef = S->dualCoefficients;
+    float32_t       sum = S->intercept;
+    f32x4_t         vSum = vdupq_n_f32(0);
+
+    row = numRows;
+
+    /*
+     * compute 4 rows in parrallel
+     */
+    while (row >= 4) {
+        const float32_t *pInA2, *pInA3;
+        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f32x4_t         vecIn, acc0, acc1, acc2, acc3;
+        float32_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+        acc1 = vdupq_n_f32(0.0f);
+        acc2 = vdupq_n_f32(0.0f);
+        acc3 = vdupq_n_f32(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA;
+            f32x4_t         vecDif;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 4;
+            vecDif = vsubq(vecIn, vecA);
+            acc2 = vfmaq(acc2, vecDif, vecDif);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 4;
+            vecDif = vsubq(vecIn, vecA);
+            acc3 = vfmaq(acc3, vecDif, vecDif);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA;
+            f32x4_t         vecDif;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+            vecA = vldrwq_z_f32(pSrcA2Vec, p0);;
+            vecDif = vsubq(vecIn, vecA);
+            acc2 = vfmaq(acc2, vecDif, vecDif);
+            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc3 = vfmaq(acc3, vecDif, vecDif);
+        }
+        /*
+         * Sum the partial parts
+         */
+
+        //sum += *pDualCoef++ * expf(-S->gamma * vecReduceF32Mve(acc0));
+        f32x4_t         vtmp = vuninitializedq_f32();
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
+
+        vSum =
+            vfmaq_f32(vSum, vld1q(pDualCoef),
+                      vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)));
+        pDualCoef += 4;
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parrallel
+     */
+    if (row >= 2) {
+        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f32x4_t         vecIn, acc0, acc1;
+        float32_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+        acc1 = vdupq_n_f32(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA;
+            f32x4_t         vecDif;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);;
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA, vecDif;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc1 = vfmaq(acc1, vecDif, vecDif);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f32x4_t         vtmp = vuninitializedq_f32();
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+
+        vSum =
+            vfmaq_m_f32(vSum, vld1q(pDualCoef),
+                        vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)), vctp32q(2));
+        pDualCoef += 2;
+
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f32x4_t         vecIn, acc0;
+        float32_t const *pSrcA0Vec, *pInVec;
+        float32_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA, vecDif;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA, vecDif;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            vecDif = vsubq(vecIn, vecA);
+            acc0 = vfmaq(acc0, vecDif, vecDif);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f32x4_t         vtmp = vuninitializedq_f32();
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+
+        vSum =
+            vfmaq_m_f32(vSum, vld1q(pDualCoef),
+                        vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)), vctp32q(1));
+
+    }
+
+
+    sum += vecAddAcrossF32Mve(vSum);
+    *pResult = S->classes[STEP(sum)];
+}
+
+
+#else
+#if defined(ARM_MATH_NEON)
+
+#include "NEMath.h"
+
+void arm_svm_rbf_predict_f32(
+    const arm_svm_rbf_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+    float32_t sum = S->intercept;
+   
+    float32_t dot;
+    float32x4_t dotV; 
+
+    float32x4_t accuma,accumb,accumc,accumd,accum;
+    float32x2_t accum2;
+    float32x4_t temp;
+    float32x4_t vec1;
+
+    float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+    uint32_t blkCnt;   
+    uint32_t vectorBlkCnt;   
+
+    const float32_t *pIn = in;
+
+    const float32_t *pSupport = S->supportVectors;
+
+    const float32_t *pSupporta = S->supportVectors;
+    const float32_t *pSupportb;
+    const float32_t *pSupportc;
+    const float32_t *pSupportd;
+
+    pSupportb = pSupporta + S->vectorDimension;
+    pSupportc = pSupportb + S->vectorDimension;
+    pSupportd = pSupportc + S->vectorDimension;
+
+    const float32_t *pDualCoefs = S->dualCoefficients;
+
+
+    vectorBlkCnt = S->nbOfSupportVectors >> 2;
+    while (vectorBlkCnt > 0U)
+    {
+        accuma = vdupq_n_f32(0);
+        accumb = vdupq_n_f32(0);
+        accumc = vdupq_n_f32(0);
+        accumd = vdupq_n_f32(0);
+
+        pIn = in;
+
+        blkCnt = S->vectorDimension >> 2;
+        while (blkCnt > 0U)
+        {
+        
+            vec1 = vld1q_f32(pIn);
+            vec2a = vld1q_f32(pSupporta);
+            vec2b = vld1q_f32(pSupportb);
+            vec2c = vld1q_f32(pSupportc);
+            vec2d = vld1q_f32(pSupportd);
+
+            pIn += 4;
+            pSupporta += 4;
+            pSupportb += 4;
+            pSupportc += 4;
+            pSupportd += 4;
+
+            temp = vsubq_f32(vec1, vec2a);
+            accuma = vmlaq_f32(accuma, temp, temp);
+
+            temp = vsubq_f32(vec1, vec2b);
+            accumb = vmlaq_f32(accumb, temp, temp);
+
+            temp = vsubq_f32(vec1, vec2c);
+            accumc = vmlaq_f32(accumc, temp, temp);
+
+            temp = vsubq_f32(vec1, vec2d);
+            accumd = vmlaq_f32(accumd, temp, temp);
+
+            blkCnt -- ;
+        }
+        accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+        accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+        accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+        accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+        blkCnt = S->vectorDimension & 3;
+        while (blkCnt > 0U)
+        {
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + SQ(*pIn - *pSupporta), dotV,0);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + SQ(*pIn - *pSupportb), dotV,1);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + SQ(*pIn - *pSupportc), dotV,2);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + SQ(*pIn - *pSupportd), dotV,3);
+
+            pSupporta++;
+            pSupportb++;
+            pSupportc++;
+            pSupportd++;
+
+            pIn++;
+
+            blkCnt -- ;
+        }
+
+        vec1 = vld1q_f32(pDualCoefs);
+        pDualCoefs += 4; 
+
+        // To vectorize later
+        dotV = vmulq_n_f32(dotV, -S->gamma);
+        dotV = vexpq_f32(dotV);
+
+        accum = vmulq_f32(vec1,dotV);
+        accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+        pSupporta += 3*S->vectorDimension;
+        pSupportb += 3*S->vectorDimension;
+        pSupportc += 3*S->vectorDimension;
+        pSupportd += 3*S->vectorDimension;
+
+        vectorBlkCnt -- ;
+    }
+
+    pSupport = pSupporta;
+    vectorBlkCnt = S->nbOfSupportVectors & 3;
+
+    while (vectorBlkCnt > 0U)
+    {
+        accum = vdupq_n_f32(0);
+        dot = 0.0f;
+        pIn = in;
+
+        blkCnt = S->vectorDimension >> 2;
+        while (blkCnt > 0U)
+        {
+        
+            vec1 = vld1q_f32(pIn);
+            vec2 = vld1q_f32(pSupport);
+            pIn += 4;
+            pSupport += 4;
+
+            temp = vsubq_f32(vec1,vec2);
+            accum = vmlaq_f32(accum, temp,temp);
+
+            blkCnt -- ;
+        }
+        accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+        blkCnt = S->vectorDimension & 3;
+        while (blkCnt > 0U)
+        {
+
+            dot = dot + SQ(*pIn - *pSupport);
+            pIn++;
+            pSupport++;
+
+            blkCnt -- ;
+        }
+
+        sum += *pDualCoefs++ * expf(-S->gamma * dot);
+        vectorBlkCnt -- ;
+    }
+
+    *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_rbf_predict_f32(
+    const arm_svm_rbf_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+    float32_t sum=S->intercept;
+    float32_t dot=0;
+    uint32_t i,j;
+    const float32_t *pSupport = S->supportVectors;
+
+    for(i=0; i < S->nbOfSupportVectors; i++)
+    {
+        dot=0;
+        for(j=0; j < S->vectorDimension; j++)
+        {
+            dot = dot + SQ(in[j] - *pSupport);
+            pSupport++;
+        }
+        sum += S->dualCoefficients[i] * expf(-S->gamma * dot);
+    }
+    *pResult=S->classes[STEP(sum)];
+}
+#endif
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of rbfsvm group
+ */
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c
@ -0,0 +1,98 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_sigmoid_predict_f16.c
+ * Description:  SVM Sigmoid Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup sigmoidsvm Sigmoid SVM
+
+  Sigmoid SVM classifier
+ */
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+/**
+ * @brief        SVM sigmoid instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in]    S                      points to an instance of the rbf SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @param[in]    coef0                  coeff0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_sigmoid_init_f16(arm_svm_sigmoid_instance_f16 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float16_t intercept,
+  const float16_t *dualCoefficients,
+  const float16_t *supportVectors,
+  const int32_t *classes,
+  float16_t coef0,
+  float16_t gamma
+  )
+{
+   S->nbOfSupportVectors = nbOfSupportVectors;
+   S->vectorDimension = vectorDimension;
+   S->intercept = intercept;
+   S->dualCoefficients = dualCoefficients;
+   S->supportVectors = supportVectors;
+   S->classes = classes;
+   S->coef0 = coef0;
+   S->gamma = gamma;
+}
+
+
+/**
+ * @} end of sigmoidsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c
@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_sigmoid_predict_f32.c
+ * Description:  SVM Sigmoid Instance Initialization
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupSVM
+ */
+
+/**
+  @defgroup sigmoidsvm Sigmoid SVM
+
+  Sigmoid SVM classifier
+ */
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+/**
+ * @brief        SVM sigmoid instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in]    S                      points to an instance of the rbf SVM structure.
+ * @param[in]    nbOfSupportVectors     Number of support vectors
+ * @param[in]    vectorDimension        Dimension of vector space
+ * @param[in]    intercept              Intercept
+ * @param[in]    dualCoefficients       Array of dual coefficients
+ * @param[in]    supportVectors         Array of support vectors
+ * @param[in]    classes                Array of 2 classes ID
+ * @param[in]    coef0                  coeff0 (scikit-learn terminology)
+ * @param[in]    gamma                  gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S, 
+  uint32_t nbOfSupportVectors,
+  uint32_t vectorDimension,
+  float32_t intercept,
+  const float32_t *dualCoefficients,
+  const float32_t *supportVectors,
+  const int32_t *classes,
+  float32_t coef0,
+  float32_t gamma
+  )
+{
+   S->nbOfSupportVectors = nbOfSupportVectors;
+   S->vectorDimension = vectorDimension;
+   S->intercept = intercept;
+   S->dualCoefficients = dualCoefficients;
+   S->supportVectors = supportVectors;
+   S->classes = classes;
+   S->coef0 = coef0;
+   S->gamma = gamma;
+}
+
+
+/**
+ * @} end of sigmoidsvm group
+ */
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c
@ -0,0 +1,333 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_sigmoid_predict_f16.c
+ * Description:  SVM Sigmoid Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+
+/**
+ * @brief SVM sigmoid prediction
+ * @param[in]    S        Pointer to an instance of the rbf SVM structure.
+ * @param[in]    in       Pointer to input vector
+ * @param[out]   pResult  Decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+void arm_svm_sigmoid_predict_f16(
+    const arm_svm_sigmoid_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+        /* inlined Matrix x Vector function interleaved with dot prod */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float16_t *pSupport = S->supportVectors;
+    const float16_t *pSrcA = pSupport;
+    const float16_t *pInA0;
+    const float16_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float16_t *pDualCoef = S->dualCoefficients;
+    _Float16       sum = S->intercept;
+    f16x8_t         vSum = vdupq_n_f16(0.0f);
+
+    row = numRows;
+
+    /*
+     * compute 4 rows in parrallel
+     */
+    while (row >= 4) {
+        const float16_t *pInA2, *pInA3;
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1, acc2, acc3;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        acc2 = vdupq_n_f16(0.0f);
+        acc3 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 8;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 8;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+
+        vSum =
+            vfmaq_m_f16(vSum, vld1q(pDualCoef),
+                      vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),vctp16q(4));
+
+        pDualCoef += 4;
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parrallel
+     */
+    if (row >= 2) {
+        float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f16x8_t         vecIn, acc0, acc1;
+        float16_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+        acc1 = vdupq_n_f16(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 8;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+
+        vSum =
+            vfmaq_m_f16(vSum, vld1q(pDualCoef),
+                        vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
+                        vctp16q(2));
+
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f16x8_t         vecIn, acc0;
+        float16_t const *pSrcA0Vec, *pInVec;
+        float16_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f16(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 3;
+        while (blkCnt > 0U) {
+            f16x8_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 8;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 8;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 7;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp16q(blkCnt);
+            f16x8_t         vecA;
+
+            vecIn = vldrhq_z_f16(pInVec, p0);
+            vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f16x8_t         vtmp = vuninitializedq_f16();
+        vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+
+        vSum =
+            vfmaq_m_f16(vSum, vld1q(pDualCoef),
+                        vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
+                        vctp16q(1));
+    }
+    sum += (_Float16)vecAddAcrossF16Mve(vSum);
+
+    *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_sigmoid_predict_f16(
+    const arm_svm_sigmoid_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+    _Float16 sum=S->intercept;
+    _Float16 dot=0.0f16;
+    uint32_t i,j;
+    const float16_t *pSupport = S->supportVectors;
+
+    for(i=0; i < S->nbOfSupportVectors; i++)
+    {
+        dot=0.0f16;
+        for(j=0; j < S->vectorDimension; j++)
+        {
+            dot = (_Float16)dot + (_Float16)in[j] * (_Float16)*pSupport++;
+        }
+        sum += (_Float16)S->dualCoefficients[i] * (_Float16)tanhf((float32_t)((_Float16)S->gamma * (_Float16)dot + (_Float16)S->coef0));
+    }
+    *pResult=S->classes[STEP(sum)];
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of sigmoidsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c
@ -0,0 +1,487 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_svm_sigmoid_predict_f32.c
+ * Description:  SVM Sigmoid Classifier
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+
+/**
+ * @brief SVM sigmoid prediction
+ * @param[in]    S        Pointer to an instance of the rbf SVM structure.
+ * @param[in]    in       Pointer to input vector
+ * @param[out]   pResult  Decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+void arm_svm_sigmoid_predict_f32(
+    const arm_svm_sigmoid_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+        /* inlined Matrix x Vector function interleaved with dot prod */
+    uint32_t        numRows = S->nbOfSupportVectors;
+    uint32_t        numCols = S->vectorDimension;
+    const float32_t *pSupport = S->supportVectors;
+    const float32_t *pSrcA = pSupport;
+    const float32_t *pInA0;
+    const float32_t *pInA1;
+    uint32_t         row;
+    uint32_t         blkCnt;     /* loop counters */
+    const float32_t *pDualCoef = S->dualCoefficients;
+    float32_t       sum = S->intercept;
+    f32x4_t         vSum = vdupq_n_f32(0.0f);
+
+    row = numRows;
+
+    /*
+     * compute 4 rows in parrallel
+     */
+    while (row >= 4) {
+        const float32_t *pInA2, *pInA3;
+        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+        f32x4_t         vecIn, acc0, acc1, acc2, acc3;
+        float32_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 4 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+        acc1 = vdupq_n_f32(0.0f);
+        acc2 = vdupq_n_f32(0.0f);
+        acc3 = vdupq_n_f32(0.0f);
+
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+        pSrcA2Vec = pInA2;
+        pSrcA3Vec = pInA3;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vld1q(pSrcA2Vec);
+            pSrcA2Vec += 4;
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vld1q(pSrcA3Vec);
+            pSrcA3Vec += 4;
+            acc3 = vfmaq(acc3, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA2Vec, p0);
+            acc2 = vfmaq(acc2, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+            acc3 = vfmaq(acc3, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f32x4_t         vtmp = vuninitializedq_f32();
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
+
+        vSum =
+            vfmaq_f32(vSum, vld1q(pDualCoef),
+                      vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)));
+
+        pDualCoef += 4;
+
+        pSrcA += numCols * 4;
+        /*
+         * Decrement the row loop counter
+         */
+        row -= 4;
+    }
+
+    /*
+     * compute 2 rows in parrallel
+     */
+    if (row >= 2) {
+        float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+        f32x4_t         vecIn, acc0, acc1;
+        float32_t const *pSrcVecPtr = in;
+
+        /*
+         * Initialize the pointers to 2 consecutive MatrixA rows
+         */
+        pInA0 = pSrcA;
+        pInA1 = pInA0 + numCols;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+        acc1 = vdupq_n_f32(0.0f);
+        pSrcA0Vec = pInA0;
+        pSrcA1Vec = pInA1;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vld1q(pSrcA1Vec);
+            pSrcA1Vec += 4;
+            acc1 = vfmaq(acc1, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+            vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+            acc1 = vfmaq(acc1, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f32x4_t         vtmp = vuninitializedq_f32();
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+
+        vSum =
+            vfmaq_m_f32(vSum, vld1q(pDualCoef),
+                        vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
+                        vctp32q(2));
+
+        pSrcA += numCols * 2;
+        row -= 2;
+    }
+
+    if (row >= 1) {
+        f32x4_t         vecIn, acc0;
+        float32_t const *pSrcA0Vec, *pInVec;
+        float32_t const *pSrcVecPtr = in;
+        /*
+         * Initialize the pointers to last MatrixA row
+         */
+        pInA0 = pSrcA;
+        /*
+         * Initialize the vector pointer
+         */
+        pInVec = pSrcVecPtr;
+        /*
+         * reset accumulators
+         */
+        acc0 = vdupq_n_f32(0.0f);
+
+        pSrcA0Vec = pInA0;
+
+        blkCnt = numCols >> 2;
+        while (blkCnt > 0U) {
+            f32x4_t         vecA;
+
+            vecIn = vld1q(pInVec);
+            pInVec += 4;
+            vecA = vld1q(pSrcA0Vec);
+            pSrcA0Vec += 4;
+            acc0 = vfmaq(acc0, vecIn, vecA);
+
+            blkCnt--;
+        }
+        /*
+         * tail
+         * (will be merged thru tail predication)
+         */
+        blkCnt = numCols & 3;
+        if (blkCnt > 0U) {
+            mve_pred16_t    p0 = vctp32q(blkCnt);
+            f32x4_t         vecA;
+
+            vecIn = vldrwq_z_f32(pInVec, p0);
+            vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+            acc0 = vfmaq(acc0, vecIn, vecA);
+        }
+        /*
+         * Sum the partial parts
+         */
+        f32x4_t         vtmp = vuninitializedq_f32();
+        vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+
+        vSum =
+            vfmaq_m_f32(vSum, vld1q(pDualCoef),
+                        vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
+                        vctp32q(1));
+    }
+    sum += vecAddAcrossF32Mve(vSum);
+
+    *pResult = S->classes[STEP(sum)];
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+#include "NEMath.h"
+
+void arm_svm_sigmoid_predict_f32(
+    const arm_svm_sigmoid_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+    float32_t sum = S->intercept;
+   
+    float32_t dot;
+    float32x4_t dotV; 
+
+    float32x4_t accuma,accumb,accumc,accumd,accum;
+    float32x2_t accum2;
+    float32x4_t vec1;
+    float32x4_t coef0 = vdupq_n_f32(S->coef0);
+
+    float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+    uint32_t blkCnt;   
+    uint32_t vectorBlkCnt;   
+
+    const float32_t *pIn = in;
+
+    const float32_t *pSupport = S->supportVectors;
+
+    const float32_t *pSupporta = S->supportVectors;
+    const float32_t *pSupportb;
+    const float32_t *pSupportc;
+    const float32_t *pSupportd;
+
+    pSupportb = pSupporta + S->vectorDimension;
+    pSupportc = pSupportb + S->vectorDimension;
+    pSupportd = pSupportc + S->vectorDimension;
+
+    const float32_t *pDualCoefs = S->dualCoefficients;
+
+    vectorBlkCnt = S->nbOfSupportVectors >> 2;
+    while (vectorBlkCnt > 0U)
+    {
+        accuma = vdupq_n_f32(0);
+        accumb = vdupq_n_f32(0);
+        accumc = vdupq_n_f32(0);
+        accumd = vdupq_n_f32(0);
+
+        pIn = in;
+
+        blkCnt = S->vectorDimension >> 2;
+        while (blkCnt > 0U)
+        {
+        
+            vec1 = vld1q_f32(pIn);
+            vec2a = vld1q_f32(pSupporta);
+            vec2b = vld1q_f32(pSupportb);
+            vec2c = vld1q_f32(pSupportc);
+            vec2d = vld1q_f32(pSupportd);
+
+            pIn += 4;
+            pSupporta += 4;
+            pSupportb += 4;
+            pSupportc += 4;
+            pSupportd += 4;
+
+            accuma = vmlaq_f32(accuma, vec1,vec2a);
+            accumb = vmlaq_f32(accumb, vec1,vec2b);
+            accumc = vmlaq_f32(accumc, vec1,vec2c);
+            accumd = vmlaq_f32(accumd, vec1,vec2d);
+
+            blkCnt -- ;
+        }
+        accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+        accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+        accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+        accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+        blkCnt = S->vectorDimension & 3;
+        while (blkCnt > 0U)
+        {
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
+
+            pIn++;
+
+            blkCnt -- ;
+        }
+
+        vec1 = vld1q_f32(pDualCoefs);
+        pDualCoefs += 4; 
+
+        // To vectorize later
+        dotV = vmulq_n_f32(dotV, S->gamma);
+        dotV = vaddq_f32(dotV, coef0);
+
+        dotV = vtanhq_f32(dotV);
+
+        accum = vmulq_f32(vec1,dotV);
+        accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+        pSupporta += 3*S->vectorDimension;
+        pSupportb += 3*S->vectorDimension;
+        pSupportc += 3*S->vectorDimension;
+        pSupportd += 3*S->vectorDimension;
+
+        vectorBlkCnt -- ;
+    }
+
+    pSupport = pSupporta;
+    vectorBlkCnt = S->nbOfSupportVectors & 3;
+
+    while (vectorBlkCnt > 0U)
+    {
+        accum = vdupq_n_f32(0);
+        dot = 0.0f;
+        pIn = in;
+
+        blkCnt = S->vectorDimension >> 2;
+        while (blkCnt > 0U)
+        {
+        
+            vec1 = vld1q_f32(pIn);
+            vec2 = vld1q_f32(pSupport);
+            pIn += 4;
+            pSupport += 4;
+
+            accum = vmlaq_f32(accum, vec1,vec2);
+
+            blkCnt -- ;
+        }
+        accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+        blkCnt = S->vectorDimension & 3;
+        while (blkCnt > 0U)
+        {
+            dot = dot + *pIn++ * *pSupport++;
+
+            blkCnt -- ;
+        }
+
+        sum += *pDualCoefs++ * tanhf(S->gamma * dot + S->coef0);
+        vectorBlkCnt -- ;
+    }
+
+    *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_sigmoid_predict_f32(
+    const arm_svm_sigmoid_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+    float32_t sum=S->intercept;
+    float32_t dot=0;
+    uint32_t i,j;
+    const float32_t *pSupport = S->supportVectors;
+
+    for(i=0; i < S->nbOfSupportVectors; i++)
+    {
+        dot=0;
+        for(j=0; j < S->vectorDimension; j++)
+        {
+            dot = dot + in[j]* *pSupport++;
+        }
+        sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0);
+    }
+    *pResult=S->classes[STEP(sum)];
+}
+
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of sigmoidsvm group
+ */