Initial commit

2025-06-09 18:06:36 +02:00 · 2025-06-09 18:06:36 +02:00 · ce3dd83b9f
commit ce3dd83b9f
1470 changed files with 1054449 additions and 0 deletions
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/CMakeLists.txt
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/CMakeLists.txt
@ -0,0 +1,28 @@
+cmake_minimum_required (VERSION 3.14)
+
+project(CMSISDSPSupport)
+
+include(configLib)
+include(configDsp)
+
+file(GLOB SRC "./*_*.c")
+
+add_library(CMSISDSPSupport STATIC ${SRC})
+
+configLib(CMSISDSPSupport ${ROOT})
+configDsp(CMSISDSPSupport ${ROOT})
+
+### Includes
+target_include_directories(CMSISDSPSupport PUBLIC "${DSP}/Include")
+
+if ((NOT ARMAC5) AND (NOT DISABLEFLOAT16))
+target_sources(CMSISDSPSupport PRIVATE arm_copy_f16.c)
+target_sources(CMSISDSPSupport PRIVATE arm_fill_f16.c)
+target_sources(CMSISDSPSupport PRIVATE arm_f16_to_q15.c)
+target_sources(CMSISDSPSupport PRIVATE arm_q15_to_f16.c)
+target_sources(CMSISDSPSupport PRIVATE arm_float_to_f16.c)
+target_sources(CMSISDSPSupport PRIVATE arm_f16_to_float.c)
+target_sources(CMSISDSPSupport PRIVATE arm_weighted_sum_f16.c)
+target_sources(CMSISDSPSupport PRIVATE arm_barycenter_f16.c)
+endif()
+
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c
@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        SupportFunctions.c
+ * Description:  Combination of all support function source files.
+ *
+ * $Date:        16. March 2020
+ * $Revision:    V1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_barycenter_f32.c"
+#include "arm_bitonic_sort_f32.c"
+#include "arm_bubble_sort_f32.c"
+#include "arm_copy_f32.c"
+#include "arm_copy_f64.c"
+#include "arm_copy_q15.c"
+#include "arm_copy_q31.c"
+#include "arm_copy_q7.c"
+#include "arm_fill_f32.c"
+#include "arm_fill_f64.c"
+#include "arm_fill_q15.c"
+#include "arm_fill_q31.c"
+#include "arm_fill_q7.c"
+#include "arm_heap_sort_f32.c"
+#include "arm_insertion_sort_f32.c"
+#include "arm_merge_sort_f32.c"
+#include "arm_merge_sort_init_f32.c"
+#include "arm_quick_sort_f32.c"
+#include "arm_selection_sort_f32.c"
+#include "arm_sort_f32.c"
+#include "arm_sort_init_f32.c"
+#include "arm_weighted_sum_f32.c"
+
+#include "arm_float_to_q15.c"
+#include "arm_float_to_q31.c"
+#include "arm_float_to_q7.c"
+#include "arm_q15_to_float.c"
+#include "arm_q15_to_q31.c"
+#include "arm_q15_to_q7.c"
+#include "arm_q31_to_float.c"
+#include "arm_q31_to_q15.c"
+#include "arm_q31_to_q7.c"
+#include "arm_q7_to_float.c"
+#include "arm_q7_to_q15.c"
+#include "arm_q7_to_q31.c"
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c
@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        SupportFunctions.c
+ * Description:  Combination of all support function source files.
+ *
+ * $Date:        16. March 2020
+ * $Revision:    V1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_copy_f16.c"
+#include "arm_fill_f16.c"
+#include "arm_f16_to_q15.c"
+#include "arm_f16_to_float.c"
+#include "arm_q15_to_f16.c"
+#include "arm_float_to_f16.c"
+#include "arm_weighted_sum_f16.c"
+#include "arm_barycenter_f16.c"
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c
@ -0,0 +1,274 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_barycenter_f16.c
+ * Description:  Barycenter
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @defgroup barycenter Barycenter
+
+  Barycenter of weighted vectors
+ */
+
+/**
+  @addtogroup barycenter
+  @{
+ */
+
+
+/**
+ * @brief Barycenter
+ *
+ *
+ * @param[in]    *in         List of vectors
+ * @param[in]    *weights    Weights of the vectors
+ * @param[out]   *out        Barycenter
+ * @param[in]    nbVectors   Number of vectors
+ * @param[in]    vecDim      Dimension of space (vector dimension)
+ * @return       None
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_barycenter_f16(const float16_t *in, 
+  const float16_t *weights, 
+  float16_t *out, 
+  uint32_t nbVectors,
+  uint32_t vecDim)
+{
+    const float16_t *pIn, *pW;
+    const float16_t *pIn1, *pIn2, *pIn3, *pIn4;
+    float16_t      *pOut;
+    uint32_t        blkCntVector, blkCntSample;
+    float16_t       accum, w;
+
+    blkCntVector = nbVectors;
+    blkCntSample = vecDim;
+
+    accum = 0.0f;
+
+    pW = weights;
+    pIn = in;
+
+
+    arm_fill_f16(0.0f, out, vecDim);
+
+
+    /* Sum */
+    pIn1 = pIn;
+    pIn2 = pIn1 + vecDim;
+    pIn3 = pIn2 + vecDim;
+    pIn4 = pIn3 + vecDim;
+
+    blkCntVector = nbVectors >> 2;
+    while (blkCntVector > 0) 
+    {
+        f16x8_t         outV, inV1, inV2, inV3, inV4;
+        float16_t       w1, w2, w3, w4;
+
+        pOut = out;
+        w1 = *pW++;
+        w2 = *pW++;
+        w3 = *pW++;
+        w4 = *pW++;
+        accum += (_Float16)w1 + (_Float16)w2 + (_Float16)w3 + (_Float16)w4;
+
+        blkCntSample = vecDim >> 3;
+        while (blkCntSample > 0) {
+            outV = vld1q((const float16_t *) pOut);
+            inV1 = vld1q(pIn1);
+            inV2 = vld1q(pIn2);
+            inV3 = vld1q(pIn3);
+            inV4 = vld1q(pIn4);
+            outV = vfmaq(outV, inV1, w1);
+            outV = vfmaq(outV, inV2, w2);
+            outV = vfmaq(outV, inV3, w3);
+            outV = vfmaq(outV, inV4, w4);
+            vst1q(pOut, outV);
+
+            pOut += 8;
+            pIn1 += 8;
+            pIn2 += 8;
+            pIn3 += 8;
+            pIn4 += 8;
+
+            blkCntSample--;
+        }
+
+        blkCntSample = vecDim & 7;
+        while (blkCntSample > 0) {
+            *pOut = (_Float16)*pOut + (_Float16)*pIn1++ * (_Float16)w1;
+            *pOut = (_Float16)*pOut + (_Float16)*pIn2++ * (_Float16)w2;
+            *pOut = (_Float16)*pOut + (_Float16)*pIn3++ * (_Float16)w3;
+            *pOut = (_Float16)*pOut + (_Float16)*pIn4++ * (_Float16)w4;
+            pOut++;
+            blkCntSample--;
+        }
+
+        pIn1 += 3 * vecDim;
+        pIn2 += 3 * vecDim;
+        pIn3 += 3 * vecDim;
+        pIn4 += 3 * vecDim;
+
+        blkCntVector--;
+    }
+
+    pIn = pIn1;
+
+    blkCntVector = nbVectors & 3;
+    while (blkCntVector > 0) 
+    {
+        f16x8_t         inV, outV;
+
+        pOut = out;
+        w = *pW++;
+        accum += (_Float16)w;
+
+        blkCntSample = vecDim >> 3;
+        while (blkCntSample > 0) 
+        {
+            outV = vld1q_f16(pOut);
+            inV = vld1q_f16(pIn);
+            outV = vfmaq(outV, inV, w);
+            vst1q_f16(pOut, outV);
+            pOut += 8;
+            pIn += 8;
+
+            blkCntSample--;
+        }
+
+        blkCntSample = vecDim & 7;
+        while (blkCntSample > 0) 
+        {
+            *pOut = (_Float16)*pOut + (_Float16)*pIn++ * (_Float16)w;
+            pOut++;
+            blkCntSample--;
+        }
+
+        blkCntVector--;
+    }
+
+    /* Normalize */
+    pOut = out;
+    accum = 1.0f16 / (_Float16)accum;
+
+    blkCntSample = vecDim >> 3;
+    while (blkCntSample > 0) 
+    {
+        f16x8_t         tmp;
+
+        tmp = vld1q((const float16_t *) pOut);
+        tmp = vmulq(tmp, accum);
+        vst1q(pOut, tmp);
+        pOut += 8;
+        blkCntSample--;
+    }
+
+    blkCntSample = vecDim & 7;
+    while (blkCntSample > 0) 
+    {
+        *pOut = (_Float16)*pOut * (_Float16)accum;
+        pOut++;
+        blkCntSample--;
+    }
+}
+#else
+void arm_barycenter_f16(const float16_t *in, const float16_t *weights, float16_t *out, uint32_t nbVectors,uint32_t vecDim)
+{
+
+   const float16_t *pIn,*pW;
+   float16_t *pOut;
+   uint32_t blkCntVector,blkCntSample;
+   float16_t accum, w;
+
+   blkCntVector = nbVectors;
+   blkCntSample = vecDim;
+
+   accum = 0.0f16;
+
+   pW = weights;
+   pIn = in;
+
+   /* Set counters to 0 */
+   blkCntSample = vecDim;
+   pOut = out;
+
+   while(blkCntSample > 0)
+   {
+         *pOut = 0.0f16;
+         pOut++;
+         blkCntSample--;
+   }
+
+   /* Sum */
+   while(blkCntVector > 0)
+   {
+      pOut = out;
+      w = *pW++;
+      accum += (_Float16)w;
+
+      blkCntSample = vecDim;
+      while(blkCntSample > 0)
+      {
+          *pOut = (_Float16)*pOut + (_Float16)*pIn++ * (_Float16)w;
+          pOut++;
+          blkCntSample--;
+      }
+
+      blkCntVector--;
+   }
+
+   /* Normalize */
+   blkCntSample = vecDim;
+   pOut = out;
+
+   while(blkCntSample > 0)
+   {
+         *pOut = (_Float16)*pOut / (_Float16)accum;
+         pOut++;
+         blkCntSample--;
+   }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of barycenter group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c
@ -0,0 +1,414 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_barycenter_f32.c
+ * Description:  Barycenter
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+  @ingroup barycenter
+ */
+
+
+/**
+ * @brief Barycenter
+ *
+ *
+ * @param[in]    *in         List of vectors
+ * @param[in]    *weights    Weights of the vectors
+ * @param[out]   *out        Barycenter
+ * @param[in]    nbVectors   Number of vectors
+ * @param[in]    vecDim      Dimension of space (vector dimension)
+ * @return       None
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_barycenter_f32(const float32_t *in, 
+  const float32_t *weights, 
+  float32_t *out, 
+  uint32_t nbVectors,
+  uint32_t vecDim)
+{
+    const float32_t *pIn, *pW;
+    const float32_t *pIn1, *pIn2, *pIn3, *pIn4;
+    float32_t      *pOut;
+    uint32_t        blkCntVector, blkCntSample;
+    float32_t       accum, w;
+
+    blkCntVector = nbVectors;
+    blkCntSample = vecDim;
+
+    accum = 0.0f;
+
+    pW = weights;
+    pIn = in;
+
+
+    arm_fill_f32(0.0f, out, vecDim);
+
+
+    /* Sum */
+    pIn1 = pIn;
+    pIn2 = pIn1 + vecDim;
+    pIn3 = pIn2 + vecDim;
+    pIn4 = pIn3 + vecDim;
+
+    blkCntVector = nbVectors >> 2;
+    while (blkCntVector > 0) 
+    {
+        f32x4_t         outV, inV1, inV2, inV3, inV4;
+        float32_t       w1, w2, w3, w4;
+
+        pOut = out;
+        w1 = *pW++;
+        w2 = *pW++;
+        w3 = *pW++;
+        w4 = *pW++;
+        accum += w1 + w2 + w3 + w4;
+
+        blkCntSample = vecDim >> 2;
+        while (blkCntSample > 0) {
+            outV = vld1q((const float32_t *) pOut);
+            inV1 = vld1q(pIn1);
+            inV2 = vld1q(pIn2);
+            inV3 = vld1q(pIn3);
+            inV4 = vld1q(pIn4);
+            outV = vfmaq(outV, inV1, w1);
+            outV = vfmaq(outV, inV2, w2);
+            outV = vfmaq(outV, inV3, w3);
+            outV = vfmaq(outV, inV4, w4);
+            vst1q(pOut, outV);
+
+            pOut += 4;
+            pIn1 += 4;
+            pIn2 += 4;
+            pIn3 += 4;
+            pIn4 += 4;
+
+            blkCntSample--;
+        }
+
+        blkCntSample = vecDim & 3;
+        while (blkCntSample > 0) {
+            *pOut = *pOut + *pIn1++ * w1;
+            *pOut = *pOut + *pIn2++ * w2;
+            *pOut = *pOut + *pIn3++ * w3;
+            *pOut = *pOut + *pIn4++ * w4;
+            pOut++;
+            blkCntSample--;
+        }
+
+        pIn1 += 3 * vecDim;
+        pIn2 += 3 * vecDim;
+        pIn3 += 3 * vecDim;
+        pIn4 += 3 * vecDim;
+
+        blkCntVector--;
+    }
+
+    pIn = pIn1;
+
+    blkCntVector = nbVectors & 3;
+    while (blkCntVector > 0) 
+    {
+        f32x4_t         inV, outV;
+
+        pOut = out;
+        w = *pW++;
+        accum += w;
+
+        blkCntSample = vecDim >> 2;
+        while (blkCntSample > 0) 
+        {
+            outV = vld1q_f32(pOut);
+            inV = vld1q_f32(pIn);
+            outV = vfmaq(outV, inV, w);
+            vst1q_f32(pOut, outV);
+            pOut += 4;
+            pIn += 4;
+
+            blkCntSample--;
+        }
+
+        blkCntSample = vecDim & 3;
+        while (blkCntSample > 0) 
+        {
+            *pOut = *pOut + *pIn++ * w;
+            pOut++;
+            blkCntSample--;
+        }
+
+        blkCntVector--;
+    }
+
+    /* Normalize */
+    pOut = out;
+    accum = 1.0f / accum;
+
+    blkCntSample = vecDim >> 2;
+    while (blkCntSample > 0) 
+    {
+        f32x4_t         tmp;
+
+        tmp = vld1q((const float32_t *) pOut);
+        tmp = vmulq(tmp, accum);
+        vst1q(pOut, tmp);
+        pOut += 4;
+        blkCntSample--;
+    }
+
+    blkCntSample = vecDim & 3;
+    while (blkCntSample > 0) 
+    {
+        *pOut = *pOut * accum;
+        pOut++;
+        blkCntSample--;
+    }
+}
+#else
+#if defined(ARM_MATH_NEON)
+
+#include "NEMath.h"
+void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim)
+{
+
+   const float32_t *pIn,*pW, *pIn1, *pIn2, *pIn3, *pIn4;
+   float32_t *pOut;
+   uint32_t blkCntVector,blkCntSample;
+   float32_t accum, w,w1,w2,w3,w4;
+
+   float32x4_t tmp, inV,outV, inV1, inV2, inV3, inV4;
+
+   blkCntVector = nbVectors;
+   blkCntSample = vecDim;
+
+   accum = 0.0f;
+
+   pW = weights;
+   pIn = in;
+
+   /* Set counters to 0 */
+   tmp = vdupq_n_f32(0.0f);
+   pOut = out;
+
+   blkCntSample = vecDim >> 2;
+   while(blkCntSample > 0)
+   {
+         vst1q_f32(pOut, tmp);
+         pOut += 4;
+         blkCntSample--;
+   }
+
+   blkCntSample = vecDim & 3;
+   while(blkCntSample > 0)
+   {
+         *pOut = 0.0f;
+         pOut++;
+         blkCntSample--;
+   }
+
+   /* Sum */
+  
+   pIn1 = pIn;
+   pIn2 = pIn1 + vecDim;
+   pIn3 = pIn2 + vecDim;
+   pIn4 = pIn3 + vecDim;
+   
+   blkCntVector = nbVectors >> 2;
+   while(blkCntVector > 0)
+   {
+      pOut = out;
+      w1 = *pW++;
+      w2 = *pW++;
+      w3 = *pW++;
+      w4 = *pW++;
+      accum += w1 + w2 + w3 + w4;
+
+      blkCntSample = vecDim >> 2;
+      while(blkCntSample > 0)
+      {
+          outV = vld1q_f32(pOut);
+          inV1 = vld1q_f32(pIn1);
+          inV2 = vld1q_f32(pIn2);
+          inV3 = vld1q_f32(pIn3);
+          inV4 = vld1q_f32(pIn4);
+          outV = vmlaq_n_f32(outV,inV1,w1);
+          outV = vmlaq_n_f32(outV,inV2,w2);
+          outV = vmlaq_n_f32(outV,inV3,w3);
+          outV = vmlaq_n_f32(outV,inV4,w4);
+          vst1q_f32(pOut, outV);
+          pOut += 4;
+          pIn1 += 4;
+          pIn2 += 4;
+          pIn3 += 4;
+          pIn4 += 4;
+
+          blkCntSample--;
+      }
+
+      blkCntSample = vecDim & 3;
+      while(blkCntSample > 0)
+      {
+          *pOut = *pOut + *pIn1++ * w1;
+          *pOut = *pOut + *pIn2++ * w2;
+          *pOut = *pOut + *pIn3++ * w3;
+          *pOut = *pOut + *pIn4++ * w4;
+          pOut++;
+          blkCntSample--;
+      }
+
+      pIn1 += 3*vecDim;
+      pIn2 += 3*vecDim;
+      pIn3 += 3*vecDim;
+      pIn4 += 3*vecDim;
+
+      blkCntVector--;
+   }
+
+   pIn = pIn1;
+
+   blkCntVector = nbVectors & 3;
+   while(blkCntVector > 0)
+   {
+      pOut = out;
+      w = *pW++;
+      accum += w;
+
+      blkCntSample = vecDim >> 2;
+      while(blkCntSample > 0)
+      {
+          outV = vld1q_f32(pOut);
+          inV = vld1q_f32(pIn);
+          outV = vmlaq_n_f32(outV,inV,w);
+          vst1q_f32(pOut, outV);
+          pOut += 4;
+          pIn += 4;
+
+          blkCntSample--;
+      }
+      
+      blkCntSample = vecDim & 3;
+      while(blkCntSample > 0)
+      {
+          *pOut = *pOut + *pIn++ * w;
+          pOut++;
+          blkCntSample--;
+      }
+
+      blkCntVector--;
+   }
+
+   /* Normalize */
+   pOut = out;
+   accum = 1.0f / accum;
+
+   blkCntSample = vecDim >> 2;
+   while(blkCntSample > 0)
+   {
+         tmp = vld1q_f32(pOut);
+         tmp = vmulq_n_f32(tmp,accum);
+         vst1q_f32(pOut, tmp);
+         pOut += 4;
+         blkCntSample--;
+   }
+
+   blkCntSample = vecDim & 3;
+   while(blkCntSample > 0)
+   {
+         *pOut = *pOut * accum;
+         pOut++;
+         blkCntSample--;
+   }
+
+}
+#else
+void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim)
+{
+
+   const float32_t *pIn,*pW;
+   float32_t *pOut;
+   uint32_t blkCntVector,blkCntSample;
+   float32_t accum, w;
+
+   blkCntVector = nbVectors;
+   blkCntSample = vecDim;
+
+   accum = 0.0f;
+
+   pW = weights;
+   pIn = in;
+
+   /* Set counters to 0 */
+   blkCntSample = vecDim;
+   pOut = out;
+
+   while(blkCntSample > 0)
+   {
+         *pOut = 0.0f;
+         pOut++;
+         blkCntSample--;
+   }
+
+   /* Sum */
+   while(blkCntVector > 0)
+   {
+      pOut = out;
+      w = *pW++;
+      accum += w;
+
+      blkCntSample = vecDim;
+      while(blkCntSample > 0)
+      {
+          *pOut = *pOut + *pIn++ * w;
+          pOut++;
+          blkCntSample--;
+      }
+
+      blkCntVector--;
+   }
+
+   /* Normalize */
+   blkCntSample = vecDim;
+   pOut = out;
+
+   while(blkCntSample > 0)
+   {
+         *pOut = *pOut / accum;
+         pOut++;
+         blkCntSample--;
+   }
+
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of barycenter group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c
@ -0,0 +1,104 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_bubble_sort_f32.c
+ * Description:  Floating point bubble sort
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @private
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *               The bubble sort algorithm is a simple comparison algorithm that
+   *               reads the elements of a vector from the beginning to the end,
+   *               compares the adjacent ones and swaps them if they are in the 
+   *               wrong order. The procedure is repeated until there is nothing 
+   *               left to swap. Bubble sort is fast for input vectors that are
+   *               nearly sorted.
+   *
+   * @par          It's an in-place algorithm. In order to obtain an out-of-place
+   *               function, a memcpy of the source vector is performed
+   */
+
+void arm_bubble_sort_f32(
+  const arm_sort_instance_f32 * S, 
+        float32_t * pSrc, 
+        float32_t * pDst, 
+        uint32_t blockSize)
+{
+    uint8_t dir = S->dir;
+    uint32_t i;
+    uint8_t swapped =1;
+    float32_t * pA;
+    float32_t temp;
+
+    if(pSrc != pDst) // out-of-place
+    {
+	memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+	pA = pDst;
+    }
+    else
+	pA = pSrc;
+
+    while(swapped==1) // If nothing has been swapped after one loop stop
+    {
+	swapped=0;
+
+        for(i=0; i<blockSize-1; i++)
+	{
+	    if(dir==(pA[i]>pA[i+1]))
+	    {
+		// Swap
+		temp = pA[i];
+		pA[i] = pA[i+1];
+		pA[i+1] = temp;
+
+		// Update flag
+		swapped = 1;
+	    }
+	}
+
+	blockSize--;
+    }
+}
+
+/**
+  @} end of Sorting group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c
@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_copy_f16.c
+ * Description:  Copies the elements of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+
+/**
+  @addtogroup copy
+  @{
+ */
+
+/**
+  @brief         Copies the elements of a f16 vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_copy_f16(
+  const float16_t * pSrc,
+  float16_t * pDst,
+  uint32_t blockSize)
+{
+    do {
+        mve_pred16_t    p = vctp16q(blockSize);
+
+        vstrhq_p_f16(pDst,
+        vldrhq_z_f16((float16_t const *) pSrc, p), p);
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+        blockSize -= 8;
+    }
+    while ((int32_t) blockSize > 0);
+}
+
+#else
+
+void arm_copy_f16(
+  const float16_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+    *pDst++ = *pSrc++;
+    *pDst++ = *pSrc++;
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicCopy group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c
@ -0,0 +1,192 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_copy_f32.c
+ * Description:  Copies the elements of a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @defgroup copy Vector Copy
+
+  Copies sample by sample from source vector to destination vector.
+
+  <pre>
+      pDst[n] = pSrc[n];   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+  @addtogroup copy
+  @{
+ */
+
+/**
+  @brief         Copies the elements of a floating-point vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_copy_f32(
+  const float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time */
+  while (blkCnt > 0U)
+  {
+      vstrwq_f32(pDst, vldrwq_f32(pSrc));
+      /*
+       * Decrement the blockSize loop counter
+       * Advance vector source and destination pointers
+       */
+      pSrc += 4;
+      pDst += 4;
+      blkCnt --;
+  }
+
+  blkCnt = blockSize & 3;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+    
+}
+
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_copy_f32(
+  const float32_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* loop counter */
+
+  float32x4_t inV;
+
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+    /* Copy and then store the results in the destination buffer */
+    inV = vld1q_f32(pSrc);
+    vst1q_f32(pDst, inV);
+    pSrc += 4;
+    pDst += 4;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize & 3;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+    /* Copy and then store the results in the destination buffer */
+    *pDst++ = *pSrc++;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_copy_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+    *pDst++ = *pSrc++;
+    *pDst++ = *pSrc++;
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of BasicCopy group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f64.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_f64.c
@ -0,0 +1,71 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_copy_f64.c
+ * Description:  Copies the elements of a floating-point vector
+ *
+ * $Date:        13 September 2021
+ * $Revision:    V1.10.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup copy
+  @{
+ */
+
+/**
+  @brief         Copies the elements of a floating-point vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+void arm_copy_f64(
+  const float64_t * pSrc,
+        float64_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+
+/**
+  @} end of BasicCopy group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c
@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_copy_q15.c
+ * Description:  Copies the elements of a Q15 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup copy
+  @{
+ */
+
+/**
+  @brief         Copies the elements of a Q15 vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_copy_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;
+
+  blkCnt = blockSize >> 3;
+  while (blkCnt > 0U)
+  {
+      vstrhq_s16(pDst,vldrhq_s16(pSrc));
+      /*
+       * Decrement the blockSize loop counter
+       * Advance vector source and destination pointers
+       */
+      pSrc += 8;
+      pDst += 8;
+      blkCnt --;
+  }
+
+  blkCnt = blockSize & 7;
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_copy_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* read 2 times 2 samples at a time */
+    write_q15x2_ia (&pDst, read_q15x2_ia (&pSrc));
+    write_q15x2_ia (&pDst, read_q15x2_ia (&pSrc));
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicCopy group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c
@ -0,0 +1,135 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_copy_q31.c
+ * Description:  Copies the elements of a Q31 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup copy
+  @{
+ */
+
+/**
+  @brief         Copies the elements of a Q31 vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_copy_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time */
+  while (blkCnt > 0U)
+  {
+        vstrwq_s32(pDst,vldrwq_s32(pSrc));
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+        blkCnt --;
+  }
+
+  blkCnt = blockSize & 3;
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+    
+}
+
+#else
+void arm_copy_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+    *pDst++ = *pSrc++;
+    *pDst++ = *pSrc++;
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicCopy group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c
@ -0,0 +1,132 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_copy_q7.c
+ * Description:  Copies the elements of a Q7 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup copy
+  @{
+ */
+
+/**
+  @brief         Copies the elements of a Q7 vector.
+  @param[in]     pSrc       points to input vector
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_copy_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+
+  uint32_t blkCnt;  
+
+  blkCnt = blockSize >> 4;
+  while (blkCnt > 0U)
+  {
+
+        vstrbq_s8(pDst,vldrbq_s8(pSrc));
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pSrc += 16;
+        pDst += 16;
+        blkCnt --;
+  }
+
+  blkCnt = blockSize & 0xF;
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+
+#else
+void arm_copy_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* read 4 samples at a time */
+    write_q7x4_ia (&pDst, read_q7x4_ia (&pSrc));
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A */
+
+    /* Copy and store result in destination buffer */
+    *pDst++ = *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of BasicCopy group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c
@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_float_to_q15.c
+ * Description:  Converts the elements of the floating-point vector to Q15 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+ * @defgroup f16_to_x  Convert 16-bit floating point value
+ */
+
+/**
+  @addtogroup f16_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the f16 vector to f32 vector.
+  @param[in]     pSrc       points to the f16 input vector
+  @param[out]    pDst       points to the f32 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_f16_to_float built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(__CMSIS_GCC_H)
+
+void arm_f16_to_float(
+  const float16_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    int32_t  blkCnt;           /* loop counters */
+    float16x8_t vecDst;
+    float32x4x2_t tmp;
+
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0)
+    {
+        vecDst = vldrhq_f16(pSrc);          
+        pSrc += 8;
+
+        tmp.val[0] = vcvtbq_f32_f16(vecDst);
+        tmp.val[1] = vcvttq_f32_f16(vecDst);
+        vst2q(pDst,tmp);
+        
+        pDst += 8;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail
+     * (will be merged thru tail predication)
+     */
+    blkCnt = blockSize & 7;
+    while (blkCnt > 0)
+    {
+
+        *pDst++ = (float32_t) *pSrc++;
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt--;
+    }
+}
+
+#else
+void arm_f16_to_float(
+  const float16_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    const float16_t *pIn = pSrc;      /* Src pointer */
+    uint32_t  blkCnt;           /* loop counter */
+
+    /*
+     * Loop over blockSize number of values
+     */
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+
+        *pDst++ = (float32_t) * pIn++;
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt--;
+    }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of f16_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c
@ -0,0 +1,157 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_float_to_q15.c
+ * Description:  Converts the elements of the floating-point vector to Q15 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup f16_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the f16 vector to Q15 vector.
+  @param[in]     pSrc       points to the f16 input vector
+  @param[out]    pDst       points to the Q15 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (q15_t)(pSrc[n] * 32768);   0 <= n < blockSize.
+  </pre>
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+
+  @note
+                   In order to apply rounding in scalar version, the library should be rebuilt with the ROUNDING macro
+                   defined in the preprocessor section of project options.
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_f16_to_q15(
+  const float16_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+    float16_t       maxQ = (float16_t) Q15_MAX;
+    float16x8_t         vecDst;
+
+
+    do {
+        mve_pred16_t    p = vctp16q(blockSize);
+
+        vecDst = vldrhq_z_f16((float16_t const *) pSrc, p);
+        /* C = A * 32767 */
+        /* convert from float to Q15 and then store the results in the destination buffer */
+        vecDst = vmulq_m(vuninitializedq_f16(), vecDst, maxQ, p);
+
+        vstrhq_p_s16(pDst,
+            vcvtaq_m(vuninitializedq_s16(), vecDst, p), p);
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pSrc += 8;
+        pDst += 8;
+        blockSize -= 8;
+    }
+    while ((int32_t) blockSize > 0);
+}
+
+#else
+
+void arm_f16_to_q15(
+  const float16_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+    const float16_t *pIn = pSrc;      /* Src pointer */
+    uint32_t  blkCnt;           /* loop counter */
+#ifdef ARM_MATH_ROUNDING
+    float16_t in;
+#endif                          /*      #ifdef ARM_MATH_ROUNDING        */
+
+    /*
+     * Loop over blockSize number of values
+     */
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+
+#ifdef ARM_MATH_ROUNDING
+
+        /*
+         * C = A * 65536
+         */
+        /*
+         * convert from float to Q31 and then store the results in the destination buffer
+         */
+        in = *pIn++;
+        in = (in * 32768.0);
+        in += in > 0.0 ? 0.5 : -0.5;
+        *pDst++ = clip_q31_to_q15((q31_t) (in));
+
+#else
+
+        /*
+         * C = A * 32768
+         */
+        /*
+         * convert from float to Q31 and then store the results in the destination buffer
+         */
+        *pDst++ = clip_q31_to_q15((q31_t) ((_Float16)*pIn++ * 32768.0f16));
+
+#endif                          /*      #ifdef ARM_MATH_ROUNDING        */
+
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt--;
+    }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of f16_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c
@ -0,0 +1,127 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_fill_f16.c
+ * Description:  Fills a constant value into a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+
+/**
+  @addtogroup Fill
+  @{
+ */
+
+/**
+  @brief         Fills a constant value into a f16 vector.
+  @param[in]     value      input value to be filled
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_fill_f16(
+  float16_t value,
+  float16_t * pDst,
+  uint32_t blockSize)
+{
+     do {
+        mve_pred16_t    p = vctp16q(blockSize);
+
+        vstrhq_p_f16(pDst,
+            vdupq_m_n_f16(vuninitializedq_f16(), value, p), p);
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pDst += 8;
+        blockSize -= 8;
+    }
+    while ((int32_t) blockSize > 0);
+}
+#else
+void arm_fill_f16(
+  float16_t value,
+  float16_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+    *pDst++ = value;
+    *pDst++ = value;
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of Fill group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c
@ -0,0 +1,189 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_fill_f32.c
+ * Description:  Fills a constant value into a floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @defgroup Fill Vector Fill
+
+  Fills the destination vector with a constant value.
+
+  <pre>
+      pDst[n] = value;   0 <= n < blockSize.
+  </pre>
+
+  There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+  @addtogroup Fill
+  @{
+ */
+
+/**
+  @brief         Fills a constant value into a floating-point vector.
+  @param[in]     value      input value to be filled
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_fill_f32(
+  float32_t value,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time */
+  while (blkCnt > 0U)
+  {
+
+    vstrwq_f32(pDst,vdupq_n_f32(value));
+    /*
+     * Decrement the blockSize loop counter
+     * Advance vector source and destination pointers
+     */
+    pDst += 4;
+    blkCnt --;
+  }
+
+  blkCnt = blockSize & 3;
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_fill_f32(
+  float32_t value,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* loop counter */
+
+
+  float32x4_t inV = vdupq_n_f32(value);
+
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+    /* Fill the value in the destination buffer */
+    vst1q_f32(pDst, inV);
+    pDst += 4;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize & 3;
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+    /* Fill the value in the destination buffer */
+    *pDst++ = value;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_fill_f32(
+  float32_t value,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+    *pDst++ = value;
+    *pDst++ = value;
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of Fill group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f64.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_f64.c
@ -0,0 +1,71 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_fill_f64.c
+ * Description:  Fills a constant value into a floating-point vector
+ *
+ * $Date:        13 September 2021
+ * $Revision:    V1.10.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Fill
+  @{
+ */
+
+/**
+  @brief         Fills a constant value into a floating-point vector.
+  @param[in]     value      input value to be filled
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+void arm_fill_f64(
+  float64_t value,
+  float64_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+
+/**
+  @} end of Fill group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c
@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_fill_q15.c
+ * Description:  Fills a constant value into a Q15 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Fill
+  @{
+ */
+
+/**
+  @brief         Fills a constant value into a Q15 vector.
+  @param[in]     value      input value to be filled
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_fill_q15(
+  q15_t value,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;  
+  blkCnt = blockSize >> 3;
+  while (blkCnt > 0U)
+  {
+
+        vstrhq_s16(pDst,vdupq_n_s16(value));
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pDst += 8;
+        blkCnt --;
+    }
+
+  blkCnt = blockSize & 7;
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+
+#else
+void arm_fill_q15(
+  q15_t value,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  q31_t packedValue;                             /* value packed to 32 bits */
+
+  /* Packing two 16 bit values to 32 bit value in order to use SIMD */
+  packedValue = __PKHBT(value, value, 16U);
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* fill 2 times 2 samples at a time */
+    write_q15x2_ia (&pDst, packedValue);
+    write_q15x2_ia (&pDst, packedValue);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of Fill group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c
@ -0,0 +1,135 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_fill_q31.c
+ * Description:  Fills a constant value into a Q31 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Fill
+  @{
+ */
+
+/**
+  @brief         Fills a constant value into a Q31 vector.
+  @param[in]     value      input value to be filled
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_fill_q31(
+        q31_t value,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t blkCnt;
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time */
+  while (blkCnt > 0U)
+  {
+
+        vstrwq_s32(pDst,vdupq_n_s32(value));
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pDst += 4;
+        blkCnt --;
+  }
+
+  blkCnt = blockSize & 3;
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+    
+}
+
+#else
+void arm_fill_q31(
+  q31_t value,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+    *pDst++ = value;
+    *pDst++ = value;
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of Fill group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c
@ -0,0 +1,133 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_fill_q7.c
+ * Description:  Fills a constant value into a Q7 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Fill
+  @{
+ */
+
+/**
+  @brief         Fills a constant value into a Q7 vector.
+  @param[in]     value      input value to be filled
+  @param[out]    pDst       points to output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_fill_q7(
+  q7_t value,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt; 
+
+  blkCnt = blockSize >> 4;
+  while (blkCnt > 0U)
+  {
+
+        vstrbq_s8(pDst,vdupq_n_s8(value));
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pDst += 16;
+        blkCnt --;
+    }
+
+  blkCnt = blockSize & 0xF;
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_fill_q7(
+  q7_t value,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  q31_t packedValue;                             /* value packed to 32 bits */
+
+  /* Packing four 8 bit values to 32 bit value in order to use SIMD */
+  packedValue = __PACKq7(value, value, value, value);
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* fill 4 samples at a time */
+    write_q7x4_ia (&pDst, packedValue);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = value */
+
+    /* Fill value in destination buffer */
+    *pDst++ = value;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of Fill group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c
@ -0,0 +1,131 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_float_to_q15.c
+ * Description:  Converts the elements of the floating-point vector to Q15 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup float_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the floating-point vector to f16 vector.
+  @param[in]     pSrc       points to the f32 input vector
+  @param[out]    pDst       points to the f16 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_float_to_f16 built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(__CMSIS_GCC_H)
+
+void arm_float_to_f16(
+  const float32_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    int32_t  blkCnt;           /* loop counters */
+    float32x4x2_t tmp;
+    float16x8_t vecDst;
+    float32_t const *pSrcVec;
+
+
+    pSrcVec = (float32_t const *) pSrc;
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0)
+    {
+        /* convert from float32 to float16 and then store the results in the destination buffer */
+        tmp = vld2q(pSrcVec);   pSrcVec += 8;
+        /* narrow / merge */
+        vecDst = vcvtbq_f16_f32(vecDst, tmp.val[0]);
+        vecDst = vcvttq_f16_f32(vecDst, tmp.val[1]);
+        vst1q(pDst, vecDst);    pDst += 8;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        tmp = vld2q(pSrcVec);
+        vecDst = vcvtbq_f16_f32(vecDst, tmp.val[0]);
+        vecDst = vcvttq_f16_f32(vecDst, tmp.val[1]);
+        vstrhq_p(pDst, vecDst, p0);
+    }
+}
+
+#else
+
+void arm_float_to_f16(
+  const float32_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+    const float32_t *pIn = pSrc;      /* Src pointer */
+    uint32_t  blkCnt;           /* loop counter */
+
+    /*
+     * Loop over blockSize number of values
+     */
+    blkCnt = blockSize;
+
+    while (blkCnt > 0U)
+    {
+
+        *pDst++ = (float16_t) * pIn++;
+        /*
+         * Decrement the loop counter
+         */
+        blkCnt--;
+    }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of float_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c
@ -0,0 +1,308 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_float_to_q15.c
+ * Description:  Converts the elements of the floating-point vector to Q15 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup float_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the floating-point vector to Q15 vector.
+  @param[in]     pSrc       points to the floating-point input vector
+  @param[out]    pDst       points to the Q15 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (q15_t)(pSrc[n] * 32768);   0 <= n < blockSize.
+  </pre>
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+
+  @note
+                   In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+                   defined in the preprocessor section of project options.
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_float_to_q15(
+  const float32_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+    uint32_t         blkCnt;
+    float32_t       maxQ = (float32_t) Q15_MAX;
+    f32x4x2_t       tmp;
+    q15x8_t         vecDst;
+#ifdef ARM_MATH_ROUNDING
+    float32_t in;
+#endif
+
+
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U) 
+    {
+        /* C = A * 32768 */
+        /* convert from float to q15 and then store the results in the destination buffer */
+        tmp = vld2q(pSrc);
+
+        tmp.val[0] = vmulq(tmp.val[0], maxQ);
+        tmp.val[1] = vmulq(tmp.val[1], maxQ);
+
+        vecDst = vqmovnbq(vecDst, vcvtaq_s32_f32(tmp.val[0]));
+        vecDst = vqmovntq(vecDst, vcvtaq_s32_f32(tmp.val[1]));
+        vst1q(pDst, vecDst);
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+        pDst += 8;
+        pSrc += 8;
+    }
+
+    blkCnt = blockSize & 7;
+    while (blkCnt > 0U)
+    {
+      /* C = A * 32768 */
+
+      /* convert from float to Q15 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+      in = (*pSrc++ * 32768.0f);
+      in += in > 0.0f ? 0.5f : -0.5f;
+      *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+      /* C = A * 32768 */
+      /* Convert from float to q15 and then store the results in the destination buffer */
+      *pDst++ = (q15_t) __SSAT((q31_t) (*pSrc++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+}
+
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_float_to_q15(
+  const float32_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  const float32_t *pIn = pSrc;                         /* Src pointer */
+  uint32_t blkCnt;                               /* loop counter */
+
+  float32x4_t inV;
+  #ifdef ARM_MATH_ROUNDING
+  float32x4_t zeroV = vdupq_n_f32(0.0f);
+  float32x4_t pHalf = vdupq_n_f32(0.5f / 32768.0f);
+  float32x4_t mHalf = vdupq_n_f32(-0.5f / 32768.0f);
+  float32x4_t r;
+  uint32x4_t cmp;
+  float32_t in;
+  #endif
+
+  int32x4_t cvt;
+  int16x4_t outV;
+
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+
+#ifdef ARM_MATH_ROUNDING
+    /* C = A * 32768 */
+    /* Convert from float to q15 and then store the results in the destination buffer */
+    inV = vld1q_f32(pIn);
+    cmp = vcgtq_f32(inV,zeroV);
+    r = vbslq_f32(cmp,pHalf,mHalf);
+    inV = vaddq_f32(inV, r);
+
+    pIn += 4;
+
+    cvt = vcvtq_n_s32_f32(inV,15);
+    outV = vqmovn_s32(cvt);
+
+    vst1_s16(pDst, outV);
+    pDst += 4;
+
+#else
+
+    /* C = A * 32768 */
+    /* Convert from float to q15 and then store the results in the destination buffer */
+    inV = vld1q_f32(pIn);
+
+    cvt = vcvtq_n_s32_f32(inV,15);
+    outV = vqmovn_s32(cvt);
+
+    vst1_s16(pDst, outV);
+    pDst += 4;
+    pIn += 4;
+
+#endif /*      #ifdef ARM_MATH_ROUNDING        */
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize & 3;
+
+  while (blkCnt > 0U)
+  {
+
+#ifdef ARM_MATH_ROUNDING
+    /* C = A * 32768 */
+    /* Convert from float to q15 and then store the results in the destination buffer */
+    in = *pIn++;
+    in = (in * 32768.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+    /* C = A * 32768 */
+    /* Convert from float to q15 and then store the results in the destination buffer */
+    *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /*      #ifdef ARM_MATH_ROUNDING        */
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_float_to_q15(
+  const float32_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const float32_t *pIn = pSrc;                         /* Source pointer */
+
+#ifdef ARM_MATH_ROUNDING
+        float32_t in;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * 32768 */
+
+    /* convert from float to Q15 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+    in = (*pIn++ * 32768.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+    in = (*pIn++ * 32768.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+    in = (*pIn++ * 32768.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+    in = (*pIn++ * 32768.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+    *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+    *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+    *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+    *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * 32768 */
+
+    /* convert from float to Q15 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+    in = (*pIn++ * 32768.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+    /* C = A * 32768 */
+    /* Convert from float to q15 and then store the results in the destination buffer */
+    *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of float_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c
@ -0,0 +1,314 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_float_to_q31.c
+ * Description:  Converts the elements of the floating-point vector to Q31 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+ * @defgroup float_to_x  Convert 32-bit floating point value
+ */
+
+/**
+  @addtogroup float_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the floating-point vector to Q31 vector.
+  @param[in]     pSrc       points to the floating-point input vector
+  @param[out]    pDst       points to the Q31 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (q31_t)(pSrc[n] * 2147483648);   0 <= n < blockSize.
+  </pre>
+
+  @par           Scaling and Overflow Behavior
+                   The function uses saturating arithmetic.
+                   Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
+
+  @note
+                   In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+                   defined in the preprocessor section of project options.
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_float_to_q31(
+  const float32_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+    uint32_t         blkCnt;
+    float32_t       maxQ = (float32_t) Q31_MAX;
+    f32x4_t         vecDst;
+#ifdef ARM_MATH_ROUNDING
+    float32_t in;
+#endif
+
+
+    blkCnt = blockSize >> 2U;
+
+    /* Compute 4 outputs at a time. */
+    while (blkCnt > 0U)
+    {
+
+        vecDst = vldrwq_f32(pSrc);
+        /* C = A * 2147483648 */
+        /* convert from float to Q31 and then store the results in the destination buffer */
+        vecDst = vmulq(vecDst, maxQ);
+
+        vstrwq_s32(pDst, vcvtaq_s32_f32(vecDst));
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pSrc += 4;
+        pDst += 4;
+        blkCnt --;
+    }
+
+    blkCnt = blockSize & 3;
+
+    while (blkCnt > 0U)
+    {
+       /* C = A * 2147483648 */
+   
+       /* convert from float to Q31 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+       in = (*pSrc++ * 2147483648.0f);
+       in += in > 0.0f ? 0.5f : -0.5f;
+       *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+       /* C = A * 2147483648 */
+       /* Convert from float to Q31 and then store the results in the destination buffer */
+       *pDst++ = clip_q63_to_q31((q63_t) (*pSrc++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+       /* Decrement loop counter */
+       blkCnt--;
+  }
+}
+#else
+#if defined(ARM_MATH_NEON)
+void arm_float_to_q31(
+  const float32_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+  const float32_t *pIn = pSrc;                         /* Src pointer */
+  uint32_t blkCnt;                               /* loop counter */
+
+  float32x4_t inV;
+  #ifdef ARM_MATH_ROUNDING
+  float32_t in;
+  float32x4_t zeroV = vdupq_n_f32(0.0f);
+  float32x4_t pHalf = vdupq_n_f32(0.5f / 2147483648.0f);
+  float32x4_t mHalf = vdupq_n_f32(-0.5f / 2147483648.0f);
+  float32x4_t r;
+  uint32x4_t cmp;
+  #endif
+
+  int32x4_t outV;
+
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+
+#ifdef ARM_MATH_ROUNDING
+
+    /* C = A * 32768 */
+    /* Convert from float to Q31 and then store the results in the destination buffer */
+    inV = vld1q_f32(pIn);
+    cmp = vcgtq_f32(inV,zeroV);
+    r = vbslq_f32(cmp,pHalf,mHalf);
+    inV = vaddq_f32(inV, r);
+
+    pIn += 4;
+
+    outV = vcvtq_n_s32_f32(inV,31);
+
+    vst1q_s32(pDst, outV);
+    pDst += 4;
+
+#else
+
+    /* C = A * 2147483648 */
+    /* Convert from float to Q31 and then store the results in the destination buffer */
+    inV = vld1q_f32(pIn);
+
+    outV = vcvtq_n_s32_f32(inV,31);
+
+    vst1q_s32(pDst, outV);
+    pDst += 4;
+    pIn += 4;
+
+#endif /*      #ifdef ARM_MATH_ROUNDING        */
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize & 3;
+
+  while (blkCnt > 0U)
+  {
+
+#ifdef ARM_MATH_ROUNDING
+
+    /* C = A * 2147483648 */
+    /* Convert from float to Q31 and then store the results in the destination buffer */
+    in = *pIn++;
+    in = (in * 2147483648.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+    /* C = A * 2147483648 */
+    /* Convert from float to Q31 and then store the results in the destination buffer */
+    *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /*      #ifdef ARM_MATH_ROUNDING        */
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+
+}
+#else
+void arm_float_to_q31(
+  const float32_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const float32_t *pIn = pSrc;                         /* Source pointer */
+
+#ifdef ARM_MATH_ROUNDING
+        float32_t in;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * 2147483648 */
+
+    /* convert from float to Q31 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+    in = (*pIn++ * 2147483648.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+    in = (*pIn++ * 2147483648.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+    in = (*pIn++ * 2147483648.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+    in = (*pIn++ * 2147483648.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+    /* C = A * 2147483648 */
+    /* Convert from float to Q31 and then store the results in the destination buffer */
+    *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+    *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+    *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+    *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * 2147483648 */
+
+    /* convert from float to Q31 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+    in = (*pIn++ * 2147483648.0f);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+    /* C = A * 2147483648 */
+    /* Convert from float to Q31 and then store the results in the destination buffer */
+    *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of float_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c
@ -0,0 +1,330 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_float_to_q7.c
+ * Description:  Converts the elements of the floating-point vector to Q7 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup float_to_x
+  @{
+ */
+
+/**
+ * @brief Converts the elements of the floating-point vector to Q7 vector.
+ * @param[in]       *pSrc points to the floating-point input vector
+ * @param[out]      *pDst points to the Q7 output vector
+ * @param[in]       blockSize length of the input vector
+ * @return none.
+ *
+ *\par Description:
+ * \par
+ * The equation used for the conversion process is:
+ * <pre>
+ * 	pDst[n] = (q7_t)(pSrc[n] * 128);   0 <= n < blockSize.
+ * </pre>
+ * \par Scaling and Overflow Behavior:
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ * \note
+ * In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+ * defined in the preprocessor section of project options.
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_float_to_q7(
+  const float32_t * pSrc,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+    uint32_t         blkCnt;     /* loop counters */
+    float32_t       maxQ = powf(2.0, 7);
+    f32x4x4_t       tmp;
+    q15x8_t         evVec, oddVec;
+    q7x16_t         vecDst;
+    float32_t const *pSrcVec;
+#ifdef ARM_MATH_ROUNDING
+    float32_t in;
+#endif
+
+    pSrcVec = (float32_t const *) pSrc;
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U) {
+        tmp = vld4q(pSrcVec);
+        pSrcVec += 16;
+        /*
+         * C = A * 128.0
+         * convert from float to q7 and then store the results in the destination buffer
+         */
+        tmp.val[0] = vmulq(tmp.val[0], maxQ);
+        tmp.val[1] = vmulq(tmp.val[1], maxQ);
+        tmp.val[2] = vmulq(tmp.val[2], maxQ);
+        tmp.val[3] = vmulq(tmp.val[3], maxQ);
+
+        /*
+         * convert and pack evens
+         */
+        evVec = vqmovnbq(evVec, vcvtaq_s32_f32(tmp.val[0]));
+        evVec = vqmovntq(evVec, vcvtaq_s32_f32(tmp.val[2]));
+        /*
+         * convert and pack odds
+         */
+        oddVec = vqmovnbq(oddVec, vcvtaq_s32_f32(tmp.val[1]));
+        oddVec = vqmovntq(oddVec, vcvtaq_s32_f32(tmp.val[3]));
+        /*
+         * merge
+         */
+        vecDst = vqmovnbq(vecDst, evVec);
+        vecDst = vqmovntq(vecDst, oddVec);
+
+        vst1q(pDst, vecDst);
+        pDst += 16;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+
+  blkCnt = blockSize & 0xF;
+  while (blkCnt > 0U)
+  {
+    /* C = A * 128 */
+
+    /* Convert from float to q7 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+    in = (*pSrcVec++ * 128);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+    *pDst++ = (q7_t) __SSAT((q31_t) (*pSrcVec++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#else
+#if defined(ARM_MATH_NEON)
+void arm_float_to_q7(
+  const float32_t * pSrc,
+  q7_t * pDst,
+  uint32_t blockSize)
+{
+  const float32_t *pIn = pSrc;                         /* Src pointer */
+  uint32_t blkCnt;                               /* loop counter */
+
+  float32x4_t inV;
+  #ifdef ARM_MATH_ROUNDING
+  float32_t in;
+  float32x4_t zeroV = vdupq_n_f32(0.0f);
+  float32x4_t pHalf = vdupq_n_f32(0.5f / 128.0f);
+  float32x4_t mHalf = vdupq_n_f32(-0.5f / 128.0f);
+  float32x4_t r;
+  uint32x4_t cmp;
+  #endif
+
+  int16x4_t cvt1,cvt2;
+  int8x8_t outV;
+
+  blkCnt = blockSize >> 3U;
+
+  /* Compute 8 outputs at a time.
+   ** a second loop below computes the remaining 1 to 7 samples. */
+  while (blkCnt > 0U)
+  {
+
+#ifdef ARM_MATH_ROUNDING
+    /* C = A * 128 */
+    /* Convert from float to q7 and then store the results in the destination buffer */
+    inV = vld1q_f32(pIn);
+    cmp = vcgtq_f32(inV,zeroV);
+    r = vbslq_f32(cmp,pHalf,mHalf);
+    inV = vaddq_f32(inV, r);
+    cvt1 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+    pIn += 4;
+
+    inV = vld1q_f32(pIn);
+    cmp = vcgtq_f32(inV,zeroV);
+    r = vbslq_f32(cmp,pHalf,mHalf);
+    inV = vaddq_f32(inV, r);
+    cvt2 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+    pIn += 4;
+    
+    outV = vqmovn_s16(vcombine_s16(cvt1,cvt2));
+    vst1_s8(pDst, outV);
+    pDst += 8;
+
+#else
+
+    /* C = A * 128 */
+    /* Convert from float to q7 and then store the results in the destination buffer */
+    inV = vld1q_f32(pIn);
+    cvt1 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+    pIn += 4;
+
+    inV = vld1q_f32(pIn);
+    cvt2 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+    pIn += 4;
+
+    outV = vqmovn_s16(vcombine_s16(cvt1,cvt2));
+
+    vst1_s8(pDst, outV);
+    pDst += 8;
+#endif /*      #ifdef ARM_MATH_ROUNDING        */
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize & 7;
+
+  while (blkCnt > 0U)
+  {
+
+#ifdef ARM_MATH_ROUNDING
+    /* C = A * 128 */
+    /* Convert from float to q7 and then store the results in the destination buffer */
+    in = *pIn++;
+    in = (in * 128);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+    /* C = A * 128 */
+    /* Convert from float to q7 and then store the results in the destination buffer */
+    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /*      #ifdef ARM_MATH_ROUNDING        */
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+}
+#else
+void arm_float_to_q7(
+  const float32_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const float32_t *pIn = pSrc;                         /* Source pointer */
+
+#ifdef ARM_MATH_ROUNDING
+        float32_t in;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * 128 */
+
+    /* Convert from float to q7 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+    in = (*pIn++ * 128);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+    in = (*pIn++ * 128);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+    in = (*pIn++ * 128);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+    in = (*pIn++ * 128);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+    *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A * 128 */
+
+    /* Convert from float to q7 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+    in = (*pIn++ * 128);
+    in += in > 0.0f ? 0.5f : -0.5f;
+    *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+    *pDst++ = (q7_t) __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of float_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
@ -0,0 +1,119 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_heap_sort_f32.c
+ * Description:  Floating point heap sort
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+
+
+static void arm_heapify(float32_t * pSrc, uint32_t n, uint32_t i, uint8_t dir)
+{
+    /* Put all the elements of pSrc in heap order */
+    uint32_t k = i; // Initialize largest/smallest as root 
+    uint32_t l = 2*i + 1; // left = 2*i + 1 
+    uint32_t r = 2*i + 2; // right = 2*i + 2 
+    float32_t temp;
+
+    if (l < n && dir==(pSrc[l] > pSrc[k]) )
+        k = l; 
+
+    if (r < n && dir==(pSrc[r] > pSrc[k]) )
+        k = r; 
+
+    if (k != i) 
+    { 
+	temp = pSrc[i];
+	pSrc[i]=pSrc[k];
+	pSrc[k]=temp;
+
+        arm_heapify(pSrc, n, k, dir); 
+    }
+}
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @private
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *               The heap sort algorithm is a comparison algorithm that
+   *               divides the input array into a sorted and an unsorted region, 
+   *               and shrinks the unsorted region by extracting the largest 
+   *               element and moving it to the sorted region. A heap data 
+   *               structure is used to find the maximum.
+   *
+   * @par          It's an in-place algorithm. In order to obtain an out-of-place
+   *               function, a memcpy of the source vector is performed.
+   */
+void arm_heap_sort_f32(
+  const arm_sort_instance_f32 * S, 
+        float32_t * pSrc, 
+        float32_t * pDst, 
+        uint32_t blockSize)
+{
+    float32_t * pA;
+    int32_t i;
+    float32_t temp;
+
+    if(pSrc != pDst) // out-of-place
+    {   
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+        pA = pDst;
+    }
+    else
+        pA = pSrc;
+
+    // Build the heap array so that the largest value is the root
+    for (i = blockSize/2 - 1; i >= 0; i--) 
+        arm_heapify(pA, blockSize, i, S->dir); 
+
+    for (i = blockSize - 1; i >= 0; i--)
+    {
+        // Swap
+	temp = pA[i];
+	pA[i] = pA[0];
+        pA[0] = temp;
+
+        // Restore heap order
+	arm_heapify(pA, i, 0, S->dir);
+    }
+}
+/**
+  @} end of Sorting group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
@ -0,0 +1,93 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_insertion_sort_f32.c
+ * Description:  Floating point insertion sort
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @private
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *               The insertion sort is a simple sorting algorithm that
+   *               reads all the element of the input array and removes one element 
+   *               at a time, finds the location it belongs in the final sorted list, 
+   *               and inserts it there. 
+   *
+   * @par          It's an in-place algorithm. In order to obtain an out-of-place
+   *               function, a memcpy of the source vector is performed.
+   */
+
+void arm_insertion_sort_f32(
+  const arm_sort_instance_f32 * S, 
+        float32_t *pSrc, 
+        float32_t* pDst, 
+        uint32_t blockSize)
+{
+    float32_t * pA;
+    uint8_t dir = S->dir;
+    uint32_t i, j;
+    float32_t temp;
+
+    if(pSrc != pDst) // out-of-place
+    {   
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+        pA = pDst;
+    }
+    else
+        pA = pSrc;
+ 
+    // Real all the element of the input array
+    for(i=0; i<blockSize; i++)
+    {
+	// Move the i-th element to the right position
+        for (j = i; j>0 && dir==(pA[j]<pA[j-1]); j--)
+        {
+	    // Swap
+            temp = pA[j];
+	    pA[j] = pA[j-1];
+	    pA[j-1] = temp;
+        }
+    }
+} 
+
+/**
+  @} end of Sorting group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
@ -0,0 +1,127 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_merge_sort_f32.c
+ * Description:  Floating point merge sort
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+
+static void topDownMerge(float32_t * pA, uint32_t begin, uint32_t middle, uint32_t end, float32_t * pB, uint8_t dir)
+{
+    /* Left  array is pA[begin:middle-1]
+     * Right Array is pA[middle:end-1] 
+     * They are merged in pB
+     */
+
+    uint32_t i = begin;
+    uint32_t j = middle;
+    uint32_t k;
+ 
+    // Read all the elements in the sublist
+    for (k = begin; k < end; k++)
+    {
+	// Merge 
+        if (i < middle && (j >= end || dir==(pA[i] <= pA[j])) )
+        {
+            pB[k] = pA[i];
+            i++;
+        }
+        else
+        {
+            pB[k] = pA[j];
+            j++;
+        }
+    }
+}
+
+static void arm_merge_sort_core_f32(float32_t * pB, uint32_t begin, uint32_t end, float32_t * pA, uint8_t dir)
+{
+    if((int32_t)end - (int32_t)begin >= 2 )           // If run size != 1 divide
+    {                                 
+        int32_t middle = (end + begin) / 2;           // Take the middle point
+
+        arm_merge_sort_core_f32(pA, begin,  middle, pB, dir);  // Sort the left part
+        arm_merge_sort_core_f32(pA, middle,    end, pB, dir);  // Sort the right part
+
+        topDownMerge(pB, begin, middle, end, pA, dir);
+    }
+}
+
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *               The merge sort algorithm is a comparison algorithm that
+   *               divide the input array in sublists and merge them to produce
+   *               longer sorted sublists until there is only one list remaining.
+   *
+   * @par          A work array is always needed. It must be allocated by the user 
+   *               linked to the instance at initialization time.
+   *
+   * @par          It's an in-place algorithm. In order to obtain an out-of-place
+   *               function, a memcpy of the source vector is performed
+   */
+
+
+void arm_merge_sort_f32(
+  const arm_merge_sort_instance_f32 * S, 
+        float32_t *pSrc, 
+        float32_t *pDst, 
+        uint32_t blockSize)
+{
+    float32_t * pA;
+
+    /* Out-of-place */ 
+    if(pSrc != pDst)
+    {
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t));
+        pA = pDst;
+    }
+    else
+        pA = pSrc;
+
+    /* A working buffer is needed */
+    memcpy(S->buffer, pSrc, blockSize*sizeof(float32_t));
+
+    arm_merge_sort_core_f32(S->buffer, 0, blockSize, pA, S->dir);
+}
+/**
+  @} end of Sorting group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c
@ -0,0 +1,53 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_merge_sort_init_f32.c
+ * Description:  Floating point merge sort initialization function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+
+  /**
+   * @param[in,out]  S            points to an instance of the sorting structure.
+   * @param[in]      dir          Sorting order.
+   * @param[in]      buffer       Working buffer.
+   */
+void arm_merge_sort_init_f32(arm_merge_sort_instance_f32 * S, arm_sort_dir dir, float32_t * buffer)
+{
+    S->dir    = dir;
+    S->buffer = buffer;
+}
+/**
+  @} end of Sorting group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c
@ -0,0 +1,155 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q15_to_float.c
+ * Description:  Converts the elements of the Q15 vector to floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q15_to_x  Convert 16-bit fixed point value
+ */
+
+/**
+  @addtogroup q15_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q15 vector to f16 vector.
+  @param[in]     pSrc       points to the Q15 input vector
+  @param[out]    pDst       points to the f16 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (float16_t) pSrc[n] / 32768;   0 <= n < blockSize.
+  </pre>
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_q15_to_f16(
+  const q15_t * pSrc,
+  float16_t * pDst,
+  uint32_t blockSize)
+{
+    int32_t  blkCnt;           /* loop counters */
+    q15x8_t vecDst;
+    q15_t const *pSrcVec;
+
+    pSrcVec = (q15_t const *) pSrc;
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0)
+    {
+        /* C = (float16_t) A / 32768 */
+        /* convert from q15 to float and then store the results in the destination buffer */
+        vecDst = vld1q(pSrcVec);   pSrcVec += 8;
+        vstrhq(pDst, vcvtq_n_f16_s16(vecDst, 15));  pDst += 8;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail
+     * (will be merged thru tail predication)
+     */
+    blkCnt = blockSize & 7;
+    if (blkCnt > 0)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecDst = vld1q(pSrcVec);   pSrcVec += 8;
+        vstrhq_p(pDst, vcvtq_n_f16_s16(vecDst, 15), p0);
+    }
+}
+#else
+
+void arm_q15_to_f16(
+  const q15_t * pSrc,
+        float16_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const q15_t *pIn = pSrc;                             /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float16_t) A / 32768 */
+
+    /* Convert from q15 to float and store result in destination buffer */
+    *pDst++ = ((_Float16) * pIn++ / 32768.0f16);
+    *pDst++ = ((_Float16) * pIn++ / 32768.0f16);
+    *pDst++ = ((_Float16) * pIn++ / 32768.0f16);
+    *pDst++ = ((_Float16) * pIn++ / 32768.0f16);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float16_t) A / 32768 */
+
+    /* Convert from q15 to float and store result in destination buffer */
+    *pDst++ = ((_Float16) *pIn++ / 32768.0f16);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of q15_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
@ -0,0 +1,207 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q15_to_float.c
+ * Description:  Converts the elements of the Q15 vector to floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q15_to_x  Convert 16-bit fixed point value
+ */
+
+/**
+  @addtogroup q15_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q15 vector to floating-point vector.
+  @param[in]     pSrc       points to the Q15 input vector
+  @param[out]    pDst       points to the floating-point output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (float32_t) pSrc[n] / 32768;   0 <= n < blockSize.
+  </pre>
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q15_to_float(
+  const q15_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;
+
+  q15x8_t vecDst;
+  q15_t const *pSrcVec;
+
+  pSrcVec = (q15_t const *) pSrc;
+  blkCnt = blockSize >> 2;
+  while (blkCnt > 0U)
+  {
+      /* C = (float32_t) A / 32768 */
+      /* convert from q15 to float and then store the results in the destination buffer */
+      vecDst = vldrhq_s32(pSrcVec);
+      pSrcVec += 4;
+      vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 15));
+      pDst += 4;
+      /*
+       * Decrement the blockSize loop counter
+       */
+      blkCnt--;
+  }
+
+  blkCnt = blockSize & 3;
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 32768 */
+
+    /* Convert from q15 to float and store result in destination buffer */
+    *pDst++ = ((float32_t) *pSrcVec++ / 32768.0f);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_q15_to_float(
+  const q15_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  const q15_t *pIn = pSrc;                             /* Src pointer */
+  uint32_t blkCnt;                               /* loop counter */
+
+  int16x8_t inV;
+  int32x4_t inV0, inV1;
+  float32x4_t outV;
+
+  blkCnt = blockSize >> 3U;
+
+  /* Compute 8 outputs at a time.
+   ** a second loop below computes the remaining 1 to 7 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 32768 */
+    /* convert from q15 to float and then store the results in the destination buffer */
+    inV = vld1q_s16(pIn);
+    pIn += 8;
+
+    inV0 = vmovl_s16(vget_low_s16(inV));
+    inV1 = vmovl_s16(vget_high_s16(inV));
+
+    outV = vcvtq_n_f32_s32(inV0,15);
+    vst1q_f32(pDst, outV);
+    pDst += 4;
+
+    outV = vcvtq_n_f32_s32(inV1,15);
+    vst1q_f32(pDst, outV);
+    pDst += 4;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize & 7;
+
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 32768 */
+    /* convert from q15 to float and then store the results in the destination buffer */
+    *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_q15_to_float(
+  const q15_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const q15_t *pIn = pSrc;                             /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 32768 */
+
+    /* Convert from q15 to float and store result in destination buffer */
+    *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+    *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+    *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+    *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 32768 */
+
+    /* Convert from q15 to float and store result in destination buffer */
+    *pDst++ = ((float32_t) *pIn++ / 32768.0f);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of q15_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c
@ -0,0 +1,182 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q15_to_q31.c
+ * Description:  Converts the elements of the Q15 vector to Q31 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup q15_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q15 vector to Q31 vector.
+  @param[in]     pSrc       points to the Q15 input vector
+  @param[out]    pDst       points to the Q31 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (q31_t) pSrc[n] << 16;   0 <= n < blockSize.
+  </pre>
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q15_to_q31(
+  const q15_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+
+  uint32_t blkCnt;
+
+  q31x4_t         vecDst;
+
+  blkCnt = blockSize>> 2;
+  while (blkCnt > 0U)
+  {
+
+        /* C = (q31_t)A << 16 */
+        /* convert from q15 to q31 and then store the results in the destination buffer */
+        /* load q15 + 32-bit widening */
+        vecDst = vldrhq_s32((q15_t const *) pSrc);
+        vecDst = vshlq_n(vecDst, 16);
+        vstrwq_s32(pDst, vecDst);
+
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pDst += 4;
+        pSrc += 4;
+        blkCnt --;
+    }
+
+  blkCnt = blockSize & 3;
+  while (blkCnt > 0U)
+  {
+    /* C = (q31_t) A << 16 */
+
+    /* Convert from q15 to q31 and store result in destination buffer */
+    *pDst++ = (q31_t) *pSrc++ << 16;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_q15_to_q31(
+  const q15_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const q15_t *pIn = pSrc;                             /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        q31_t in1, in2;
+        q31_t out1, out2, out3, out4;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q31_t)A << 16 */
+
+    /* Convert from q15 to q31 and store result in destination buffer */
+    in1 = read_q15x2_ia (&pIn);
+    in2 = read_q15x2_ia (&pIn);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+    /* extract lower 16 bits to 32 bit result */
+    out1 = in1 << 16U;
+    /* extract upper 16 bits to 32 bit result */
+    out2 = in1 & 0xFFFF0000;
+    /* extract lower 16 bits to 32 bit result */
+    out3 = in2 << 16U;
+    /* extract upper 16 bits to 32 bit result */
+    out4 = in2 & 0xFFFF0000;
+
+#else
+
+    /* extract upper 16 bits to 32 bit result */
+    out1 = in1 & 0xFFFF0000;
+    /* extract lower 16 bits to 32 bit result */
+    out2 = in1 << 16U;
+    /* extract upper 16 bits to 32 bit result */
+    out3 = in2 & 0xFFFF0000;
+    /* extract lower 16 bits to 32 bit result */
+    out4 = in2 << 16U;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    *pDst++ = out1;
+    *pDst++ = out2;
+    *pDst++ = out3;
+    *pDst++ = out4;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q31_t) A << 16 */
+
+    /* Convert from q15 to q31 and store result in destination buffer */
+    *pDst++ = (q31_t) *pIn++ << 16;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of q15_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c
@ -0,0 +1,190 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q15_to_q7.c
+ * Description:  Converts the elements of the Q15 vector to Q7 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup q15_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q15 vector to Q7 vector.
+  @param[in]     pSrc       points to the Q15 input vector
+  @param[out]    pDst       points to the Q7 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (q7_t) pSrc[n] >> 8;   0 <= n < blockSize.
+  </pre>
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q15_to_q7(
+  const q15_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8x2_t tmp;
+    q15_t const *pSrcVec;
+    q7x16_t vecDst;
+
+
+    pSrcVec = (q15_t const *) pSrc;
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        /* C = (q7_t) A >> 8 */
+        /* convert from q15 to q7 and then store the results in the destination buffer */
+        tmp = vld2q(pSrcVec);   
+        pSrcVec += 16;
+        vecDst = vqshrnbq_n_s16(vecDst, tmp.val[0], 8);
+        vecDst = vqshrntq_n_s16(vecDst, tmp.val[1], 8);
+        vst1q(pDst, vecDst);    
+        pDst += 16;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+
+  blkCnt = blockSize & 0xF;
+  while (blkCnt > 0U)
+  {
+    /* C = (q7_t) A >> 8 */
+
+    /* Convert from q15 to q7 and store result in destination buffer */
+    *pDst++ = (q7_t) (*pSrcVec++ >> 8);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_q15_to_q7(
+  const q15_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const q15_t *pIn = pSrc;                             /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+        q31_t in1, in2;
+        q31_t out1, out2;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q7_t) A >> 8 */
+
+    /* Convert from q15 to q7 and store result in destination buffer */
+#if defined (ARM_MATH_DSP)
+
+    in1 = read_q15x2_ia (&pIn);
+    in2 = read_q15x2_ia (&pIn);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+    out1 = __PKHTB(in2, in1, 16);
+    out2 = __PKHBT(in2, in1, 16);
+
+#else
+
+    out1 = __PKHTB(in1, in2, 16);
+    out2 = __PKHBT(in1, in2, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /* rotate packed value by 24 */
+    out2 = ((uint32_t) out2 << 8) | ((uint32_t) out2 >> 24);
+
+    /* anding with 0xff00ff00 to get two 8 bit values */
+    out1 = out1 & 0xFF00FF00;
+    /* anding with 0x00ff00ff to get two 8 bit values */
+    out2 = out2 & 0x00FF00FF;
+
+    /* oring two values(contains two 8 bit values) to get four packed 8 bit values */
+    out1 = out1 | out2;
+
+    /* store 4 samples at a time to destiantion buffer */
+    write_q7x4_ia (&pDst, out1);
+
+#else
+
+    *pDst++ = (q7_t) (*pIn++ >> 8);
+    *pDst++ = (q7_t) (*pIn++ >> 8);
+    *pDst++ = (q7_t) (*pIn++ >> 8);
+    *pDst++ = (q7_t) (*pIn++ >> 8);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q7_t) A >> 8 */
+
+    /* Convert from q15 to q7 and store result in destination buffer */
+    *pDst++ = (q7_t) (*pIn++ >> 8);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of q15_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c
@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q31_to_float.c
+ * Description:  Converts the elements of the Q31 vector to floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q31_to_x  Convert 32-bit fixed point value
+ */
+
+/**
+  @addtogroup q31_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q31 vector to floating-point vector.
+  @param[in]     pSrc       points to the Q31 input vector
+  @param[out]    pDst       points to the floating-point output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (float32_t) pSrc[n] / 2147483648;   0 <= n < blockSize.
+  </pre>
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q31_to_float(
+  const q31_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q31x4_t vecDst;
+    q31_t const *pSrcVec;
+
+    pSrcVec = (q31_t const *) pSrc;
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /* C = (float32_t) A / 2147483648 */
+        /* convert from q31 to float and then store the results in the destination buffer */
+        vecDst = vld1q(pSrcVec);   
+        pSrcVec += 4;
+        vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 31));  
+        pDst += 4;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail
+     * (will be merged thru tail predication)
+     */
+    blkCnt = blockSize & 3;
+    while (blkCnt > 0U)
+    {
+      /* C = (float32_t) A / 2147483648 */
+  
+      /* Convert from q31 to float and store result in destination buffer */
+      *pDst++ = ((float32_t) *pSrcVec++ / 2147483648.0f);
+  
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+}
+
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_q31_to_float(
+  const q31_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  const q31_t *pIn = pSrc;                             /* Src pointer */
+  uint32_t blkCnt;                               /* loop counter */
+
+  int32x4_t inV;
+  float32x4_t outV;
+
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 2147483648 */
+    /* Convert from q31 to float and then store the results in the destination buffer */
+    inV = vld1q_s32(pIn);
+    pIn += 4;
+
+    outV = vcvtq_n_f32_s32(inV,31);
+
+    vst1q_f32(pDst, outV);
+    pDst += 4;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize & 3;
+
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 2147483648 */
+    /* Convert from q31 to float and then store the results in the destination buffer */
+    *pDst++ = ((float32_t) * pIn++ / 2147483648.0f);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_q31_to_float(
+  const q31_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  const q31_t *pIn = pSrc;                             /* Src pointer */
+  uint32_t blkCnt;                               /* loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 2147483648 */
+
+    /* Convert from q31 to float and store result in destination buffer */
+    *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+    *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+    *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+    *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 2147483648 */
+
+    /* Convert from q31 to float and store result in destination buffer */
+    *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of q31_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c
@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q31_to_q15.c
+ * Description:  Converts the elements of the Q31 vector to Q15 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup q31_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q31 vector to Q15 vector.
+  @param[in]     pSrc       points to the Q31 input vector
+  @param[out]    pDst       points to the Q15 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (q15_t) pSrc[n] >> 16;   0 <= n < blockSize.
+  </pre>
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q31_to_q15(
+  const q31_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q31x4x2_t tmp;
+    q15x8_t vecDst;
+    q31_t const *pSrcVec;
+
+
+    pSrcVec = (q31_t const *) pSrc;
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /* C = (q15_t) A >> 16 */
+        /* convert from q31 to q15 and then store the results in the destination buffer */
+        tmp = vld2q(pSrcVec);   
+        pSrcVec += 8;
+        vecDst = vshrnbq_n_s32(vecDst, tmp.val[0], 16);
+        vecDst = vshrntq_n_s32(vecDst, tmp.val[1], 16);
+        vst1q(pDst, vecDst);    
+        pDst += 8;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 7;
+    while (blkCnt > 0U)
+    {
+      /* C = (q15_t) (A >> 16) */
+  
+      /* Convert from q31 to q15 and store result in destination buffer */
+      *pDst++ = (q15_t) (*pSrcVec++ >> 16);
+  
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+}
+
+#else
+void arm_q31_to_q15(
+  const q31_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const q31_t *pIn = pSrc;                             /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+        q31_t in1, in2, in3, in4;
+        q31_t out1, out2;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q15_t) (A >> 16) */
+
+    /* Convert from q31 to q15 and store result in destination buffer */
+#if defined (ARM_MATH_DSP)
+
+    in1 = *pIn++;
+    in2 = *pIn++;
+    in3 = *pIn++;
+    in4 = *pIn++;
+
+    /* pack two higher 16-bit values from two 32-bit values */
+#ifndef ARM_MATH_BIG_ENDIAN
+    out1 = __PKHTB(in2, in1, 16);
+    out2 = __PKHTB(in4, in3, 16);
+#else
+    out1 = __PKHTB(in1, in2, 16);
+    out2 = __PKHTB(in3, in4, 16);
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+    write_q15x2_ia (&pDst, out1);
+    write_q15x2_ia (&pDst, out2);
+
+#else
+
+    *pDst++ = (q15_t) (*pIn++ >> 16);
+    *pDst++ = (q15_t) (*pIn++ >> 16);
+    *pDst++ = (q15_t) (*pIn++ >> 16);
+    *pDst++ = (q15_t) (*pIn++ >> 16);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q15_t) (A >> 16) */
+
+    /* Convert from q31 to q15 and store result in destination buffer */
+    *pDst++ = (q15_t) (*pIn++ >> 16);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of q31_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c
@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q31_to_q7.c
+ * Description:  Converts the elements of the Q31 vector to Q7 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup q31_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q31 vector to Q7 vector.
+  @param[in]     pSrc       points to the Q31 input vector
+  @param[out]    pDst       points to the Q7 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (q7_t) pSrc[n] >> 24;   0 <= n < blockSize.
+  </pre>
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q31_to_q7(
+  const q31_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q31x4x4_t tmp;
+    q15x8_t evVec, oddVec;
+    q7x16_t  vecDst;
+    q31_t const *pSrcVec;
+
+    pSrcVec = (q31_t const *) pSrc;
+    blkCnt = blockSize >> 4;
+    while (blkCnt > 0U)
+    {
+        tmp = vld4q(pSrcVec);  
+        pSrcVec += 16;
+        /* C = (q7_t) A >> 24 */
+        /* convert from q31 to q7 and then store the results in the destination buffer */
+        /*
+         * narrow and pack evens
+         */
+        evVec = vshrnbq_n_s32(evVec, tmp.val[0], 16);
+        evVec = vshrntq_n_s32(evVec, tmp.val[2], 16);
+        /*
+         * narrow and pack odds
+         */
+        oddVec = vshrnbq_n_s32(oddVec, tmp.val[1], 16);
+        oddVec = vshrntq_n_s32(oddVec, tmp.val[3], 16);
+        /*
+         * narrow & merge
+         */
+        vecDst = vshrnbq_n_s16(vecDst, evVec, 8);
+        vecDst = vshrntq_n_s16(vecDst, oddVec, 8);
+
+        vst1q(pDst, vecDst);    
+        pDst += 16;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail
+     */
+    blkCnt = blockSize & 0xF;
+    while (blkCnt > 0U)
+    {
+      /* C = (q7_t) (A >> 24) */
+  
+      /* Convert from q31 to q7 and store result in destination buffer */
+      *pDst++ = (q7_t) (*pSrcVec++ >> 24);
+  
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+}
+#else
+void arm_q31_to_q7(
+  const q31_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const q31_t *pIn = pSrc;                             /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  q7_t out1, out2, out3, out4;
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q7_t) (A >> 24) */
+
+    /* Convert from q31 to q7 and store result in destination buffer */
+
+    out1 = (q7_t) (*pIn++ >> 24);
+    out2 = (q7_t) (*pIn++ >> 24);
+    out3 = (q7_t) (*pIn++ >> 24);
+    out4 = (q7_t) (*pIn++ >> 24);
+    write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q7_t) (A >> 24) */
+
+    /* Convert from q31 to q7 and store result in destination buffer */
+    *pDst++ = (q7_t) (*pIn++ >> 24);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of q31_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
@ -0,0 +1,218 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q7_to_float.c
+ * Description:  Converts the elements of the Q7 vector to floating-point vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q7_to_x  Convert 8-bit fixed point value
+ */
+
+/**
+  @addtogroup q7_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q7 vector to floating-point vector.
+  @param[in]     pSrc       points to the Q7 input vector
+  @param[out]    pDst       points to the floating-point output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+ @par            Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (float32_t) pSrc[n] / 128;   0 <= n < blockSize.
+  </pre>
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q7_to_float(
+  const q7_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+    uint32_t  blkCnt;           /* loop counters */
+    q7x16_t vecDst;
+    q7_t const *pSrcVec;
+
+    pSrcVec = (q7_t const *) pSrc;
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        /* C = (float32_t) A / 32768 */
+        /* convert from q7 to float and then store the results in the destination buffer */
+        vecDst = vldrbq_s32(pSrcVec);    
+        pSrcVec += 4;
+        vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 7));   
+        pDst += 4;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+
+  blkCnt = blockSize & 3;
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 128 */
+
+    /* Convert from q7 to float and store result in destination buffer */
+    *pDst++ = ((float32_t) * pSrcVec++ / 128.0f);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#else
+#if defined(ARM_MATH_NEON)
+void arm_q7_to_float(
+  const q7_t * pSrc,
+  float32_t * pDst,
+  uint32_t blockSize)
+{
+  const q7_t *pIn = pSrc;                              /* Src pointer */
+  uint32_t blkCnt;                               /* loop counter */
+
+  int8x16_t inV;
+  int16x8_t inVLO, inVHI;
+  int32x4_t inVLL, inVLH, inVHL, inVHH;
+  float32x4_t outV;
+
+  blkCnt = blockSize >> 4U;
+
+  /* Compute 16 outputs at a time.
+   ** a second loop below computes the remaining 1 to 15 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 128 */
+    /* Convert from q7 to float and then store the results in the destination buffer */
+    inV = vld1q_s8(pIn);
+    pIn += 16;
+
+    inVLO = vmovl_s8(vget_low_s8(inV));
+    inVHI = vmovl_s8(vget_high_s8(inV));
+
+    inVLL = vmovl_s16(vget_low_s16(inVLO));
+    inVLH = vmovl_s16(vget_high_s16(inVLO));
+    inVHL = vmovl_s16(vget_low_s16(inVHI));
+    inVHH = vmovl_s16(vget_high_s16(inVHI));
+
+    outV = vcvtq_n_f32_s32(inVLL,7);
+    vst1q_f32(pDst, outV);
+    pDst += 4;
+
+    outV = vcvtq_n_f32_s32(inVLH,7);
+    vst1q_f32(pDst, outV);
+    pDst += 4;
+
+    outV = vcvtq_n_f32_s32(inVHL,7);
+    vst1q_f32(pDst, outV);
+    pDst += 4;
+
+    outV = vcvtq_n_f32_s32(inVHH,7);
+    vst1q_f32(pDst, outV);
+    pDst += 4;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 16, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize & 0xF;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 128 */
+    /* Convert from q7 to float and then store the results in the destination buffer */
+    *pDst++ = ((float32_t) * pIn++ / 128.0f);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_q7_to_float(
+  const q7_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const q7_t *pIn = pSrc;                              /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 128 */
+
+    /* Convert from q7 to float and store result in destination buffer */
+    *pDst++ = ((float32_t) * pIn++ / 128.0f);
+    *pDst++ = ((float32_t) * pIn++ / 128.0f);
+    *pDst++ = ((float32_t) * pIn++ / 128.0f);
+    *pDst++ = ((float32_t) * pIn++ / 128.0f);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (float32_t) A / 128 */
+
+    /* Convert from q7 to float and store result in destination buffer */
+    *pDst++ = ((float32_t) * pIn++ / 128.0f);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of q7_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c
@ -0,0 +1,188 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q7_to_q15.c
+ * Description:  Converts the elements of the Q7 vector to Q15 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup q7_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q7 vector to Q15 vector.
+  @param[in]     pSrc       points to the Q7 input vector
+  @param[out]    pDst       points to the Q15 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (q15_t) pSrc[n] << 8;   0 <= n < blockSize.
+  </pre>
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q7_to_q15(
+  const q7_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+
+    uint32_t  blkCnt;           /* loop counters */
+    q15x8_t vecDst;
+    q7_t const *pSrcVec;
+
+
+    pSrcVec = (q7_t const *) pSrc;
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        /* C = (q15_t) A << 8 */
+        /* convert from q7 to q15 and then store the results in the destination buffer */
+        /* load q7 + 32-bit widening */
+        vecDst = vldrbq_s16(pSrcVec);    
+        pSrcVec += 8;
+        vecDst = vecDst << 8;
+        vstrhq(pDst, vecDst);   
+        pDst += 8;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+
+  blkCnt = blockSize & 7;
+  while (blkCnt > 0U)
+  {
+    /* C = (q15_t) A << 8 */
+
+    /* Convert from q7 to q15 and store result in destination buffer */
+    *pDst++ = (q15_t) * pSrcVec++ << 8;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#else
+void arm_q7_to_q15(
+  const q7_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const q7_t *pIn = pSrc;                              /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+        q31_t in;
+        q31_t in1, in2;
+        q31_t out1, out2;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q15_t) A << 8 */
+
+    /* Convert from q7 to q15 and store result in destination buffer */
+#if defined (ARM_MATH_DSP)
+
+    in = read_q7x4_ia (&pIn);
+
+    /* rotatate in by 8 and extend two q7_t values to q15_t values */
+    in1 = __SXTB16(__ROR(in, 8));
+
+    /* extend remainig two q7_t values to q15_t values */
+    in2 = __SXTB16(in);
+
+    in1 = in1 << 8U;
+    in2 = in2 << 8U;
+
+    in1 = in1 & 0xFF00FF00;
+    in2 = in2 & 0xFF00FF00;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    out2 = __PKHTB(in1, in2, 16);
+    out1 = __PKHBT(in2, in1, 16);
+#else
+    out1 = __PKHTB(in1, in2, 16);
+    out2 = __PKHBT(in2, in1, 16);
+#endif
+
+    write_q15x2_ia (&pDst, out1);
+    write_q15x2_ia (&pDst, out2);
+
+#else
+
+    *pDst++ = (q15_t) *pIn++ << 8;
+    *pDst++ = (q15_t) *pIn++ << 8;
+    *pDst++ = (q15_t) *pIn++ << 8;
+    *pDst++ = (q15_t) *pIn++ << 8;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q15_t) A << 8 */
+
+    /* Convert from q7 to q15 and store result in destination buffer */
+    *pDst++ = (q15_t) * pIn++ << 8;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of q7_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c
@ -0,0 +1,164 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_q7_to_q31.c
+ * Description:  Converts the elements of the Q7 vector to Q31 vector
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup q7_to_x
+  @{
+ */
+
+/**
+  @brief         Converts the elements of the Q7 vector to Q31 vector.
+  @param[in]     pSrc       points to the Q7 input vector
+  @param[out]    pDst       points to the Q31 output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+
+  @par           Details
+                   The equation used for the conversion process is:
+  <pre>
+      pDst[n] = (q31_t) pSrc[n] << 24;   0 <= n < blockSize.
+  </pre>
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q7_to_q31(
+  const q7_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t blkCnt;   
+    q31x4_t         vecDst;
+
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+
+        /* C = (q31_t)A << 16 */
+        /* convert from q15 to q31 and then store the results in the destination buffer */
+        /* load q15 + 32-bit widening */
+        vecDst = vldrbq_s32((q7_t const *) pSrc);
+        vecDst = vshlq_n(vecDst, 24);
+        vstrwq_s32(pDst, vecDst);
+
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pDst += 4;
+        pSrc += 4;
+        blkCnt --;
+     }
+
+  blkCnt = blockSize & 3;
+  while (blkCnt > 0U)
+  {
+    /* C = (q31_t) A << 24 */
+
+    /* Convert from q7 to q31 and store result in destination buffer */
+    *pDst++ = (q31_t) *pSrc++ << 24;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+
+#else
+void arm_q7_to_q31(
+  const q7_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+  const q7_t *pIn = pSrc;                              /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+        q31_t in;
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q31_t) A << 24 */
+
+    /* Convert from q7 to q31 and store result in destination buffer */
+    in = read_q7x4_ia (&pIn);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+    *pDst++ = (__ROR(in, 8)) & 0xFF000000;
+    *pDst++ = (__ROR(in, 16)) & 0xFF000000;
+    *pDst++ = (__ROR(in, 24)) & 0xFF000000;
+    *pDst++ = (in & 0xFF000000);
+
+#else
+
+    *pDst++ = (in & 0xFF000000);
+    *pDst++ = (__ROR(in, 24)) & 0xFF000000;
+    *pDst++ = (__ROR(in, 16)) & 0xFF000000;
+    *pDst++ = (__ROR(in, 8)) & 0xFF000000;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (q31_t) A << 24 */
+
+    /* Convert from q7 to q31 and store result in destination buffer */
+    *pDst++ = (q31_t) * pIn++ << 24;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of q7_to_x group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_quick_sort_f32.c
+ * Description:  Floating point quick sort
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+static uint32_t arm_quick_sort_partition_f32(float32_t *pSrc, int32_t first, int32_t last, uint8_t dir)
+{
+    /* This function will be called */
+    int32_t i, j, pivot_index;
+    float32_t pivot;
+    float32_t temp;
+
+    /* The first element is the pivot */
+    pivot_index = first; 
+    pivot = pSrc[pivot_index];
+
+    /* Initialize indices for do-while loops */
+    i = first - 1;
+    j = last + 1; 
+
+    while(i < j) 
+    {
+        /* The loop will stop as soon as the indices i and j cross each other.
+         *
+         * This event will happen surely since the values of the indices are incremented and 
+         * decrement in the do-while loops that are executed at least once. 
+         * It is impossible to loop forever inside the do-while loops since the pivot is
+         * always an element of the array and the conditions cannot be always true (at least 
+         * the i-th or the j-th element will be equal to the pivot-th element).
+         * For example, in the extreme case of an ordered array the do-while loop related to i will stop
+         * at the first iteration (because pSrc[i]=pSrc[pivot] already), and the loop related to j
+         * will stop after (last-first) iterations (when j=pivot=i=first). j is returned and
+         * j+1 is going to be used as pivot by other calls of the function, until j=pivot=last. */ 
+
+        /* Move indices to the right and to the left */
+        if(dir)
+        {    
+            /* Compare left elements with pivot */
+            do
+            {
+                i++; 
+            } while (pSrc[i] < pivot && i<last);
+       
+            /* Compare right elements with pivot */
+            do
+            {
+                j--; 
+            } while (pSrc[j] > pivot);
+        }
+        else
+        {
+            /* Compare left elements with pivot */
+            do
+            {
+                i++; 
+            } while (pSrc[i] > pivot && i<last);
+        
+            /* Compare right elements with pivot */
+            do
+            {
+                j--; 
+            } while (pSrc[j] < pivot);
+        }
+
+        /* If the indices didn't cross each other */
+        if (i < j) 
+        { 
+            /* i and j are in the wrong position -> Swap */
+            temp=pSrc[i];
+            pSrc[i]=pSrc[j];
+            pSrc[j]=temp;
+        }
+    }
+
+    return j; 
+}
+
+static void arm_quick_sort_core_f32(float32_t *pSrc, int32_t first, int32_t last, uint8_t dir)
+{
+    /* If the array [first ... last] has more than one element */
+    if(first<last)
+    {
+        int32_t pivot;
+
+        /* Compute pivot */
+        pivot = arm_quick_sort_partition_f32(pSrc, first, last, dir);
+
+        /* Iterate algorithm with two sub-arrays [first ... pivot] and [pivot+1 ... last] */
+        arm_quick_sort_core_f32(pSrc, first,   pivot, dir);
+        arm_quick_sort_core_f32(pSrc, pivot+1, last,  dir);
+    }
+}
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @private
+   * @param[in]      S          points to an instance of the sorting structure.
+   * @param[in,out]  pSrc       points to the block of input data.
+   * @param[out]     pDst       points to the block of output data.
+   * @param[in]      blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *                The quick sort algorithm is a comparison algorithm that
+   *                divides the input array into two smaller sub-arrays and 
+   *                recursively sort them. An element of the array (the pivot)
+   *                is chosen, all the elements with values smaller than the 
+   *                pivot are moved before the pivot, while all elements with 
+   *                values greater than the pivot are moved after it (partition).
+   *
+   * @par
+   *                In this implementation the Hoare partition scheme has been 
+   *                used [Hoare, C. A. R. (1 January 1962). "Quicksort". The Computer
+   *                Journal. 5 (1): 10...16.] The first element has always been chosen
+   *                as the pivot. The partition algorithm guarantees that the returned
+   *                pivot is never placed outside the vector, since it is returned only 
+   *                when the pointers crossed each other. In this way it isn't 
+   *                possible to obtain empty partitions and infinite recursion is avoided.
+   *
+   * @par
+   *                It's an in-place algorithm. In order to obtain an out-of-place
+   *                function, a memcpy of the source vector is performed.
+   */
+
+void arm_quick_sort_f32(
+  const arm_sort_instance_f32 * S, 
+        float32_t * pSrc, 
+        float32_t * pDst, 
+        uint32_t blockSize)
+{
+    float32_t * pA;
+
+    /* Out-of-place */
+    if(pSrc != pDst) 
+    {   
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+        pA = pDst;
+    }
+    else
+        pA = pSrc;
+
+    arm_quick_sort_core_f32(pA, 0, blockSize-1, S->dir);
+    /* The previous function could be called recursively a maximum 
+     * of (blockSize-1) times, generating a stack consumption of 4*(blockSize-1) bytes. */
+}
+
+/**
+  @} end of Sorting group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
@ -0,0 +1,107 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_selection_sort_f32.c
+ * Description:  Floating point selection sort
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @private
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *               The Selection sort algorithm is a comparison algorithm that
+   *               divides the input array into a sorted and an unsorted sublist 
+   *               (initially the sorted sublist is empty and the unsorted sublist
+   *               is the input array), looks for the smallest (or biggest)
+   *               element in the unsorted sublist, swapping it with the leftmost
+   *               one, and moving the sublists boundary one element to the right.
+   *
+   * @par          It's an in-place algorithm. In order to obtain an out-of-place
+   *               function, a memcpy of the source vector is performed.
+   */
+
+void arm_selection_sort_f32(
+  const arm_sort_instance_f32 * S, 
+        float32_t * pSrc, 
+        float32_t * pDst, 
+        uint32_t blockSize)
+{
+    uint32_t i, j, k;
+    uint8_t dir = S->dir;
+    float32_t temp;
+
+    float32_t * pA;
+
+    if(pSrc != pDst) // out-of-place
+    {
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+        pA = pDst;
+    }
+    else
+        pA = pSrc;
+
+    /*  Move the boundary one element to the right */
+    for (i=0; i<blockSize-1; i++)
+    {
+        /* Initialize the minimum/maximum as the first element */
+        k = i;
+
+        /* Look in the unsorted list to find the minimum/maximum value */
+        for (j=i+1; j<blockSize; j++)
+        {
+            if (dir==(pA[j] < pA[k]) )
+            {
+                /* Update value */
+                k = j;
+            }
+        }
+    
+        if (k != i) 
+        {
+            /* Swap the minimum/maximum with the leftmost element */
+            temp=pA[i];
+	    pA[i]=pA[k];
+	    pA[k]=temp;
+        }
+    }
+}
+
+/**
+  @} end of Sorting group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
@ -0,0 +1,86 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sort_f32.c
+ * Description:  Floating point sort
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+
+/**
+ * @brief Generic sorting function
+ *
+ * @param[in]  S          points to an instance of the sorting structure.
+ * @param[in]  pSrc       points to the block of input data.
+ * @param[out] pDst       points to the block of output data.
+ * @param[in]  blockSize  number of samples to process.
+ */
+
+void arm_sort_f32(
+  const arm_sort_instance_f32 * S, 
+        float32_t * pSrc, 
+        float32_t * pDst, 
+        uint32_t blockSize)
+{
+    switch(S->alg)
+    {
+        case ARM_SORT_BITONIC:
+        arm_bitonic_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+
+        case ARM_SORT_BUBBLE:
+        arm_bubble_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+
+        case ARM_SORT_HEAP:
+        arm_heap_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+
+        case ARM_SORT_INSERTION:
+        arm_insertion_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+
+        case ARM_SORT_QUICK:
+        arm_quick_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+
+        case ARM_SORT_SELECTION:
+        arm_selection_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+    }
+}
+
+/**
+  @} end of Sorting group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
@ -0,0 +1,54 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sort_init_f32.c
+ * Description:  Floating point sort initialization function
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+
+  /**
+   * @param[in,out]  S            points to an instance of the sorting structure.
+   * @param[in]      alg          Selected algorithm.
+   * @param[in]      dir          Sorting order.
+   */
+void arm_sort_init_f32(arm_sort_instance_f32 * S, arm_sort_alg alg, arm_sort_dir dir)
+{
+    S->alg         = alg;
+    S->dir         = dir;
+}
+
+/**
+  @} end of Sorting group
+ */
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c
@ -0,0 +1,146 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_weighted_sum_f16.c
+ * Description:  Weighted Sum
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @defgroup weightedsum Weighted Sum
+
+  Weighted sum of values
+ */
+
+
+/**
+ * @addtogroup weightedsum
+ * @{
+ */
+
+
+/**
+ * @brief Weighted sum
+ *
+ *
+ * @param[in]    *in           Array of input values.
+ * @param[in]    *weigths      Weights
+ * @param[in]    blockSize     Number of samples in the input array.
+ * @return       Weighted sum
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+float16_t arm_weighted_sum_f16(const float16_t *in,const float16_t *weigths, uint32_t blockSize)
+{
+    _Float16       accum1, accum2;
+    float16x8_t    accum1V, accum2V;
+    float16x8_t    inV, wV;
+    const float16_t *pIn, *pW;
+    uint32_t        blkCnt;
+
+
+    pIn = in;
+    pW = weigths;
+
+
+    accum1V = vdupq_n_f16(0.0f16);
+    accum2V = vdupq_n_f16(0.0f16);
+
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0) 
+    {
+        inV = vld1q(pIn);
+        wV = vld1q(pW);
+
+        pIn += 4;
+        pW += 4;
+
+        accum1V = vfmaq(accum1V, inV, wV);
+        accum2V = vaddq(accum2V, wV);
+        blkCnt--;
+    }
+
+    accum1 = vecAddAcrossF16Mve(accum1V);
+    accum2 = vecAddAcrossF16Mve(accum2V);
+
+    blkCnt = blockSize & 7;
+    while(blkCnt > 0)
+    {
+        accum1 += (_Float16)*pIn++ * (_Float16)*pW;
+        accum2 += (_Float16)*pW++;
+        blkCnt--;
+    }
+
+
+    return (accum1 / accum2);
+}
+
+#else
+
+float16_t arm_weighted_sum_f16(const float16_t *in, const float16_t *weigths, uint32_t blockSize)
+{
+
+    _Float16 accum1, accum2;
+    const float16_t *pIn, *pW;
+    uint32_t blkCnt;
+
+
+    pIn = in;
+    pW = weigths;
+
+    accum1=0.0f16;
+    accum2=0.0f16;
+
+    blkCnt = blockSize;
+    while(blkCnt > 0)
+    {
+        accum1 += (_Float16)*pIn++ * (_Float16)*pW;
+        accum2 += (_Float16)*pW++;
+        blkCnt--;
+    }
+
+    return(accum1 / accum2);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of weightedsum group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+
--- a/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c
+++ b/Drivers/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c
@ -0,0 +1,187 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_weighted_sum_f32.c
+ * Description:  Weighted Sum
+ *
+ * $Date:        23 April 2021
+ * $Revision:    V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "dsp/support_functions.h"
+
+/**
+ * @addtogroup weightedsum
+ * @{
+ */
+
+
+/**
+ * @brief Weighted sum
+ *
+ *
+ * @param[in]    *in           Array of input values.
+ * @param[in]    *weigths      Weights
+ * @param[in]    blockSize     Number of samples in the input array.
+ * @return       Weighted sum
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uint32_t blockSize)
+{
+    float32_t       accum1, accum2;
+    f32x4_t         accum1V, accum2V;
+    f32x4_t         inV, wV;
+    const float32_t *pIn, *pW;
+    uint32_t        blkCnt;
+
+
+    pIn = in;
+    pW = weigths;
+
+
+    accum1V = vdupq_n_f32(0.0);
+    accum2V = vdupq_n_f32(0.0);
+
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0) 
+    {
+        inV = vld1q(pIn);
+        wV = vld1q(pW);
+
+        pIn += 4;
+        pW += 4;
+
+        accum1V = vfmaq(accum1V, inV, wV);
+        accum2V = vaddq(accum2V, wV);
+        blkCnt--;
+    }
+
+    accum1 = vecAddAcrossF32Mve(accum1V);
+    accum2 = vecAddAcrossF32Mve(accum2V);
+
+    blkCnt = blockSize & 3;
+    while(blkCnt > 0)
+    {
+        accum1 += *pIn++ * *pW;
+        accum2 += *pW++;
+        blkCnt--;
+    }
+
+
+    return (accum1 / accum2);
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+
+#include "NEMath.h"
+float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uint32_t blockSize)
+{
+
+    float32_t accum1, accum2;
+    float32x4_t accum1V, accum2V;
+    float32x2_t tempV;
+
+    float32x4_t inV,wV;
+
+    const float32_t *pIn, *pW;
+    uint32_t blkCnt;
+
+
+    pIn = in;
+    pW = weigths;
+
+    accum1=0.0f;
+    accum2=0.0f;
+
+    accum1V = vdupq_n_f32(0.0f);
+    accum2V = vdupq_n_f32(0.0f);
+
+    blkCnt = blockSize >> 2;
+    while(blkCnt > 0)
+    {
+        inV = vld1q_f32(pIn);
+        wV = vld1q_f32(pW);
+
+        pIn += 4; 
+        pW += 4;
+
+        accum1V = vmlaq_f32(accum1V,inV,wV);
+        accum2V = vaddq_f32(accum2V,wV);
+        blkCnt--;
+    }
+
+    tempV = vpadd_f32(vget_low_f32(accum1V),vget_high_f32(accum1V));
+    accum1 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
+
+    tempV = vpadd_f32(vget_low_f32(accum2V),vget_high_f32(accum2V));
+    accum2 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
+
+    blkCnt = blockSize & 3;
+    while(blkCnt > 0)
+    {
+        accum1 += *pIn++ * *pW;
+        accum2 += *pW++;
+        blkCnt--;
+    }
+
+
+    return(accum1 / accum2);
+}
+#else
+float32_t arm_weighted_sum_f32(const float32_t *in, const float32_t *weigths, uint32_t blockSize)
+{
+
+    float32_t accum1, accum2;
+    const float32_t *pIn, *pW;
+    uint32_t blkCnt;
+
+
+    pIn = in;
+    pW = weigths;
+
+    accum1=0.0f;
+    accum2=0.0f;
+
+    blkCnt = blockSize;
+    while(blkCnt > 0)
+    {
+        accum1 += *pIn++ * *pW;
+        accum2 += *pW++;
+        blkCnt--;
+    }
+
+    return(accum1 / accum2);
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of weightedsum group
+ */