Init commit of bundled StdPeriph-CMSIS library

4 years ago · c21da4388c
--- a/CMSIS/.gitignore
+++ b/CMSIS/.gitignore
@@ -0,0 +1,4 @@
 Documentation/
 Examples/
 Lib/
--- a/AGREEMENT.pdf
+++ b/AGREEMENT.pdf
--- a/support.htm
+++ b/support.htm
@@ -0,0 +1,243 @@
 <html>
 <head>
 <title>CMSIS Debug Support</title>
 <meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
 <meta name="GENERATOR" content="Microsoft FrontPage 6.0">
 <meta name="ProgId" content="FrontPage.Editor.Document">
 <style>
 <!--
 /*-----------------------------------------------------------
 Keil Software CHM Style Sheet
 -----------------------------------------------------------*/
 body         { color: #000000; background-color: #FFFFFF; font-size: 75%; font-family: 
               Verdana, Arial, 'Sans Serif' }
 a:link       { color: #0000FF; text-decoration: underline }
 a:visited    { color: #0000FF; text-decoration: underline }
 a:active     { color: #FF0000; text-decoration: underline }
 a:hover      { color: #FF0000; text-decoration: underline }
 h1           { font-family: Verdana; font-size: 18pt; color: #000080; font-weight: bold; 
               text-align: Center; margin-right: 3 }
 h2           { font-family: Verdana; font-size: 14pt; color: #000080; font-weight: bold; 
               background-color: #CCCCCC; margin-top: 24; margin-bottom: 3; 
               padding: 6 }
 h3           { font-family: Verdana; font-size: 10pt; font-weight: bold; background-color: 
               #CCCCCC; margin-top: 24; margin-bottom: 3; padding: 6 }
 pre          { font-family: Courier New; font-size: 10pt; background-color: #CCFFCC; 
               margin-left: 24; margin-right: 24 }
 ul           { list-style-type: square; margin-top: 6pt; margin-bottom: 0 }
 ol           { margin-top: 6pt; margin-bottom: 0 }
 li           { clear: both; margin-bottom: 6pt }
 table        { font-size: 100%; border-width: 0; padding: 0 }
 th           { color: #FFFFFF; background-color: #000080; text-align: left; vertical-align: 
               bottom; padding-right: 6pt }
 tr           { text-align: left; vertical-align: top }
 td           { text-align: left; vertical-align: top; padding-right: 6pt }
 .ToolT       { font-size: 8pt; color: #808080 }
 .TinyT       { font-size: 8pt; text-align: Center }
 code         { color: #000000; background-color: #E0E0E0; font-family: 'Courier New', Courier; 
               line-height: 120%; font-style: normal }
 /*-----------------------------------------------------------
 Notes
 -----------------------------------------------------------*/
 p.note       { font-weight: bold; clear: both; margin-bottom: 3pt; padding-top: 6pt }
 /*-----------------------------------------------------------
 Expanding/Contracting Divisions
 -----------------------------------------------------------*/
 #expand      { text-decoration: none; margin-bottom: 3pt }
 img.expand   { border-style: none; border-width: medium }
 div.expand   { display: none; margin-left: 9pt; margin-top: 0 }
 /*-----------------------------------------------------------
 Where List Tags
 -----------------------------------------------------------*/
 p.wh         { font-weight: bold; clear: both; margin-top: 6pt; margin-bottom: 3pt }
 table.wh     { width: 100% }
 td.whItem    { white-space: nowrap; font-style: italic; padding-right: 6pt; padding-bottom: 
               6pt }
 td.whDesc    { padding-bottom: 6pt }
 /*-----------------------------------------------------------
 Keil Table Tags
 -----------------------------------------------------------*/
 table.kt     { border: 1pt solid #000000 }
 th.kt        { white-space: nowrap; border-bottom: 1pt solid #000000; padding-left: 6pt; 
               padding-right: 6pt; padding-top: 4pt; padding-bottom: 4pt }
 tr.kt        {  }
 td.kt        { color: #000000; background-color: #E0E0E0; border-top: 1pt solid #A0A0A0; 
               padding-left: 6pt; padding-right: 6pt; padding-top: 2pt; 
               padding-bottom: 2pt }
 /*-----------------------------------------------------------
 -----------------------------------------------------------*/
 -->
 </style>
 </head>
 <body>
 <h1>CMSIS Debug Support</h1>
 <hr>
 <h2>Cortex-M3 ITM Debug Access</h2>
 <p>
  The Cortex-M3 incorporates the Instrumented Trace Macrocell (ITM) that provides together with 
  the Serial Viewer Output trace capabilities for the microcontroller system. The ITM has 
  32 communication channels which are able to transmit 32 / 16 / 8 bit values; two ITM 
  communication channels are used by CMSIS to output the following information:
 </p>
 <ul>
 	<li>ITM Channel 0: used for printf-style output via the debug interface.</li>
 	<li>ITM Channel 31: is reserved for RTOS kernel awareness debugging.</li>
 </ul>
 <h2>Debug IN / OUT functions</h2>
 <p>CMSIS provides the following debug functions:</p>
 <ul>
 	<li>ITM_SendChar (uses ITM channel 0)</li>
 	<li>ITM_ReceiveChar (uses global variable)</li>
 	<li>ITM_CheckChar (uses global variable)</li>
 </ul>
 <h3>ITM_SendChar</h3>
 <p>
  <strong>ITM_SendChar</strong> is used to transmit a character over ITM channel 0 from 
  the microcontroller system to the debug system. <br>
  Only an 8-bit value is transmitted.
 </p>
 <pre>
 static __INLINE uint32_t ITM_SendChar (uint32_t ch)
 {
  /* check if debugger connected and ITM channel enabled for tracing */
  if ((CoreDebug-&gt;DEMCR &amp; CoreDebug_DEMCR_TRCENA)  &amp;&amp;
      (ITM-&gt;TCR &amp; ITM_TCR_ITMENA)                  &amp;&amp;
      (ITM-&gt;TER &amp; (1UL &lt;&lt; 0))  ) 
  {
    while (ITM-&gt;PORT[0].u32 == 0);
    ITM-&gt;PORT[0].u8 = (uint8_t)ch;
  }  
  return (ch);
 }</pre>
 <h3>ITM_ReceiveChar</h3>
 <p>
  The ITM communication channel only supports the OUT direction. For the IN direction
  a global variable is used. A simple mechanism detects whether a character has been received.
  The project under test needs to be built with debug information.
 </p>
 <p>
  The global variable <strong>ITM_RxBuffer</strong> is used to transmit an 8-bit value from the debug system
  to the microcontroller system. <strong>ITM_RxBuffer</strong> is 32 bits wide to ensure a proper handshake.
 </p>
 <pre>
 extern volatile int ITM_RxBuffer;                    /* variable to receive characters                             */
 </pre>
 <p>
  A dedicated bit pattern is used to determine if <strong>ITM_RxBuffer</strong> is empty
  or contains a valid value.
 </p>
 <pre>
 #define             ITM_RXBUFFER_EMPTY    0x5AA55AA5 /* value identifying ITM_RxBuffer is ready for next character */
 </pre>
 <p>
  <strong>ITM_ReceiveChar</strong> is used to receive an 8-bit value from the debug system. The function is non-blocking.
  It returns the received character or '-1' if no character was available.
 </p>
 <pre>
 static __INLINE int ITM_ReceiveChar (void) {
  int ch = -1;                               /* no character available */
  if (ITM_RxBuffer != ITM_RXBUFFER_EMPTY) {
    ch = ITM_RxBuffer;
    ITM_RxBuffer = ITM_RXBUFFER_EMPTY;       /* ready for next character */
  }
  return (ch); 
 }
 </pre>
 <h3>ITM_CheckChar</h3>
 <p>
  <strong>ITM_CheckChar</strong> is used to check if a character is received.
 </p>
 <pre>
 static __INLINE int ITM_CheckChar (void) {
  if (ITM_RxBuffer == ITM_RXBUFFER_EMPTY) {
    return (0);                                 /* no character available */
  } else {
    return (1);                                 /*    character available */
  }
 }</pre>
 <h2>ITM Debug Support in uVision</h2>
 <p>
  In a debug session, uVision uses the <strong>Debug (printf) Viewer</strong> window to 
  display the debug data.
 </p>
 <p>Direction microcontroller system -&gt; uVision:</p>
 <ul>
  <li>
    Characters received via ITM communication channel 0 are written in a printf style
    to <strong>Debug (printf) Viewer</strong> window.
  </li>
 </ul>
 <p>Direction uVision -&gt; microcontroller system:</p>
 <ul>
  <li>Check if <strong>ITM_RxBuffer</strong> variable is available (only performed once).</li>
  <li>Read character from <strong>Debug (printf) Viewer</strong> window.</li>
  <li>If <strong>ITM_RxBuffer</strong> empty write character to <strong>ITM_RxBuffer</strong>.</li>
 </ul>
 <p class="Note">Note</p>
 <ul>
  <li><p>The current solution does not use a buffer mechanism for transmitting the characters.</p>
  </li>
 </ul>
 <h2>RTX Kernel awareness in uVision</h2>
 <p>
  uVision / RTX are using a simple and efficient solution for RTX Kernel awareness.
  No format overhead is necessary.<br>
  The uVision debugger decodes the RTX events via the 32 / 16 / 8 bit ITM write access
  to ITM communication channel 31.
 </p>
 <p>Following RTX events are traced:</p>
 <ul>
  <li>Task Create / Delete event
    <ol>
      <li>32 bit access. Task start address is transmitted</li>
      <li>16 bit access. Task ID and Create/Delete flag are transmitted<br>
          High byte holds Create/Delete flag, Low byte holds TASK ID.
      </li>
    </ol>
  </li>
  <li>Task switch event
    <ol>
      <li>8 bit access. Task ID of current task is transmitted</li>
    </ol>
  </li>
 </ul>
 <p class="Note">Note</p>
 <ul>
  <li><p>Other RTOS information could be retrieved via memory read access in a polling mode manner.</p>
  </li>
 </ul>
 <p class="MsoNormal"><span lang="EN-GB">&nbsp;</span></p>
 <hr>
 <p class="TinyT">Copyright © KEIL - An ARM Company.<br>
 All rights reserved.<br>
 Visit our web site at <a href="http://www.keil.com">www.keil.com</a>.
 </p>
 </body>
 </html>
--- a/CMSIS/CMSIS_changes.htm
+++ b/CMSIS/CMSIS_changes.htm
@@ -0,0 +1,320 @@
 <html>
 <head>
 <title>CMSIS Changes</title>
 <meta http-equiv="Content-Type" content="text/html; charset=windows-1252">
 <meta name="GENERATOR" content="Microsoft FrontPage 6.0">
 <meta name="ProgId" content="FrontPage.Editor.Document">
 <style>
 <!--
 /*-----------------------------------------------------------
 Keil Software CHM Style Sheet
 -----------------------------------------------------------*/
 body         { color: #000000; background-color: #FFFFFF; font-size: 75%; font-family: 
               Verdana, Arial, 'Sans Serif' }
 a:link       { color: #0000FF; text-decoration: underline }
 a:visited    { color: #0000FF; text-decoration: underline }
 a:active     { color: #FF0000; text-decoration: underline }
 a:hover      { color: #FF0000; text-decoration: underline }
 h1           { font-family: Verdana; font-size: 18pt; color: #000080; font-weight: bold; 
               text-align: Center; margin-right: 3 }
 h2           { font-family: Verdana; font-size: 14pt; color: #000080; font-weight: bold; 
               background-color: #CCCCCC; margin-top: 24; margin-bottom: 3; 
               padding: 6 }
 h3           { font-family: Verdana; font-size: 10pt; font-weight: bold; background-color: 
               #CCCCCC; margin-top: 24; margin-bottom: 3; padding: 6 }
 pre          { font-family: Courier New; font-size: 10pt; background-color: #CCFFCC; 
               margin-left: 24; margin-right: 24 }
 ul           { list-style-type: square; margin-top: 6pt; margin-bottom: 0 }
 ol           { margin-top: 6pt; margin-bottom: 0 }
 li           { clear: both; margin-bottom: 6pt }
 table        { font-size: 100%; border-width: 0; padding: 0 }
 th           { color: #FFFFFF; background-color: #000080; text-align: left; vertical-align: 
               bottom; padding-right: 6pt }
 tr           { text-align: left; vertical-align: top }
 td           { text-align: left; vertical-align: top; padding-right: 6pt }
 .ToolT       { font-size: 8pt; color: #808080 }
 .TinyT       { font-size: 8pt; text-align: Center }
 code         { color: #000000; background-color: #E0E0E0; font-family: 'Courier New', Courier; 
               line-height: 120%; font-style: normal }
 /*-----------------------------------------------------------
 Notes
 -----------------------------------------------------------*/
 p.note       { font-weight: bold; clear: both; margin-bottom: 3pt; padding-top: 6pt }
 /*-----------------------------------------------------------
 Expanding/Contracting Divisions
 -----------------------------------------------------------*/
 #expand      { text-decoration: none; margin-bottom: 3pt }
 img.expand   { border-style: none; border-width: medium }
 div.expand   { display: none; margin-left: 9pt; margin-top: 0 }
 /*-----------------------------------------------------------
 Where List Tags
 -----------------------------------------------------------*/
 p.wh         { font-weight: bold; clear: both; margin-top: 6pt; margin-bottom: 3pt }
 table.wh     { width: 100% }
 td.whItem    { white-space: nowrap; font-style: italic; padding-right: 6pt; padding-bottom: 
               6pt }
 td.whDesc    { padding-bottom: 6pt }
 /*-----------------------------------------------------------
 Keil Table Tags
 -----------------------------------------------------------*/
 table.kt     { border: 1pt solid #000000 }
 th.kt        { white-space: nowrap; border-bottom: 1pt solid #000000; padding-left: 6pt; 
               padding-right: 6pt; padding-top: 4pt; padding-bottom: 4pt }
 tr.kt        {  }
 td.kt        { color: #000000; background-color: #E0E0E0; border-top: 1pt solid #A0A0A0; 
               padding-left: 6pt; padding-right: 6pt; padding-top: 2pt; 
               padding-bottom: 2pt }
 /*-----------------------------------------------------------
 -----------------------------------------------------------*/
 -->
 </style>
 </head>
 <body>
 <h1>Changes to CMSIS version V1.20</h1>
 <hr>
 <h2>1. Removed CMSIS Middleware packages</h2>
 <p>
  CMSIS Middleware is on hold from the ARM side until an agreement between all CMSIS partners is found.
 </p>
 <h2>2. SystemFrequency renamed to SystemCoreClock</h2>
 <p>
  The variable name <strong>SystemCoreClock</strong> is more precise than <strong>SystemFrequency</strong>
  because the variable holds the clock value at which the core is running.
 </p>
 <h2>3. Changed startup concept</h2>
 <p>
  The old startup concept (calling SystemInit_ExtMemCtl from startup file and calling SystemInit 
  from main) has the weakness that it does not work for controllers which need an already 
  configured clock system to configure the external memory controller.
 </p>
 <h3>Changed startup concept</h3>
 <ul>
  <li>
    SystemInit() is called from startup file before <strong>premain</strong>.
  </li>
  <li>
    <strong>SystemInit()</strong> configures the clock system and also configures
    an existing external memory controller.
  </li>
  <li>
    <strong>SystemInit()</strong> must not use global variables.
  </li>
  <li>
    <strong>SystemCoreClock</strong> is initialized with a correct predefined value.
  </li>
  <li>
    Additional function <strong>void SystemCoreClockUpdate (void)</strong> is provided.<br>
   <strong>SystemCoreClockUpdate()</strong> updates the variable <strong>SystemCoreClock</strong>
   and must be called whenever the core clock is changed.<br>
   <strong>SystemCoreClockUpdate()</strong> evaluates the clock register settings and calculates
   the current core clock.
  </li>
 </ul>
 <h2>4. Advanced Debug Functions</h2>
 <p>
  ITM communication channel is only capable for OUT direction. To allow also communication for
  IN direction a simple concept is provided.
 </p>
 <ul>
  <li>
    Global variable <strong>volatile int ITM_RxBuffer</strong> used for IN data.
  </li>
  <li>
    Function <strong>int ITM_CheckChar (void)</strong> checks if a new character is available.
  </li>
  <li>
    Function <strong>int ITM_ReceiveChar (void)</strong> retrieves the new character.
  </li>
 </ul>
 <p>
  For detailed explanation see file <strong>CMSIS debug support.htm</strong>. 
 </p>
 <h2>5. Core Register Bit Definitions</h2>
 <p>
  Files core_cm3.h and core_cm0.h now contain bit definitions for the Core Registers. The names of the
  defines correspond to the Cortex-M Technical Reference Manual.
 </p>
 <p>
  e.g. SysTick structure with bit definitions
 </p>
 <pre>
 /** @addtogroup CMSIS_CM3_SysTick CMSIS CM3 SysTick
  memory mapped structure for SysTick
  @{
 */
 typedef struct
 {
  __IO uint32_t CTRL;                         /*!< Offset: 0x00  SysTick Control and Status Register */
  __IO uint32_t LOAD;                         /*!< Offset: 0x04  SysTick Reload Value Register       */
  __IO uint32_t VAL;                          /*!< Offset: 0x08  SysTick Current Value Register      */
  __I  uint32_t CALIB;                        /*!< Offset: 0x0C  SysTick Calibration Register        */
 } SysTick_Type;
 /* SysTick Control / Status Register Definitions */
 #define SysTick_CTRL_COUNTFLAG_Pos         16                                             /*!< SysTick CTRL: COUNTFLAG Position */
 #define SysTick_CTRL_COUNTFLAG_Msk         (1ul << SysTick_CTRL_COUNTFLAG_Pos)            /*!< SysTick CTRL: COUNTFLAG Mask */
 #define SysTick_CTRL_CLKSOURCE_Pos          2                                             /*!< SysTick CTRL: CLKSOURCE Position */
 #define SysTick_CTRL_CLKSOURCE_Msk         (1ul << SysTick_CTRL_CLKSOURCE_Pos)            /*!< SysTick CTRL: CLKSOURCE Mask */
 #define SysTick_CTRL_TICKINT_Pos            1                                             /*!< SysTick CTRL: TICKINT Position */
 #define SysTick_CTRL_TICKINT_Msk           (1ul << SysTick_CTRL_TICKINT_Pos)              /*!< SysTick CTRL: TICKINT Mask */
 #define SysTick_CTRL_ENABLE_Pos             0                                             /*!< SysTick CTRL: ENABLE Position */
 #define SysTick_CTRL_ENABLE_Msk            (1ul << SysTick_CTRL_ENABLE_Pos)               /*!< SysTick CTRL: ENABLE Mask */
 /* SysTick Reload Register Definitions */
 #define SysTick_LOAD_RELOAD_Pos             0                                             /*!< SysTick LOAD: RELOAD Position */
 #define SysTick_LOAD_RELOAD_Msk            (0xFFFFFFul << SysTick_LOAD_RELOAD_Pos)        /*!< SysTick LOAD: RELOAD Mask */
 /* SysTick Current Register Definitions */
 #define SysTick_VAL_CURRENT_Pos             0                                             /*!< SysTick VAL: CURRENT Position */
 #define SysTick_VAL_CURRENT_Msk            (0xFFFFFFul << SysTick_VAL_CURRENT_Pos)        /*!< SysTick VAL: CURRENT Mask */
 /* SysTick Calibration Register Definitions */
 #define SysTick_CALIB_NOREF_Pos            31                                             /*!< SysTick CALIB: NOREF Position */
 #define SysTick_CALIB_NOREF_Msk            (1ul << SysTick_CALIB_NOREF_Pos)               /*!< SysTick CALIB: NOREF Mask */
 #define SysTick_CALIB_SKEW_Pos             30                                             /*!< SysTick CALIB: SKEW Position */
 #define SysTick_CALIB_SKEW_Msk             (1ul << SysTick_CALIB_SKEW_Pos)                /*!< SysTick CALIB: SKEW Mask */
 #define SysTick_CALIB_TENMS_Pos             0                                             /*!< SysTick CALIB: TENMS Position */
 #define SysTick_CALIB_TENMS_Msk            (0xFFFFFFul << SysTick_CALIB_TENMS_Pos)        /*!< SysTick CALIB: TENMS Mask */
 /*@}*/ /* end of group CMSIS_CM3_SysTick */</pre>
 <h2>7. DoxyGen Tags</h2>
 <p>
  DoxyGen tags in files core_cm3.[c,h] and core_cm0.[c,h] are reworked to create proper documentation
  using DoxyGen.
 </p>
 <h2>8. Folder Structure</h2>
 <p>
  The folder structure is changed to differentiate the single support packages.
 </p>
  <ul>
    <li>CM0</li>
    <li>CM3
       <ul>
         <li>CoreSupport</li>
         <li>DeviceSupport
           <ul>
             <li>Vendor 
               <ul>
                 <li>Device
                   <ul>
                      <li>Startup
                        <ul>
                          <li>Toolchain</li>
                          <li>Toolchain</li>
                          <li>...</li>
                        </ul>
                      </li>
                   </ul>
                 </li>
                 <li>Device</li>
                 <li>...</li>
               </ul>
             </li>
             <li>Vendor</li>
             <li>...</li>
           </ul>
         </li>
         <li>Example
           <ul>
             <li>Toolchain 
               <ul>
                 <li>Device</li>
                 <li>Device</li>
                 <li>...</li>
               </ul>
             </li>
             <li>Toolchain</li>
             <li>...</li>
           </ul>
         </li>
       </ul>
    </li>
    <li>Documentation</li>
  </ul>
 <h2>9. Open Points</h2>
 <p>
  Following points need to be clarified and solved:
 </p>
 <ul>
  <li>
    <p>
      Equivalent C and Assembler startup files.
    </p>
    <p>
      Is there a need for having C startup files although assembler startup files are
      very efficient and do not need to be changed?
    </p>
  </li>
  <li>
    <p>
      Placing of HEAP in external RAM.
    </p>
    <p>
      It must be possible to place HEAP in external RAM if the device supports an 
      external memory controller.
    </p>
  </li>
  <li>
    <p>
      Placing of STACK /HEAP.
    </p>
    <p>
      STACK should always be placed at the end of internal RAM.
    </p>
    <p>
      If HEAP is placed in internal RAM then it should be placed after RW ZI section.
    </p>
  </li>
  <li>
    <p>
      Removing core_cm3.c and core_cm0.c.
    </p>
    <p>
      In the long term the functions in core_cm3.c and core_cm0.c must be replaced with 
      appropriate compiler intrinsics.
    </p>
  </li>
 </ul>
 <h2>10. Limitations</h2>
 <p>
  The following limitations are not covered with the current CMSIS version:
 </p>
 <ul>
 <li>
  No <strong>C startup files</strong> for ARM toolchain are provided. 
 </li>
 <li>
  No <strong>C startup files</strong> for GNU toolchain are provided. 
 </li>
 <li>
  No <strong>C startup files</strong> for IAR toolchain are provided. 
 </li>
 <li>
  No <strong>Tasking</strong> projects are provided yet. 
 </li>
 </ul>
--- a/CMSIS/DSP_Lib/Source/ARM/arm_cortexM0x_math.uvopt
+++ b/CMSIS/DSP_Lib/Source/ARM/arm_cortexM0x_math.uvopt
--- a/CMSIS/DSP_Lib/Source/ARM/arm_cortexM0x_math.uvproj
+++ b/CMSIS/DSP_Lib/Source/ARM/arm_cortexM0x_math.uvproj
--- a/CMSIS/DSP_Lib/Source/ARM/arm_cortexM3x_math.uvopt
+++ b/CMSIS/DSP_Lib/Source/ARM/arm_cortexM3x_math.uvopt
--- a/CMSIS/DSP_Lib/Source/ARM/arm_cortexM3x_math.uvproj
+++ b/CMSIS/DSP_Lib/Source/ARM/arm_cortexM3x_math.uvproj
--- a/CMSIS/DSP_Lib/Source/ARM/arm_cortexM4x_math.uvopt
+++ b/CMSIS/DSP_Lib/Source/ARM/arm_cortexM4x_math.uvopt
--- a/CMSIS/DSP_Lib/Source/ARM/arm_cortexM4x_math.uvproj
+++ b/CMSIS/DSP_Lib/Source/ARM/arm_cortexM4x_math.uvproj
--- a/CMSIS/DSP_Lib/Source/ARM/arm_cortexMx_math_Build.bat
+++ b/CMSIS/DSP_Lib/Source/ARM/arm_cortexMx_math_Build.bat
@@ -0,0 +1,29 @@
 @REM Builds the CMSIS DSP library once per Cortex-M core / endianness / FPU variant.
 @REM Each step runs the uVision executable on one .uvproj file, selecting the
 @REM project target by name (-t) and naming a per-target text file (-o).
 @REM NOTE(review): -rb and -j0 are presumably "rebuild" and "hide GUI"; confirm
 @REM against the Keil uVision command-line documentation.
 @REM NOTE(review): the temp directory and UV4.EXE location below are hard-coded
 @REM and must exist on the build machine.
 SET TMP=C:\Temp
 SET TEMP=C:\Temp
 SET UVEXE=C:\Keil\UV4\UV4.EXE
@echo   Building DSP Library for Cortex-M0 Little Endian
 %UVEXE% -rb arm_cortexM0x_math.uvproj -t"DSP_Lib CM0 LE" -o"DSP_Lib CM0 LE.txt" -j0
@echo   Building DSP Library for Cortex-M0 Big Endian
 %UVEXE% -rb arm_cortexM0x_math.uvproj -t"DSP_Lib CM0 BE" -o"DSP_Lib CM0 BE.txt" -j0
@echo   Building DSP Library for Cortex-M3 Little Endian
 %UVEXE% -rb arm_cortexM3x_math.uvproj -t"DSP_Lib CM3 LE" -o"DSP_Lib CM3 LE.txt" -j0
@echo   Building DSP Library for Cortex-M3 Big Endian
 %UVEXE% -rb arm_cortexM3x_math.uvproj -t"DSP_Lib CM3 BE" -o"DSP_Lib CM3 BE.txt" -j0
@echo   Building DSP Library for Cortex-M4 Little Endian
 %UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 LE" -o"DSP_Lib CM4 LE.txt" -j0
@echo   Building DSP Library for Cortex-M4 Big Endian
 %UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 BE" -o"DSP_Lib CM4 BE.txt" -j0
@echo   Building DSP Library for Cortex-M4 with FPU Little Endian
 %UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 LE FPU" -o"DSP_Lib CM4 LE FPU.txt" -j0
@echo   Building DSP Library for Cortex-M4 with FPU Big Endian
 %UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 BE FPU" -o"DSP_Lib CM4 BE FPU.txt" -j0
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_abs_f32.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_abs_f32.c
@@ -0,0 +1,165 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_abs_f32.c    
 *    
 * Description:	Vector absolute value.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 #include <math.h>
 /**        
 * @ingroup groupMath        
 */
 /**        
 * @defgroup BasicAbs Vector Absolute Value        
 *        
 * Computes the absolute value of a vector on an element-by-element basis.        
 *        
 * <pre>        
 *     pDst[n] = abs(pSrc[n]),   0 <= n < blockSize.        
 * </pre>        
 *        
 * The functions support in-place computation allowing the source and
 * destination pointers to reference the same memory buffer.
 * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**        
 * @addtogroup BasicAbs        
 * @{        
 */
 /**
 * @brief Floating-point vector absolute value.
 * @param[in]       *pSrc points to the input buffer
 * @param[out]      *pDst points to the output buffer
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * Writes fabsf(pSrc[n]) to pDst[n] for every n in [0, blockSize).
 * A blockSize of zero performs no reads or writes.
 */
 void arm_abs_f32(
  float32_t * pSrc,
  float32_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* samples handled one at a time by the final loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build: process the bulk of the vector four samples per pass. */
  uint32_t quads;                                /* number of complete groups of four samples */
  for(quads = blockSize >> 2u; quads > 0u; quads--)
  {
    /* Fetch all four inputs before the first store of this pass. */
    float32_t s0 = pSrc[0];
    float32_t s1 = pSrc[1];
    float32_t s2 = pSrc[2];
    float32_t s3 = pSrc[3];
    /* C = |A| for each of the four samples */
    pDst[0] = fabsf(s0);
    pDst[1] = fabsf(s1);
    pDst[2] = fabsf(s2);
    pDst[3] = fabsf(s3);
    /* Step both buffers past the group that was just processed. */
    pSrc += 4u;
    pDst += 4u;
  }
  /* Whatever does not fill a whole group of four (0 to 3 samples) is left for the loop below. */
  remaining = blockSize & 0x3u;
 #else
  /* Cortex-M0 build: no unrolling, every sample goes through the loop below. */
  remaining = blockSize;
 #endif /*   #ifndef ARM_MATH_CM0_FAMILY   */
  for(; remaining > 0u; remaining--)
  {
    /* C = |A| */
    *pDst = fabsf(*pSrc);
    pSrc++;
    pDst++;
  }
 }
 /**        
 * @} end of BasicAbs group        
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_abs_q15.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_abs_q15.c
@@ -0,0 +1,179 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_abs_q15.c    
 *    
 * Description:	Q15 vector absolute value.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup BasicAbs    
 * @{    
 */
 /**
 * @brief Q15 vector absolute value.
 * @param[in]       *pSrc points to the input buffer
 * @param[out]      *pDst points to the output buffer
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The most negative Q15 value 0x8000 (-1.0) is saturated to the maximum
 * allowable positive value 0x7FFF instead of wrapping around.
 *
 * \par Implementation notes
 * Unless ARM_MATH_CM0_FAMILY is defined, four samples are processed per
 * iteration: each pair of results is packed with __PKHBT and written to the
 * output as a single 32-bit word (the packing order follows
 * ARM_MATH_BIG_ENDIAN), and the remaining 0 to 3 samples are processed one
 * at a time.  With ARM_MATH_CM0_FAMILY defined, a plain per-sample loop is used.
 */
 void arm_abs_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* iterations still to run */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 path */
  __SIMD32_TYPE *packedOut;                      /* 32-bit view of the output buffer */
  q15_t first;                                   /* first sample of the current pair */
  q15_t second;                                  /* second sample of the current pair */
  /* Results are stored two samples at a time through this pointer */
  packedOut = __SIMD32_CONST(pDst);
  /* Unrolled loop: four samples (two packed words) per pass */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* C = |A| for samples 0 and 1 of this group */
    first = *pSrc++;
    second = *pSrc++;
    /* Non-positive inputs are negated with saturation; positive ones pass through */
    if (first <= 0)
    {
      first = (q15_t) __QSUB16(0, first);
    }
    if (second <= 0)
    {
      second = (q15_t) __QSUB16(0, second);
    }
    /* Pack both results into one word so they reach memory in source order */
 #ifndef  ARM_MATH_BIG_ENDIAN
    *packedOut++ = __PKHBT(first, second, 16);
 #else
    *packedOut++ = __PKHBT(second, first, 16);
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
    /* C = |A| for samples 2 and 3 of this group */
    first = *pSrc++;
    second = *pSrc++;
    if (first <= 0)
    {
      first = (q15_t) __QSUB16(0, first);
    }
    if (second <= 0)
    {
      second = (q15_t) __QSUB16(0, second);
    }
 #ifndef  ARM_MATH_BIG_ENDIAN
    *packedOut++ = __PKHBT(first, second, 16);
 #else
    *packedOut++ = __PKHBT(second, first, 16);
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
  }
  /* Continue with 16-bit stores right after the last packed word */
  pDst = (q15_t *) packedOut;
  /* Leftover samples when blockSize is not a multiple of 4 */
  for (remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    /* C = |A| */
    first = *pSrc++;
    if (first <= 0)
    {
      first = (q15_t) __QSUB16(0, first);
    }
    *pDst++ = first;
  }
 #else
  /* Cortex-M0 path: one sample per pass */
  q15_t sample;                                  /* current input sample */
  for (remaining = blockSize; remaining > 0u; remaining--)
  {
    /* C = |A| */
    sample = *pSrc++;
    if (sample > 0)
    {
      *pDst++ = sample;
    }
    else if (sample == (q15_t) 0x8000)
    {
      /* -1.0 has no positive counterpart: saturate */
      *pDst++ = 0x7fff;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of BasicAbs group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_abs_q31.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_abs_q31.c
@@ -0,0 +1,130 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_abs_q31.c    
 *    
 * Description:	Q31 vector absolute value.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup BasicAbs    
 * @{    
 */
 /**
 * @brief Q31 vector absolute value.
 * @param[in]       *pSrc points to the input buffer
 * @param[out]      *pDst points to the output buffer
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The most negative Q31 value 0x80000000 (-1.0) is saturated to the maximum
 * allowable positive value 0x7FFFFFFF instead of wrapping around.
 *
 * \par Implementation notes
 * Unless ARM_MATH_CM0_FAMILY is defined, groups of four samples are handled
 * by an unrolled loop that negates with __QSUB; the samples left over
 * (all of them on Cortex-M0) go through a per-sample loop with an explicit
 * INT32_MIN check.
 */
 void arm_abs_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* iterations still to run */
  q31_t sample;                                  /* sample handled by the per-sample loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 path */
  q31_t s0, s1, s2, s3;                          /* the four samples of one group */
  /* Unrolled loop: four outputs per pass */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Fetch the whole group before any result is written */
    s0 = pSrc[0];
    s1 = pSrc[1];
    s2 = pSrc[2];
    s3 = pSrc[3];
    pSrc += 4;
    /* C = |A|; non-positive inputs are negated with saturation */
    pDst[0] = (s0 > 0) ? s0 : (q31_t) __QSUB(0, s0);
    pDst[1] = (s1 > 0) ? s1 : (q31_t) __QSUB(0, s1);
    pDst[2] = (s2 > 0) ? s2 : (q31_t) __QSUB(0, s2);
    pDst[3] = (s3 > 0) ? s3 : (q31_t) __QSUB(0, s3);
    pDst += 4;
  }
  /* 0 to 3 samples are left when blockSize is not a multiple of 4 */
  remaining = blockSize % 0x4u;
 #else
  /* Cortex-M0 path: every sample goes through the loop below */
  remaining = blockSize;
 #endif /*   #ifndef ARM_MATH_CM0_FAMILY   */
  for (; remaining > 0u; remaining--)
  {
    /* C = |A| */
    sample = *pSrc++;
    if (sample > 0)
    {
      *pDst++ = sample;
    }
    else if (sample == INT32_MIN)
    {
      /* -1.0 has no positive counterpart: saturate */
      *pDst++ = INT32_MAX;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
 }
 /**    
 * @} end of BasicAbs group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_abs_q7.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_abs_q7.c
@@ -0,0 +1,157 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_abs_q7.c    
 *    
 * Description:	Q7 vector absolute value.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupMath        
 */
 /**        
 * @addtogroup BasicAbs        
 * @{        
 */
 /**
 * @brief Q7 vector absolute value.
 * @param[in]       *pSrc points to the input buffer
 * @param[out]      *pDst points to the output buffer
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * \par Conditions for optimum performance
 *  Input and output buffers should be aligned by 32-bit
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The most negative Q7 value 0x80 (-1.0) is saturated to the maximum
 * allowable positive value 0x7F instead of wrapping around.
 *
 * \par Implementation notes
 * Unless ARM_MATH_CM0_FAMILY is defined, groups of four samples are widened
 * to 32 bits, negated with __QSUB8 when not positive and narrowed back to Q7;
 * the samples left over (all of them on Cortex-M0) go through a per-sample
 * loop with an explicit 0x80 check.
 */
 void arm_abs_q7(
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* iterations still to run */
  q7_t sample;                                   /* sample handled by the per-sample loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 path */
  q31_t w0, w1, w2, w3;                          /* group of four samples, widened to 32 bits */
  /* Unrolled loop: four outputs per pass */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Fetch the whole group before any result is written */
    w0 = (q31_t) *pSrc++;
    w1 = (q31_t) *pSrc++;
    w2 = (q31_t) *pSrc++;
    w3 = (q31_t) *pSrc++;
    /* C = |A|; only the low byte of each __QSUB8 result is kept below */
    if (w0 <= 0)
    {
      w0 = (q31_t) __QSUB8(0, w0);
    }
    if (w1 <= 0)
    {
      w1 = (q31_t) __QSUB8(0, w1);
    }
    if (w2 <= 0)
    {
      w2 = (q31_t) __QSUB8(0, w2);
    }
    if (w3 <= 0)
    {
      w3 = (q31_t) __QSUB8(0, w3);
    }
    /* Narrow back to Q7 and store */
    *pDst++ = (q7_t) w0;
    *pDst++ = (q7_t) w1;
    *pDst++ = (q7_t) w2;
    *pDst++ = (q7_t) w3;
  }
  /* 0 to 3 samples are left when blockSize is not a multiple of 4 */
  remaining = blockSize % 0x4u;
 #else
  /* Cortex-M0 path: every sample goes through the loop below */
  remaining = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for (; remaining > 0u; remaining--)
  {
    /* C = |A| */
    sample = *pSrc++;
    if (sample > 0)
    {
      *pDst++ = sample;
    }
    else if (sample == (q7_t) 0x80)
    {
      /* -1.0 has no positive counterpart: saturate */
      *pDst++ = 0x7f;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
 }
 /**        
 * @} end of BasicAbs group        
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_add_f32.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_add_f32.c
@@ -0,0 +1,150 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_add_f32.c    
 *    
 * Description:	Floating-point vector addition.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupMath        
 */
 /**        
 * @defgroup BasicAdd Vector Addition        
 *        
 * Element-by-element addition of two vectors.        
 *        
 * <pre>        
 *     pDst[n] = pSrcA[n] + pSrcB[n],   0 <= n < blockSize.        
 * </pre>        
 *        
 * There are separate functions for floating-point, Q7, Q15, and Q31 data types.        
 */
 /**        
 * @addtogroup BasicAdd        
 * @{        
 */
 /**
 * @brief Floating-point vector addition.
 * @param[in]       *pSrcA points to the first input vector
 * @param[in]       *pSrcB points to the second input vector
 * @param[out]      *pDst points to the output vector
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * \par Implementation notes
 * Unless ARM_MATH_CM0_FAMILY is defined, groups of four samples are added by
 * an unrolled loop; the samples left over (all of them on Cortex-M0) are
 * added one at a time.
 */
 void arm_add_f32(
  float32_t * pSrcA,
  float32_t * pSrcB,
  float32_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* iterations still to run */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 path */
  float32_t a0, a1, a2, a3;                      /* group of four samples from pSrcA */
  float32_t b0, b1, b2, b3;                      /* group of four samples from pSrcB */
  /* Unrolled loop: four outputs per pass */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Fetch both groups before any result is written */
    a0 = *pSrcA++;
    a1 = *pSrcA++;
    a2 = *pSrcA++;
    a3 = *pSrcA++;
    b0 = *pSrcB++;
    b1 = *pSrcB++;
    b2 = *pSrcB++;
    b3 = *pSrcB++;
    /* C = A + B */
    *pDst++ = a0 + b0;
    *pDst++ = a1 + b1;
    *pDst++ = a2 + b2;
    *pDst++ = a3 + b3;
  }
  /* 0 to 3 samples are left when blockSize is not a multiple of 4 */
  remaining = blockSize % 0x4u;
 #else
  /* Cortex-M0 path: every sample goes through the loop below */
  remaining = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for (; remaining > 0u; remaining--)
  {
    /* C = A + B */
    *pDst = *pSrcA + *pSrcB;
    pSrcA++;
    pSrcB++;
    pDst++;
  }
 }
 /**        
 * @} end of BasicAdd group        
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_add_q15.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_add_q15.c
@@ -0,0 +1,140 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_add_q15.c    
 *    
 * Description:	Q15 vector addition    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup BasicAdd    
 * @{    
 */
 /**
 * @brief Q15 vector addition.
 * @param[in]       *pSrcA points to the first input vector
 * @param[in]       *pSrcB points to the second input vector
 * @param[out]      *pDst points to the output vector
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
 *
 * \par Implementation notes
 * Unless ARM_MATH_CM0_FAMILY is defined, the vectors are accessed as 32-bit
 * words holding two Q15 samples each and added with __QADD16, four samples
 * per iteration; leftover samples are added one at a time.  With
 * ARM_MATH_CM0_FAMILY defined, each sum is formed in 32 bits and clamped to
 * 16 bits with __SSAT.
 */
 void arm_add_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* iterations still to run */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 path */
  q31_t wordA0, wordA1;                          /* two packed sample pairs from pSrcA */
  q31_t wordB0, wordB1;                          /* two packed sample pairs from pSrcB */
  /* Unrolled loop: four outputs (two packed words) per pass */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Fetch all four packed words before any result is written */
    wordA0 = *__SIMD32(pSrcA)++;
    wordB0 = *__SIMD32(pSrcB)++;
    wordA1 = *__SIMD32(pSrcA)++;
    wordB1 = *__SIMD32(pSrcB)++;
    /* C = A + B on both halfwords of each word */
    *__SIMD32(pDst)++ = __QADD16(wordA0, wordB0);
    *__SIMD32(pDst)++ = __QADD16(wordA1, wordB1);
  }
  /* Leftover samples when blockSize is not a multiple of 4 */
  for (remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    /* C = A + B */
    *pDst = (q15_t) __QADD16(*pSrcA, *pSrcB);
    pSrcA++;
    pSrcB++;
    pDst++;
  }
 #else
  /* Cortex-M0 path: one sample per pass */
  q31_t wideSum;                                 /* sum before clamping to 16 bits */
  for (remaining = blockSize; remaining > 0u; remaining--)
  {
    /* C = A + B, formed in 32 bits and then saturated to Q15 */
    wideSum = (q31_t) *pSrcA++ + *pSrcB++;
    *pDst++ = (q15_t) __SSAT(wideSum, 16);
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of BasicAdd group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_add_q31.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_add_q31.c
@@ -0,0 +1,148 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_add_q31.c    
 *    
 * Description:	Q31 vector addition.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup BasicAdd    
 * @{    
 */
 /**
 * @brief Q31 vector addition.
 * @param[in]       *pSrcA points to the first input vector
 * @param[in]       *pSrcB points to the second input vector
 * @param[out]      *pDst points to the output vector
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
 *
 * \par Implementation notes
 * Unless ARM_MATH_CM0_FAMILY is defined, samples are added with __QADD, four
 * per iteration, and leftover samples one at a time.  With
 * ARM_MATH_CM0_FAMILY defined, each sum is formed in 64 bits and reduced with
 * clip_q63_to_q31.
 */
 void arm_add_q31(
  q31_t * pSrcA,
  q31_t * pSrcB,
  q31_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* iterations still to run */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 path */
  q31_t a0, a1, a2, a3;                          /* group of four samples from pSrcA */
  q31_t b0, b1, b2, b3;                          /* group of four samples from pSrcB */
  /* Unrolled loop: four outputs per pass */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Fetch both groups before any result is written */
    a0 = pSrcA[0];
    a1 = pSrcA[1];
    a2 = pSrcA[2];
    a3 = pSrcA[3];
    pSrcA += 4;
    b0 = pSrcB[0];
    b1 = pSrcB[1];
    b2 = pSrcB[2];
    b3 = pSrcB[3];
    pSrcB += 4;
    /* C = A + B */
    pDst[0] = __QADD(a0, b0);
    pDst[1] = __QADD(a1, b1);
    pDst[2] = __QADD(a2, b2);
    pDst[3] = __QADD(a3, b3);
    pDst += 4;
  }
  /* Leftover samples when blockSize is not a multiple of 4 */
  for (remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    /* C = A + B */
    *pDst = __QADD(*pSrcA, *pSrcB);
    pSrcA++;
    pSrcB++;
    pDst++;
  }
 #else
  /* Cortex-M0 path: one sample per pass */
  q63_t wideSum;                                 /* sum before reduction to 32 bits */
  for (remaining = blockSize; remaining > 0u; remaining--)
  {
    /* C = A + B, formed in 64 bits and then clipped to Q31 */
    wideSum = (q63_t) *pSrcA++ + *pSrcB++;
    *pDst++ = (q31_t) clip_q63_to_q31(wideSum);
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of BasicAdd group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_add_q7.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_add_q7.c
@@ -0,0 +1,134 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_add_q7.c    
 *    
 * Description:	Q7 vector addition.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup BasicAdd    
 * @{    
 */
 /**
 * @brief Q7 vector addition.
 * @param[in]       *pSrcA points to the first input vector
 * @param[in]       *pSrcB points to the second input vector
 * @param[out]      *pDst points to the output vector
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
 *
 * \par Implementation notes
 * Unless ARM_MATH_CM0_FAMILY is defined, the vectors are accessed as 32-bit
 * words holding four Q7 samples each and added with __QADD8; leftover samples
 * are added one at a time and clamped to 8 bits with __SSAT.  With
 * ARM_MATH_CM0_FAMILY defined, every sample takes the __SSAT route.
 */
 void arm_add_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  q7_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* iterations still to run */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 path */
  int32_t wordA;                                 /* four packed samples from pSrcA */
  int32_t wordB;                                 /* four packed samples from pSrcB */
  /* Packed loop: four outputs (one 32-bit word) per pass */
  for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* C = A + B on all four bytes of the word */
    wordA = *__SIMD32(pSrcA)++;
    wordB = *__SIMD32(pSrcB)++;
    *__SIMD32(pDst)++ = __QADD8(wordA, wordB);
  }
  /* Leftover samples when blockSize is not a multiple of 4 */
  for (remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    /* C = A + B, saturated to Q7 */
    *pDst = (q7_t) __SSAT(*pSrcA + *pSrcB, 8);
    pSrcA++;
    pSrcB++;
    pDst++;
  }
 #else
  /* Cortex-M0 path: one sample per pass */
  for (remaining = blockSize; remaining > 0u; remaining--)
  {
    /* C = A + B, saturated to Q7 */
    *pDst = (q7_t) __SSAT((q15_t) *pSrcA + *pSrcB, 8);
    pSrcA++;
    pSrcB++;
    pDst++;
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of BasicAdd group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_dot_prod_f32.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_dot_prod_f32.c
@@ -0,0 +1,135 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_dot_prod_f32.c    
 *    
 * Description:	Floating-point dot product.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath
 */
 /**
 * @defgroup dot_prod Vector Dot Product
 *
 * Computes the dot product of two vectors.
 * The vectors are multiplied element-by-element and then summed.
 *
 * <pre>
 *     sum = pSrcA[0]*pSrcB[0] + pSrcA[1]*pSrcB[1] + ... + pSrcA[blockSize-1]*pSrcB[blockSize-1]
 * </pre>     
 *
 * There are separate functions for floating-point, Q7, Q15, and Q31 data types.    
 */
 /**    
 * @addtogroup dot_prod    
 * @{    
 */
 /**    
 * @brief Dot product of floating-point vectors.    
 * @param[in]       *pSrcA points to the first input vector    
 * @param[in]       *pSrcB points to the second input vector    
 * @param[in]       blockSize number of samples in each vector    
 * @param[out]      *result output result returned here    
 * @return none.    
 */
 void arm_dot_prod_f32(
  float32_t * pSrcA,
  float32_t * pSrcB,
  uint32_t blockSize,
  float32_t * result)
 {
  float32_t acc = 0.0f;                          /* running sum of products */
  uint32_t remaining;                            /* samples left for the scalar loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build */
  uint32_t quads;                                /* groups of four samples still to do */
  /* Unrolled pass: four products per iteration, accumulated strictly in
   ** element order so the floating-point result does not depend on the
   ** unrolling. */
  for(quads = blockSize >> 2u; quads > 0u; quads--)
  {
    acc += pSrcA[0] * pSrcB[0];
    acc += pSrcA[1] * pSrcB[1];
    acc += pSrcA[2] * pSrcB[2];
    acc += pSrcA[3] * pSrcB[3];
    /* step both inputs past the four samples just consumed */
    pSrcA += 4u;
    pSrcB += 4u;
  }
  /* 0 to 3 samples remain when blockSize is not a multiple of four */
  remaining = blockSize & 0x3u;
 #else
  /* Cortex-M0 build: every sample goes through the scalar loop */
  remaining = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  /* Scalar pass: one product per iteration */
  for(; remaining > 0u; remaining--)
  {
    acc += (*pSrcA) * (*pSrcB);
    pSrcA++;
    pSrcB++;
  }
  /* Hand the dot product back to the caller */
  *result = acc;
 }
 /**    
 * @} end of dot_prod group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_dot_prod_q15.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_dot_prod_q15.c
@@ -0,0 +1,140 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_dot_prod_q15.c    
 *    
 * Description:	Q15 dot product.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup dot_prod    
 * @{    
 */
 /**    
 * @brief Dot product of Q15 vectors.    
 * @param[in]       *pSrcA points to the first input vector    
 * @param[in]       *pSrcB points to the second input vector    
 * @param[in]       blockSize number of samples in each vector    
 * @param[out]      *result output result returned here    
 * @return none.    
 *    
 * <b>Scaling and Overflow Behavior:</b>    
 * \par    
 * The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these    
 * results are added to a 64-bit accumulator in 34.30 format.    
 * Nonsaturating additions are used and given that there are 33 guard bits in the accumulator    
 * there is no risk of overflow.    
 * The return result is in 34.30 format.    
 */
 void arm_dot_prod_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  uint32_t blockSize,
  q63_t * result)
 {
  q63_t sum = 0;                                 /* 64-bit accumulator (34.30 format) */
  uint32_t blkCnt;                               /* loop counter */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Run the below code for Cortex-M4 and Cortex-M3 */
  /* Loop unrolling: blockSize / 4 iterations, four samples per iteration. */
  blkCnt = blockSize >> 2u;
  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* Each 32-bit read fetches two packed q15 samples from each source;
     ** __SMLALD multiplies the two pairs and adds both products to sum. */
    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
    sum = __SMLALD(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++, sum);
    /* Decrement the loop counter */
    blkCnt--;
  }
  /* If the blockSize is not a multiple of 4, the remaining 1 to 3 samples
   ** are handled by the scalar loop below. */
  blkCnt = blockSize % 0x4u;
 #else
  /* Run the below code for Cortex-M0 */
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* One sample at a time, using an ordinary widening multiply.
     ** The dual-MAC intrinsic must not be used here: a single negative q15
     ** value sign-extends to 0xFFFFxxxx, so its upper halfword is -1 and two
     ** negative operands would add a spurious (-1 * -1) = +1 to the sum. */
    sum += (q63_t) ((q31_t) * pSrcA++ * *pSrcB++);
    /* Decrement the loop counter */
    blkCnt--;
  }
  /* Store the result in the destination buffer in 34.30 format */
  *result = sum;
 }
 /**    
 * @} end of dot_prod group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_dot_prod_q31.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_dot_prod_q31.c
@@ -0,0 +1,143 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_dot_prod_q31.c    
 *    
 * Description:	Q31 dot product.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup dot_prod    
 * @{    
 */
 /**    
 * @brief Dot product of Q31 vectors.    
 * @param[in]       *pSrcA points to the first input vector    
 * @param[in]       *pSrcB points to the second input vector    
 * @param[in]       blockSize number of samples in each vector    
 * @param[out]      *result output result returned here    
 * @return none.    
 *    
 * <b>Scaling and Overflow Behavior:</b>    
 * \par    
 * The intermediate multiplications are in 1.31 x 1.31 = 2.62 format and these    
 * are truncated to 2.48 format by discarding the lower 14 bits.    
 * The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.    
 * There are 15 guard bits in the accumulator and there is no risk of overflow as long as    
 * the length of the vectors is less than 2^16 elements.    
 * The return result is in 16.48 format.    
 */
 void arm_dot_prod_q31(
  q31_t * pSrcA,
  q31_t * pSrcB,
  uint32_t blockSize,
  q63_t * result)
 {
  q63_t acc = 0;                                 /* 64-bit running sum (16.48 format) */
  uint32_t remaining;                            /* samples left for the scalar loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build */
  uint32_t quads;                                /* groups of four samples still to do */
  /* Unrolled pass: four products per iteration.  Each product is formed in
   ** 64 bits and shifted right by 14 before it is added to the accumulator. */
  for(quads = blockSize >> 2u; quads > 0u; quads--)
  {
    acc += ((q63_t) pSrcA[0] * pSrcB[0]) >> 14u;
    acc += ((q63_t) pSrcA[1] * pSrcB[1]) >> 14u;
    acc += ((q63_t) pSrcA[2] * pSrcB[2]) >> 14u;
    acc += ((q63_t) pSrcA[3] * pSrcB[3]) >> 14u;
    /* step both inputs past the four samples just consumed */
    pSrcA += 4u;
    pSrcB += 4u;
  }
  /* 0 to 3 samples remain when blockSize is not a multiple of four */
  remaining = blockSize & 0x3u;
 #else
  /* Cortex-M0 build: every sample goes through the scalar loop */
  remaining = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  /* Scalar pass: same 64-bit multiply and 14-bit shift, one sample at a time */
  for(; remaining > 0u; remaining--)
  {
    acc += ((q63_t) (*pSrcA) * (*pSrcB)) >> 14u;
    pSrcA++;
    pSrcB++;
  }
  /* Hand the 16.48 result back to the caller */
  *result = acc;
 }
 /**    
 * @} end of dot_prod group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_dot_prod_q7.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_dot_prod_q7.c
@@ -0,0 +1,159 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_dot_prod_q7.c    
 *    
 * Description:	Q7 dot product.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup dot_prod    
 * @{    
 */
 /**    
 * @brief Dot product of Q7 vectors.    
 * @param[in]       *pSrcA points to the first input vector    
 * @param[in]       *pSrcB points to the second input vector    
 * @param[in]       blockSize number of samples in each vector    
 * @param[out]      *result output result returned here    
 * @return none.    
 *    
 * <b>Scaling and Overflow Behavior:</b>    
 * \par    
 * The intermediate multiplications are in 1.7 x 1.7 = 2.14 format and these    
 * results are added to an accumulator in 18.14 format.    
 * Nonsaturating additions are used and there is no danger of wrap around as long as    
 * the vectors are less than 2^18 elements long.    
 * The return result is in 18.14 format.    
 */
 void arm_dot_prod_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  uint32_t blockSize,
  q31_t * result)
 {
  uint32_t blkCnt;                               /* loop counter */
  q31_t sum = 0;                                 /* 32-bit accumulator (18.14 format) */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t input1, input2;                          /* four packed q7 samples from each source */
  q31_t inA1, inA2, inB1, inB2;                  /* sample pairs widened to packed 16-bit values */
  /* Loop unrolling: blockSize / 4 iterations, four samples per iteration. */
  blkCnt = blockSize >> 2u;
  while(blkCnt > 0u)
  {
    /* read 4 samples at a time from sourceA */
    input1 = *__SIMD32(pSrcA)++;
    /* read 4 samples at a time from sourceB */
    input2 = *__SIMD32(pSrcB)++;
    /* extract two q7_t samples to q15_t samples (after rotating by one byte) */
    inA1 = __SXTB16(__ROR(input1, 8));
    /* extract remaining two samples */
    inA2 = __SXTB16(input1);
    /* extract two q7_t samples to q15_t samples (after rotating by one byte) */
    inB1 = __SXTB16(__ROR(input2, 8));
    /* extract remaining two samples */
    inB2 = __SXTB16(input2);
    /* multiply and accumulate two samples at a time; A and B are unpacked
     ** identically, so matching elements are paired */
    sum = __SMLAD(inA1, inB1, sum);
    sum = __SMLAD(inA2, inB2, sum);
    /* Decrement the loop counter */
    blkCnt--;
  }
  /* If the blockSize is not a multiple of 4, the remaining 1 to 3 samples
   ** are handled by the scalar loop below. */
  blkCnt = blockSize % 0x4u;
 #else
  /* Run the below code for Cortex-M0 */
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  while(blkCnt > 0u)
  {
    /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
    /* One sample at a time, using an ordinary multiply.
     ** The dual-MAC intrinsic must not be used here: a single negative q7
     ** value sign-extends to 0xFFFFFFxx, so its upper halfword is -1 and two
     ** negative operands would add a spurious (-1 * -1) = +1 to the sum. */
    sum += (q31_t) ((q15_t) * pSrcA++ * *pSrcB++);
    /* Decrement the loop counter */
    blkCnt--;
  }
  /* Store the result in the destination buffer in 18.14 format */
  *result = sum;
 }
 /**    
 * @} end of dot_prod group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_mult_f32.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_mult_f32.c
@@ -0,0 +1,174 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_mult_f32.c    
 *    
 * Description:	Floating-point vector multiplication.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupMath        
 */
 /**        
 * @defgroup BasicMult Vector Multiplication        
 *        
 * Element-by-element multiplication of two vectors.        
 *        
 * <pre>        
 *     pDst[n] = pSrcA[n] * pSrcB[n],   0 <= n < blockSize.        
 * </pre>        
 *        
 * There are separate functions for floating-point, Q7, Q15, and Q31 data types.        
 */
 /**        
 * @addtogroup BasicMult        
 * @{        
 */
 /**        
 * @brief Floating-point vector multiplication.        
 * @param[in]       *pSrcA points to the first input vector        
 * @param[in]       *pSrcB points to the second input vector        
 * @param[out]      *pDst points to the output vector        
 * @param[in]       blockSize number of samples in each vector        
 * @return none.        
 */
 void arm_mult_f32(
  float32_t * pSrcA,
  float32_t * pSrcB,
  float32_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* samples left for the scalar loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build */
  float32_t a0, a1, a2, a3;                      /* four samples from pSrcA */
  float32_t b0, b1, b2, b3;                      /* four samples from pSrcB */
  uint32_t quads;                                /* groups of four samples still to do */
  /* Unrolled pass: four outputs per iteration, C = A * B */
  for(quads = blockSize >> 2u; quads > 0u; quads--)
  {
    /* fetch the operands needed before the first store */
    a0 = pSrcA[0];
    b0 = pSrcB[0];
    a1 = pSrcA[1];
    b1 = pSrcB[1];
    a2 = pSrcA[2];
    b2 = pSrcB[2];
    a3 = pSrcA[3];
    /* first output */
    pDst[0] = a0 * b0;
    /* Note: pSrcB[3] is read after the first store; the relative order of
     ** loads and stores only matters if the buffers overlap. */
    b3 = pSrcB[3];
    /* remaining three outputs */
    pDst[1] = a1 * b1;
    pDst[2] = a2 * b2;
    pDst[3] = a3 * b3;
    /* advance all three buffers past the processed group */
    pSrcA += 4u;
    pSrcB += 4u;
    pDst += 4u;
  }
  /* 0 to 3 samples remain when blockSize is not a multiple of four */
  remaining = blockSize & 0x3u;
 #else
  /* Cortex-M0 build: every sample goes through the scalar loop */
  remaining = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  /* Scalar pass: one output per iteration, C = A * B */
  for(; remaining > 0u; remaining--)
  {
    *pDst = (*pSrcA) * (*pSrcB);
    pDst++;
    pSrcA++;
    pSrcB++;
  }
 }
 /**        
 * @} end of BasicMult group        
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_mult_q15.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_mult_q15.c
@@ -0,0 +1,154 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_mult_q15.c    
 *    
 * Description:	Q15 vector multiplication.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup BasicMult    
 * @{    
 */
 /**    
 * @brief           Q15 vector multiplication    
 * @param[in]       *pSrcA points to the first input vector    
 * @param[in]       *pSrcB points to the second input vector    
 * @param[out]      *pDst points to the output vector    
 * @param[in]       blockSize number of samples in each vector    
 * @return none.    
 *    
 * <b>Scaling and Overflow Behavior:</b>    
 * \par    
 * The function uses saturating arithmetic.    
 * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.    
 */
 void arm_mult_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t blockSize)
 {
  uint32_t blkCnt;                               /* loop counters */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t inA1, inA2, inB1, inB2;                  /* two packed q15 samples each */
  q15_t out1, out2, out3, out4;                  /* saturated q15 results */
  q31_t mul1, mul2, mul3, mul4;                  /* 32-bit products before scaling */
  /* Loop unrolling: blockSize / 4 iterations, four samples per iteration. */
  blkCnt = blockSize >> 2u;
  while(blkCnt > 0u)
  {
    /* read two samples at a time from sourceA */
    inA1 = *__SIMD32(pSrcA)++;
    /* read two samples at a time from sourceB */
    inB1 = *__SIMD32(pSrcB)++;
    /* read two samples at a time from sourceA */
    inA2 = *__SIMD32(pSrcA)++;
    /* read two samples at a time from sourceB */
    inB2 = *__SIMD32(pSrcB)++;
    /* multiply mul = sourceA * sourceB:
     ** mul1/mul3 use the upper halfwords, mul2/mul4 the lower halfwords */
    mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
    mul2 = (q31_t) ((q15_t) inA1 * (q15_t) inB1);
    mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
    mul4 = (q31_t) ((q15_t) inA2 * (q15_t) inB2);
    /* shift the products right by 15 and saturate to 16 bit */
    out1 = (q15_t) __SSAT(mul1 >> 15, 16);
    out2 = (q15_t) __SSAT(mul2 >> 15, 16);
    out3 = (q15_t) __SSAT(mul3 >> 15, 16);
    out4 = (q15_t) __SSAT(mul4 >> 15, 16);
    /* store the result: the product of the upper halfwords goes back into
     ** the upper halfword and the product of the lower halfwords into the
     ** lower halfword.  Loads and stores use the same word layout, so this
     ** packing is the same for little- and big-endian builds and needs no
     ** ARM_MATH_BIG_ENDIAN special case. */
    *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
    *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
    /* Decrement the blockSize loop counter */
    blkCnt--;
  }
  /* If the blockSize is not a multiple of 4, the remaining 1 to 3 samples
   ** are handled by the scalar loop below. */
  blkCnt = blockSize % 0x4u;
 #else
  /* Run the below code for Cortex-M0 */
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  while(blkCnt > 0u)
  {
    /* C = A * B */
    /* Multiply in 32 bits, shift right by 15, saturate to 16 bits and store */
    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++)) >> 15), 16);
    /* Decrement the blockSize loop counter */
    blkCnt--;
  }
 }
 /**    
 * @} end of BasicMult group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_mult_q31.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_mult_q31.c
@@ -0,0 +1,160 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_mult_q31.c    
 *    
 * Description:	Q31 vector multiplication.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup BasicMult    
 * @{    
 */
 /**
 * @brief Q31 vector multiplication.
 * @param[in]       *pSrcA points to the first input vector
 * @param[in]       *pSrcB points to the second input vector
 * @param[out]      *pDst points to the output vector
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
 * \par
 * When ARM_MATH_CM0_FAMILY is not defined, the 64-bit product is shifted right
 * by 32, passed through __SSAT(x, 31) and then doubled, so the least significant
 * bit of every output sample is zero.  When ARM_MATH_CM0_FAMILY is defined, the
 * product is shifted right by 31 and clipped with clip_q63_to_q31() instead.
 */
 void arm_mult_q31(
  q31_t * pSrcA,
  q31_t * pSrcB,
  q31_t * pDst,
  uint32_t blockSize)
 {
  uint32_t blkCnt;                               /* loop counter */
 #ifndef ARM_MATH_CM0_FAMILY
 /* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t inA1, inA2, inA3, inA4;                  /* samples read from pSrcA */
  q31_t inB1, inB2, inB3, inB4;                  /* samples read from pSrcB */
  q31_t out1, out2, out3, out4;                  /* intermediate products */
  /* Unrolled part: four outputs are produced per iteration.
   ** The loop further below handles the remaining 1 to 3 samples. */
  blkCnt = blockSize >> 2u;
  while(blkCnt > 0u)
  {
    /* C = A * B */
    /* Fetch four samples from each input vector */
    inA1 = *pSrcA++;
    inA2 = *pSrcA++;
    inA3 = *pSrcA++;
    inA4 = *pSrcA++;
    inB1 = *pSrcB++;
    inB2 = *pSrcB++;
    inB3 = *pSrcB++;
    inB4 = *pSrcB++;
    /* Keep the upper 32 bits of each 64-bit product */
    out1 = ((q63_t) inA1 * inB1) >> 32;
    out2 = ((q63_t) inA2 * inB2) >> 32;
    out3 = ((q63_t) inA3 * inB3) >> 32;
    out4 = ((q63_t) inA4 * inB4) >> 32;
    /* Saturate to 31 bits so that the doubling below cannot overflow */
    out1 = __SSAT(out1, 31);
    out2 = __SSAT(out2, 31);
    out3 = __SSAT(out3, 31);
    out4 = __SSAT(out4, 31);
    /* Double to restore Q31 scaling.  A multiplication is used instead of
     ** "<< 1" because left-shifting a negative signed value is undefined
     ** behaviour in C; the numeric result is the same. */
    *pDst++ = out1 * 2;
    *pDst++ = out2 * 2;
    *pDst++ = out3 * 2;
    *pDst++ = out4 * 2;
    /* Decrement the blockSize loop counter */
    blkCnt--;
  }
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;
  while(blkCnt > 0u)
  {
    /* C = A * B */
    /* Same sequence as above, applied to a single sample */
    inA1 = *pSrcA++;
    inB1 = *pSrcB++;
    out1 = ((q63_t) inA1 * inB1) >> 32;
    out1 = __SSAT(out1, 31);
    *pDst++ = out1 * 2;
    /* Decrement the blockSize loop counter */
    blkCnt--;
  }
 #else
  /* Run the below code for Cortex-M0 */
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
  while(blkCnt > 0u)
  {
    /* C = A * B */
    /* Multiply the inputs, rescale by 31 bits, clip to 32 bits and store. */
    *pDst++ =
      (q31_t) clip_q63_to_q31(((q63_t) (*pSrcA++) * (*pSrcB++)) >> 31);
    /* Decrement the blockSize loop counter */
    blkCnt--;
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of BasicMult group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_mult_q7.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_mult_q7.c
@@ -0,0 +1,127 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_mult_q7.c    
 *    
 * Description:	Q7 vector multiplication.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup BasicMult    
 * @{    
 */
 /**
 * @brief           Q7 vector multiplication
 * @param[in]       *pSrcA points to the first input vector
 * @param[in]       *pSrcB points to the second input vector
 * @param[out]      *pDst points to the output vector
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
 */
 void arm_mult_q7(
  q7_t * pSrcA,
  q7_t * pSrcB,
  q7_t * pDst,
  uint32_t blockSize)
 {
  uint32_t blkCnt;                               /* iterations still to run */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build: process four samples per pass first */
  q7_t prod0, prod1, prod2, prod3;               /* the four products of one pass */
  for(blkCnt = blockSize >> 2u; blkCnt > 0u; blkCnt--)
  {
    /* C = A * B: multiply, drop 7 fractional bits, saturate to 8 bits */
    prod0 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    prod1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    prod2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    prod3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
    /* Pack the four results and write them with a single word store */
    *__SIMD32(pDst)++ = __PACKq7(prod0, prod1, prod2, prod3);
  }
  /* 0 to 3 samples are left over when blockSize is not a multiple of 4 */
  blkCnt = blockSize & 0x3u;
 #else
  /* Cortex-M0 build: every sample goes through the scalar loop */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; blkCnt > 0u; blkCnt--)
  {
    /* C = A * B, one sample at a time */
    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++)) >> 7), 8);
  }
 }
 /**    
 * @} end of BasicMult group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_negate_f32.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_negate_f32.c
@@ -0,0 +1,146 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_negate_f32.c    
 *    
 * Description:	Negates floating-point vectors.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupMath        
 */
 /**        
 * @defgroup negate Vector Negate        
 *        
 * Negates the elements of a vector.        
 *        
 * <pre>        
 *     pDst[n] = -pSrc[n],   0 <= n < blockSize.        
 * </pre>        
 *
 * The functions support in-place computation allowing the source and
 * destination pointers to reference the same memory buffer.
 * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**        
 * @addtogroup negate        
 * @{        
 */
 /**
 * @brief  Negates the elements of a floating-point vector.
 * @param[in]  *pSrc points to the input vector
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 */
 void arm_negate_f32(
  float32_t * pSrc,
  float32_t * pDst,
  uint32_t blockSize)
 {
  uint32_t blkCnt;                               /* iterations still to run */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build: process four samples per pass first */
  float32_t neg0, neg1, neg2, neg3;              /* negated samples of one pass */
  for(blkCnt = blockSize >> 2u; blkCnt > 0u; blkCnt--)
  {
    /* C = -A: load and negate all four samples before any store */
    neg0 = -pSrc[0];
    neg1 = -pSrc[1];
    neg2 = -pSrc[2];
    neg3 = -pSrc[3];
    /* write the four results */
    pDst[0] = neg0;
    pDst[1] = neg1;
    pDst[2] = neg2;
    pDst[3] = neg3;
    /* step both vectors to the next group of four */
    pSrc += 4u;
    pDst += 4u;
  }
  /* 0 to 3 samples are left over when blockSize is not a multiple of 4 */
  blkCnt = blockSize & 0x3u;
 #else
  /* Cortex-M0 build: every sample goes through the scalar loop */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; blkCnt > 0u; blkCnt--)
  {
    /* C = -A, one sample at a time */
    *pDst = -(*pSrc);
    pDst++;
    pSrc++;
  }
 }
 /**        
 * @} end of negate group        
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_negate_q15.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_negate_q15.c
@@ -0,0 +1,142 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_negate_q15.c    
 *    
 * Description:	Negates Q15 vectors.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupMath        
 */
 /**        
 * @addtogroup negate        
 * @{        
 */
 /**
 * @brief  Negates the elements of a Q15 vector.
 * @param[in]  *pSrc points to the input vector
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * \par Conditions for optimum performance
 *  Input and output buffers should be aligned by 32-bit
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
 */
 void arm_negate_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
 {
  uint32_t blkCnt;                               /* iterations still to run */
  q15_t sample;                                  /* single input of the scalar loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build: process four samples per pass first */
  q31_t pair0, pair1;                            /* two packed Q15 samples each */
  for(blkCnt = blockSize >> 2u; blkCnt > 0u; blkCnt--)
  {
    /* C = -A: read two packed samples per word and subtract them from 0 */
    pair0 = __QSUB16(0, _SIMD32_OFFSET(pSrc));
    pair1 = __QSUB16(0, _SIMD32_OFFSET(pSrc + 2));
    /* write both words, i.e. four results */
    _SIMD32_OFFSET(pDst) = pair0;
    _SIMD32_OFFSET(pDst + 2) = pair1;
    /* step both vectors to the next group of four */
    pSrc += 4u;
    pDst += 4u;
  }
  /* 0 to 3 samples are left over when blockSize is not a multiple of 4 */
  blkCnt = blockSize & 0x3u;
 #else
  /* Cortex-M0 build: every sample goes through the scalar loop */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; blkCnt > 0u; blkCnt--)
  {
    /* C = -A, with 0x8000 mapped to 0x7fff instead of being negated */
    sample = *pSrc++;
    if(sample == (q15_t) 0x8000)
    {
      *pDst++ = 0x7fff;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
 }
 /**        
 * @} end of negate group        
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_negate_q31.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_negate_q31.c
@@ -0,0 +1,129 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_negate_q31.c    
 *    
 * Description:	Negates Q31 vectors.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup negate    
 * @{    
 */
 /**
 * @brief  Negates the elements of a Q31 vector.
 * @param[in]  *pSrc points to the input vector
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The Q31 value -1 (0x80000000) will be saturated to the maximum allowable positive value 0x7FFFFFFF.
 */
 void arm_negate_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
 {
  q31_t sample;                                  /* single input of the scalar loop */
  uint32_t blkCnt;                               /* iterations still to run */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build: process four samples per pass first */
  q31_t s0, s1, s2, s3;                          /* the four inputs of one pass */
  for(blkCnt = blockSize >> 2u; blkCnt > 0u; blkCnt--)
  {
    /* C = -A: load all four samples before any store */
    s0 = pSrc[0];
    s1 = pSrc[1];
    s2 = pSrc[2];
    s3 = pSrc[3];
    pSrc += 4u;
    /* subtract each sample from 0 with __QSUB and write the result */
    pDst[0] = __QSUB(0, s0);
    pDst[1] = __QSUB(0, s1);
    pDst[2] = __QSUB(0, s2);
    pDst[3] = __QSUB(0, s3);
    pDst += 4u;
  }
  /* 0 to 3 samples are left over when blockSize is not a multiple of 4 */
  blkCnt = blockSize & 0x3u;
 #else
  /* Cortex-M0 build: every sample goes through the scalar loop */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; blkCnt > 0u; blkCnt--)
  {
    /* C = -A, with INT32_MIN mapped to INT32_MAX instead of being negated */
    sample = *pSrc++;
    if(sample == INT32_MIN)
    {
      *pDst++ = INT32_MAX;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
 }
 /**    
 * @} end of negate group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_negate_q7.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_negate_q7.c
@@ -0,0 +1,125 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_negate_q7.c    
 *    
 * Description:	Negates Q7 vectors.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup negate    
 * @{    
 */
 /**
 * @brief  Negates the elements of a Q7 vector.
 * @param[in]  *pSrc points to the input vector
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The Q7 value -1 (0x80) will be saturated to the maximum allowable positive value 0x7F.
 */
 void arm_negate_q7(
  q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
 {
  uint32_t blkCnt;                               /* iterations still to run */
  q7_t sample;                                   /* single input of the scalar loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build: process four samples per pass first */
  q31_t packed;                                  /* four Q7 samples in one word */
  for(blkCnt = blockSize >> 2u; blkCnt > 0u; blkCnt--)
  {
    /* C = -A: read four samples with one word load */
    packed = *__SIMD32(pSrc)++;
    /* subtract them from 0 with __QSUB8 and write all four with one word store */
    *__SIMD32(pDst)++ = __QSUB8(0, packed);
  }
  /* 0 to 3 samples are left over when blockSize is not a multiple of 4 */
  blkCnt = blockSize & 0x3u;
 #else
  /* Cortex-M0 build: every sample goes through the scalar loop */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; blkCnt > 0u; blkCnt--)
  {
    /* C = -A, with 0x80 mapped to 0x7f instead of being negated */
    sample = *pSrc++;
    if(sample == (q7_t) 0x80)
    {
      *pDst++ = 0x7f;
    }
    else
    {
      *pDst++ = -sample;
    }
  }
 }
 /**    
 * @} end of negate group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_offset_f32.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_offset_f32.c
@@ -0,0 +1,165 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_offset_f32.c    
 *    
 * Description:	Floating-point vector offset.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupMath        
 */
 /**        
 * @defgroup offset Vector Offset        
 *        
 * Adds a constant offset to each element of a vector.        
 *        
 * <pre>        
 *     pDst[n] = pSrc[n] + offset,   0 <= n < blockSize.        
 * </pre>        
 *        
 * The functions support in-place computation allowing the source and
 * destination pointers to reference the same memory buffer.
 * There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 */
 /**        
 * @addtogroup offset        
 * @{        
 */
 /**
 * @brief  Adds a constant offset to a floating-point vector.
 * @param[in]  *pSrc points to the input vector
 * @param[in]  offset is the offset to be added
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 */
 void arm_offset_f32(
  float32_t * pSrc,
  float32_t offset,
  float32_t * pDst,
  uint32_t blockSize)
 {
  uint32_t blkCnt;                               /* iterations still to run */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build: process four samples per pass first */
  float32_t sum0, sum1, sum2, sum3;              /* shifted samples of one pass */
  for(blkCnt = blockSize >> 2u; blkCnt > 0u; blkCnt--)
  {
    /* C = A + offset: load and shift all four samples before any store */
    sum0 = pSrc[0] + offset;
    sum1 = pSrc[1] + offset;
    sum2 = pSrc[2] + offset;
    sum3 = pSrc[3] + offset;
    /* write the four results */
    pDst[0] = sum0;
    pDst[1] = sum1;
    pDst[2] = sum2;
    pDst[3] = sum3;
    /* step both vectors to the next group of four */
    pSrc += 4u;
    pDst += 4u;
  }
  /* 0 to 3 samples are left over when blockSize is not a multiple of 4 */
  blkCnt = blockSize & 0x3u;
 #else
  /* Cortex-M0 build: every sample goes through the scalar loop */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; blkCnt > 0u; blkCnt--)
  {
    /* C = A + offset, one sample at a time */
    *pDst = (*pSrc) + offset;
    pDst++;
    pSrc++;
  }
 }
 /**        
 * @} end of offset group        
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_offset_q15.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_offset_q15.c
@@ -0,0 +1,136 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_offset_q15.c    
 *    
 * Description:	Q15 vector offset.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup offset    
 * @{    
 */
 /**
 * @brief  Adds a constant offset to a Q15 vector.
 * @param[in]  *pSrc points to the input vector
 * @param[in]  offset is the offset to be added
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * Computes pDst[n] = pSrc[n] + offset for 0 <= n < blockSize.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * Every sum is formed with saturating arithmetic, so a result that falls
 * outside the Q15 range [0x8000 0x7FFF] is clamped to the nearest bound.
 */
 void arm_offset_q15(
  q15_t * pSrc,
  q15_t offset,
  q15_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build */
  q31_t offsetPair;                              /* offset copied into both halfwords */
  /* Replicate the offset so that one __QADD16 adds it to two samples */
  offsetPair = __PKHBT(offset, offset, 16);
  /* Main loop: four samples per pass, handled as two 32-bit (__SIMD32) accesses */
  for(remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* C = A + offset, two samples per statement */
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offsetPair);
    *__SIMD32(pDst)++ = __QADD16(*__SIMD32(pSrc)++, offsetPair);
  }
  /* Tail loop: the blockSize % 4 samples the main loop did not cover */
  for(remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    /* C = A + offset, one sample; only the low halfword of the result is kept */
    *pDst++ = (q15_t) __QADD16(*pSrc++, offset);
  }
 #else
  /* Cortex-M0 build: one sample per pass over the whole block */
  for(remaining = blockSize; remaining > 0u; remaining--)
  {
    /* C = A + offset: add in 32 bit, then saturate the sum to 16 bit */
    *pDst++ = (q15_t) __SSAT(((q31_t) * pSrc++ + offset), 16);
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of offset group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_offset_q31.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_offset_q31.c
@@ -0,0 +1,140 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_offset_q31.c    
 *    
 * Description:	Q31 vector offset.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup offset    
 * @{    
 */
 /**
 * @brief  Adds a constant offset to a Q31 vector.
 * @param[in]  *pSrc points to the input vector
 * @param[in]  offset is the offset to be added
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * Computes pDst[n] = pSrc[n] + offset for 0 <= n < blockSize.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * Every sum is formed with saturating arithmetic, so a result that falls
 * outside the Q31 range [0x80000000 0x7FFFFFFF] is clamped to the nearest bound.
 */
 void arm_offset_q31(
  q31_t * pSrc,
  q31_t offset,
  q31_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build */
  q31_t s0, s1, s2, s3;                          /* four input samples of one pass */
  /* Main loop: four samples per pass */
  for(remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Read all four inputs before any output is written */
    s0 = pSrc[0];
    s1 = pSrc[1];
    s2 = pSrc[2];
    s3 = pSrc[3];
    pSrc += 4u;
    /* C = A + offset, saturating */
    pDst[0] = __QADD(s0, offset);
    pDst[1] = __QADD(s1, offset);
    pDst[2] = __QADD(s2, offset);
    pDst[3] = __QADD(s3, offset);
    pDst += 4u;
  }
  /* Tail loop: the blockSize % 4 samples the main loop did not cover */
  for(remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    /* C = A + offset, saturating */
    *pDst++ = __QADD(*pSrc++, offset);
  }
 #else
  /* Cortex-M0 build: one sample per pass over the whole block */
  for(remaining = blockSize; remaining > 0u; remaining--)
  {
    /* C = A + offset: add in 64 bit, then clip the sum to the Q31 range */
    *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrc++ + offset);
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of offset group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_offset_q7.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_offset_q7.c
@@ -0,0 +1,135 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_offset_q7.c    
 *    
 * Description:	Q7 vector offset.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup offset    
 * @{    
 */
 /**
 * @brief  Adds a constant offset to a Q7 vector.
 * @param[in]  *pSrc points to the input vector
 * @param[in]  offset is the offset to be added
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * Computes pDst[n] = pSrc[n] + offset for 0 <= n < blockSize.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * Every sum is formed with saturating arithmetic, so a result that falls
 * outside the Q7 range [0x80 0x7F] is clamped to the nearest bound.
 */
 void arm_offset_q7(
  q7_t * pSrc,
  q7_t offset,
  q7_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build */
  q31_t offsetQuad;                              /* offset copied into all four bytes */
  /* Replicate the offset so that one __QADD8 adds it to four samples */
  offsetQuad = __PACKq7(offset, offset, offset, offset);
  /* Main loop: four samples per pass through a single 32-bit (__SIMD32) access */
  for(remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* C = A + offset, four samples at once */
    *__SIMD32(pDst)++ = __QADD8(*__SIMD32(pSrc)++, offsetQuad);
  }
  /* Tail loop: the blockSize % 4 samples the main loop did not cover */
  for(remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    /* C = A + offset: add, then saturate the sum to 8 bit */
    *pDst++ = (q7_t) __SSAT(*pSrc++ + offset, 8);
  }
 #else
  /* Cortex-M0 build: one sample per pass over the whole block */
  for(remaining = blockSize; remaining > 0u; remaining--)
  {
    /* C = A + offset: add, then saturate the sum to 8 bit */
    *pDst++ = (q7_t) __SSAT((q15_t) * pSrc++ + offset, 8);
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of offset group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_scale_f32.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_scale_f32.c
@@ -0,0 +1,169 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_scale_f32.c    
 *    
 * Description:	Multiplies a floating-point vector by a scalar.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupMath        
 */
 /**        
 * @defgroup scale Vector Scale        
 *        
 * Multiply a vector by a scalar value.  For floating-point data, the algorithm used is:        
 *        
 * <pre>        
 *     pDst[n] = pSrc[n] * scale,   0 <= n < blockSize.        
 * </pre>        
 *        
 * In the fixed-point Q7, Q15, and Q31 functions, <code>scale</code> is represented by        
 * a fractional multiplication <code>scaleFract</code> and an arithmetic shift <code>shift</code>.        
 * The shift allows the gain of the scaling operation to exceed 1.0.        
 * The algorithm used with fixed-point data is:        
 *        
 * <pre>        
 *     pDst[n] = (pSrc[n] * scaleFract) << shift,   0 <= n < blockSize.        
 * </pre>        
 *        
 * The overall scale factor applied to the fixed-point data is        
 * <pre>        
 *     scale = scaleFract * 2^shift.        
 * </pre>        
 *
 * The functions support in-place computation allowing the source and destination
 * pointers to reference the same memory buffer.
 */
 /**        
 * @addtogroup scale        
 * @{        
 */
 /**
 * @brief Multiplies a floating-point vector by a scalar.
 * @param[in]       *pSrc points to the input vector
 * @param[in]       scale scale factor to be applied
 * @param[out]      *pDst points to the output vector
 * @param[in]       blockSize number of samples in the vector
 * @return none.
 *
 * Computes pDst[n] = pSrc[n] * scale for 0 <= n < blockSize.
 */
 void arm_scale_f32(
  float32_t * pSrc,
  float32_t scale,
  float32_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build */
  float32_t v0, v1, v2, v3;                      /* four samples of one pass */
  /* Main loop: four samples per pass */
  for(remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Read all four inputs before any output is written */
    v0 = pSrc[0];
    v1 = pSrc[1];
    v2 = pSrc[2];
    v3 = pSrc[3];
    /* C = A * scale */
    v0 = v0 * scale;
    v1 = v1 * scale;
    v2 = v2 * scale;
    v3 = v3 * scale;
    /* Write the four results */
    pDst[0] = v0;
    pDst[1] = v1;
    pDst[2] = v2;
    pDst[3] = v3;
    /* Step both pointers past the samples just handled */
    pSrc += 4u;
    pDst += 4u;
  }
  /* The shared loop below finishes the blockSize % 4 leftover samples */
  remaining = blockSize % 0x4u;
 #else
  /* Cortex-M0 build: the shared loop below handles the whole block */
  remaining = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; remaining > 0u; remaining--)
  {
    /* C = A * scale, one sample */
    *pDst++ = (*pSrc++) * scale;
  }
 }
 /**        
 * @} end of scale group        
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_scale_q15.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_scale_q15.c
@@ -0,0 +1,162 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_scale_q15.c    
 *    
 * Description:	Multiplies a Q15 vector by a scalar.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup scale    
 * @{    
 */
 /**
 * @brief Multiplies a Q15 vector by a scalar.
 * @param[in]       *pSrc points to the input vector
 * @param[in]       scaleFract fractional portion of the scale value
 * @param[in]       shift number of bits to shift the result by
 * @param[out]      *pDst points to the output vector
 * @param[in]       blockSize number of samples in the vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * Both <code>*pSrc</code> and <code>scaleFract</code> are in 1.15 format.
 * Their product is a 2.30 intermediate value, which is shifted right by
 * (15 - shift) bits and then saturated to 1.15 format.
 */
 void arm_scale_q15(
  q15_t * pSrc,
  q15_t scaleFract,
  int8_t shift,
  q15_t * pDst,
  uint32_t blockSize)
 {
  int8_t postShift = 15 - shift;                 /* right shift applied to every product */
  uint32_t remaining;                            /* iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build */
  q31_t pairA, pairB;                            /* each holds two packed Q15 inputs */
  q31_t accAHi, accALo, accBHi, accBLo;          /* scaled and shifted values, not yet saturated */
  q15_t resAHi, resALo, resBHi, resBLo;          /* saturated Q15 results */
  /* Main loop: four samples per pass, read and written as two 32-bit words */
  for(remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Fetch four inputs, two per 32-bit (__SIMD32) read */
    pairA = *__SIMD32(pSrc)++;
    pairB = *__SIMD32(pSrc)++;
    /* C = A * scale: multiply the upper and lower halfword of each word,
     * then apply the post shift */
    accAHi = ((q31_t) ((q15_t) (pairA >> 16) * scaleFract)) >> postShift;
    accALo = ((q31_t) ((q15_t) pairA * scaleFract)) >> postShift;
    accBHi = ((q31_t) ((q15_t) (pairB >> 16) * scaleFract)) >> postShift;
    accBLo = ((q31_t) ((q15_t) pairB * scaleFract)) >> postShift;
    /* Saturate each value to 16 bit */
    resAHi = (q15_t) (__SSAT(accAHi, 16));
    resALo = (q15_t) (__SSAT(accALo, 16));
    resBHi = (q15_t) (__SSAT(accBHi, 16));
    resBLo = (q15_t) (__SSAT(accBLo, 16));
    /* Repack so each result lands in the halfword its input came from,
     * and write two results per 32-bit store */
    *__SIMD32(pDst)++ = __PKHBT(resALo, resAHi, 16);
    *__SIMD32(pDst)++ = __PKHBT(resBLo, resBHi, 16);
  }
  /* Tail loop: the blockSize % 4 samples the main loop did not cover */
  for(remaining = blockSize % 0x4u; remaining > 0u; remaining--)
  {
    /* C = A * scale: multiply, shift, saturate to 16 bit */
    *pDst++ = (q15_t) (__SSAT(((*pSrc++) * scaleFract) >> postShift, 16));
  }
 #else
  /* Cortex-M0 build: one sample per pass over the whole block */
  for(remaining = blockSize; remaining > 0u; remaining--)
  {
    /* C = A * scale: multiply, shift, saturate to 16 bit */
    *pDst++ = (q15_t) (__SSAT(((q31_t) * pSrc++ * scaleFract) >> postShift, 16));
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of scale group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_scale_q31.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_scale_q31.c
@@ -0,0 +1,239 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_scale_q31.c    
 *    
 * Description:	Multiplies a Q31 vector by a scalar.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**       
 * @ingroup groupMath       
 */
 /**       
 * @addtogroup scale       
 * @{       
 */
 /**
 * @brief Multiplies a Q31 vector by a scalar.
 * @param[in]       *pSrc points to the input vector
 * @param[in]       scaleFract fractional portion of the scale value
 * @param[in]       shift number of bits to shift the result by
 * @param[out]      *pDst points to the output vector
 * @param[in]       blockSize number of samples in the vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.31 format.
 * These are multiplied to yield a 2.62 intermediate result, of which the upper
 * 32 bits are kept. That value is then shifted by (shift + 1) bits: to the left
 * with saturation to 1.31 format when (shift + 1) is zero or positive, to the
 * right (no saturation needed) when it is negative.
 *
 * \par
 * NOTE(review): shift counts of 32 or more in either direction are not guarded
 * against here; callers are presumably expected to keep <code>shift</code>
 * within a range that avoids them - confirm against the library documentation.
 */
 void arm_scale_q31(
  q31_t * pSrc,
  q31_t scaleFract,
  int8_t shift,
  q31_t * pDst,
  uint32_t blockSize)
 {
  int8_t kShift = shift + 1;                     /* Net shift applied to the high word of the product */
  uint32_t blkCnt;                               /* loop counter */
  q31_t in, out;                                 /* sample before / after the final shift */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t in1, in2, in3, in4;                      /* temporary input variables */
  q31_t out1, out2, out3, out4;                  /* temporary output variables */
  /* loop unrolling: four samples per pass */
  blkCnt = blockSize >> 2u;
  if(kShift >= 0)
  {
    /* Left-shift case. First part of the processing with loop unrolling:
     ** compute 4 outputs at a time; the loop after the #endif computes the
     ** remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
      /* read four inputs from source */
      in1 = *pSrc;
      in2 = *(pSrc + 1);
      in3 = *(pSrc + 2);
      in4 = *(pSrc + 3);
      /* multiply input with scalar value and keep the upper 32 bits */
      in1 = ((q63_t) in1 * scaleFract) >> 32;
      in2 = ((q63_t) in2 * scaleFract) >> 32;
      in3 = ((q63_t) in3 * scaleFract) >> 32;
      in4 = ((q63_t) in4 * scaleFract) >> 32;
      /* Apply the shift in unsigned arithmetic: left-shifting a negative or
       * overflowing signed value is undefined in C, and the saturation test
       * below depends on seeing the wrapped result. */
      out1 = (q31_t) ((uint32_t) in1 << kShift);
      out2 = (q31_t) ((uint32_t) in2 << kShift);
      /* Saturate: if shifting back does not restore the input, bits were lost.
       * 0x7FFFFFFF ^ (in >> 31) picks 0x7FFFFFFF or 0x80000000 from the sign
       * of the input (relies on an arithmetic right shift). */
      if(in1 != (out1 >> kShift))
      {
        out1 = 0x7FFFFFFF ^ (in1 >> 31);
      }
      if(in2 != (out2 >> kShift))
      {
        out2 = 0x7FFFFFFF ^ (in2 >> 31);
      }
      out3 = (q31_t) ((uint32_t) in3 << kShift);
      out4 = (q31_t) ((uint32_t) in4 << kShift);
      *pDst = out1;
      *(pDst + 1) = out2;
      if(in3 != (out3 >> kShift))
      {
        out3 = 0x7FFFFFFF ^ (in3 >> 31);
      }
      if(in4 != (out4 >> kShift))
      {
        out4 = 0x7FFFFFFF ^ (in4 >> 31);
      }
      /* Store result destination */
      *(pDst + 2) = out3;
      *(pDst + 3) = out4;
      /* Update pointers to process next samples */
      pSrc += 4u;
      pDst += 4u;
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    /* Right-shift case. First part of the processing with loop unrolling:
     ** compute 4 outputs at a time; the loop after the #endif computes the
     ** remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
      /* read four inputs from source */
      in1 = *pSrc;
      in2 = *(pSrc + 1);
      in3 = *(pSrc + 2);
      in4 = *(pSrc + 3);
      /* multiply input with scalar value and keep the upper 32 bits */
      in1 = ((q63_t) in1 * scaleFract) >> 32;
      in2 = ((q63_t) in2 * scaleFract) >> 32;
      in3 = ((q63_t) in3 * scaleFract) >> 32;
      in4 = ((q63_t) in4 * scaleFract) >> 32;
      /* apply shifting; a right shift cannot overflow, so no saturation */
      out1 = in1 >> -kShift;
      out2 = in2 >> -kShift;
      out3 = in3 >> -kShift;
      out4 = in4 >> -kShift;
      /* Store result destination */
      *pDst = out1;
      *(pDst + 1) = out2;
      *(pDst + 2) = out3;
      *(pDst + 3) = out4;
      /* Update pointers to process next samples */
      pSrc += 4u;
      pDst += 4u;
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;
 #else
  /* Run the below code for Cortex-M0 */
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  if(kShift >= 0)
  {
    while(blkCnt > 0u)
    {
      /* C = A * scale */
      /* Scale the input and then store the result in the destination buffer. */
      in = *pSrc++;
      in = ((q63_t) in * scaleFract) >> 32;
      /* Unsigned shift keeps the wrap-around well defined for the check below */
      out = (q31_t) ((uint32_t) in << kShift);
      if(in != (out >> kShift))
      {
        out = 0x7FFFFFFF ^ (in >> 31);
      }
      *pDst++ = out;
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    while(blkCnt > 0u)
    {
      /* C = A * scale */
      /* Scale the input and then store the result in the destination buffer. */
      in = *pSrc++;
      in = ((q63_t) in * scaleFract) >> 32;
      out = in >> -kShift;
      *pDst++ = out;
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
 }
 /**       
 * @} end of scale group       
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_scale_q7.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_scale_q7.c
@@ -0,0 +1,149 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_scale_q7.c    
 *    
 * Description:	Multiplies a Q7 vector by a scalar.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup scale    
 * @{    
 */
 /**
 * @brief Multiplies a Q7 vector by a scalar.
 * @param[in]       *pSrc points to the input vector
 * @param[in]       scaleFract fractional portion of the scale value
 * @param[in]       shift number of bits to shift the result by
 * @param[out]      *pDst points to the output vector
 * @param[in]       blockSize number of samples in the vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The input data <code>*pSrc</code> and <code>scaleFract</code> are in 1.7 format.
 * These are multiplied to yield a 2.14 intermediate result and this is shifted with saturation to 1.7 format.
 * \par
 * The product is shifted right by <code>7 - shift</code> bits.  When <code>shift</code> is greater
 * than 7 the product is scaled up by the corresponding power of two instead.  Shift amounts are
 * capped internally (the capped values give the same saturated result), so every value of
 * <code>shift</code> uses a valid shift count.
 */
 void arm_scale_q7(
  q7_t * pSrc,
  q7_t scaleFract,
  int8_t shift,
  q7_t * pDst,
  uint32_t blockSize)
 {
  /* Net right shift applied to the product.  Computed in 32 bits so that
   * neither a large positive nor a large negative 'shift' can wrap. */
  int32_t kShift = 7 - (int32_t) shift;
  int32_t scale = scaleFract;                    /* multiplier; absorbs any net left shift */
  uint32_t blkCnt;                               /* loop counter */
 #ifndef ARM_MATH_CM0_FAMILY
  q7_t in1, in2, in3, in4, out1, out2, out3, out4;      /* Temporary variables to store input & output */
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  if(kShift < 0)
  {
    /* Net left shift: fold 2^(-kShift) into the multiplier once, outside the loops.
     * A left shift of 8 already pushes every non-zero product outside the Q7 range,
     * so larger amounts are capped at 8; the widened product stays within 32 bits
     * (|in * scale| <= 128 * 128 * 256). */
    int32_t lShift = (-kShift > 8) ? 8 : -kShift;
    scale = scale * ((int32_t) 1 << lShift);
    kShift = 0;
  }
  else if(kShift > 15)
  {
    /* |in * scaleFract| <= 2^14, so every right shift of 15 or more yields the same
     * value (0, or -1 for negative products, given the arithmetic right shift this
     * code already relies on).  Capping keeps the shift count valid. */
    kShift = 15;
  }
 #ifndef ARM_MATH_CM0_FAMILY
 /* Run the below code for Cortex-M4 and Cortex-M3 */
  /*loop Unrolling */
  blkCnt = blockSize >> 2u;
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** The loop after the #endif computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* Reading 4 inputs from memory */
    in1 = *pSrc++;
    in2 = *pSrc++;
    in3 = *pSrc++;
    in4 = *pSrc++;
    /* C = A * scale */
    /* Scale the inputs and then store the results in the temporary variables. */
    out1 = (q7_t) (__SSAT(((in1) * scale) >> kShift, 8));
    out2 = (q7_t) (__SSAT(((in2) * scale) >> kShift, 8));
    out3 = (q7_t) (__SSAT(((in3) * scale) >> kShift, 8));
    out4 = (q7_t) (__SSAT(((in4) * scale) >> kShift, 8));
    /* Packing the individual outputs into 32bit and storing in
     * destination buffer in single write */
    *__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
    /* Decrement the loop counter */
    blkCnt--;
  }
  /* If the blockSize is not a multiple of 4, compute any remaining output samples below.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;
 #else
  /* Run the below code for Cortex-M0 */
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  while(blkCnt > 0u)
  {
    /* C = A * scale */
    /* Scale the input and then store the result in the destination buffer. */
    *pDst++ = (q7_t) (__SSAT(((*pSrc++) * scale) >> kShift, 8));
    /* Decrement the loop counter */
    blkCnt--;
  }
 }
 /**    
 * @} end of scale group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_shift_q15.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_shift_q15.c
@@ -0,0 +1,248 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_shift_q15.c    
 *    
 * Description:	Shifts the elements of a Q15 vector by a specified number of bits.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup shift    
 * @{    
 */
 /**
 * @brief  Shifts the elements of a Q15 vector a specified number of bits.
 * @param[in]  *pSrc points to the input vector
 * @param[in]  shiftBits number of bits to shift.  A positive value shifts left; a negative value shifts right.
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
 * \par
 * The shift count is capped internally (16 for left shifts, 15 for right shifts).  The capped
 * counts give the same result as any larger count, and keep the 32-bit intermediate from
 * overflowing before saturation is applied.
 */
 void arm_shift_q15(
  q15_t * pSrc,
  int8_t shiftBits,
  q15_t * pDst,
  uint32_t blockSize)
 {
  uint32_t blkCnt;                               /* loop counter */
  int32_t nBits;                                 /* capped magnitude of shiftBits */
 #ifndef ARM_MATH_CM0_FAMILY
  q15_t in1, in2;                                /* Temporary variables */
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  if(shiftBits >= 0)
  {
    /* A left shift of 16 moves every non-zero Q15 value outside the Q15 range while
     * the intermediate still fits in 32 bits, so larger counts are capped at 16. */
    nBits = (shiftBits > 16) ? 16 : shiftBits;
  }
  else
  {
    /* A right shift of 15 already reduces a Q15 value to its sign (0 or -1).
     * The negation is done in int, so shiftBits == -128 is handled as well. */
    nBits = (shiftBits < -15) ? 15 : -shiftBits;
  }
 #ifndef ARM_MATH_CM0_FAMILY
 /* Run the below code for Cortex-M4 and Cortex-M3 */
  /*loop Unrolling */
  blkCnt = blockSize >> 2u;
  /* If the shift value is non-negative then do left shift else right shift */
  if(shiftBits >= 0)
  {
    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** The loop after the #endif computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
      /* Read 2 inputs */
      in1 = *pSrc++;
      in2 = *pSrc++;
      /* C = A << shiftBits */
      /* Shift the inputs, saturate, and store both results with one 32-bit write. */
 #ifndef  ARM_MATH_BIG_ENDIAN
      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in1 << nBits), 16),
                                  __SSAT((in2 << nBits), 16), 16);
 #else
      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in2 << nBits), 16),
                                  __SSAT((in1 << nBits), 16), 16);
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* Read the next 2 inputs */
      in1 = *pSrc++;
      in2 = *pSrc++;
 #ifndef  ARM_MATH_BIG_ENDIAN
      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in1 << nBits), 16),
                                  __SSAT((in2 << nBits), 16), 16);
 #else
      *__SIMD32(pDst)++ = __PKHBT(__SSAT((in2 << nBits), 16),
                                  __SSAT((in1 << nBits), 16), 16);
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** The loop after the #endif computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
      /* Read 2 inputs */
      in1 = *pSrc++;
      in2 = *pSrc++;
      /* C = A >> -shiftBits */
      /* Shift the inputs and store both results with one 32-bit write. */
 #ifndef  ARM_MATH_BIG_ENDIAN
      *__SIMD32(pDst)++ = __PKHBT((in1 >> nBits),
                                  (in2 >> nBits), 16);
 #else
      *__SIMD32(pDst)++ = __PKHBT((in2 >> nBits),
                                  (in1 >> nBits), 16);
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* Read the next 2 inputs */
      in1 = *pSrc++;
      in2 = *pSrc++;
 #ifndef  ARM_MATH_BIG_ENDIAN
      *__SIMD32(pDst)++ = __PKHBT((in1 >> nBits),
                                  (in2 >> nBits), 16);
 #else
      *__SIMD32(pDst)++ = __PKHBT((in2 >> nBits),
                                  (in1 >> nBits), 16);
 #endif /* #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  /* If the blockSize is not a multiple of 4, compute any remaining output samples below.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;
 #else
  /* Run the below code for Cortex-M0 */
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  if(shiftBits >= 0)
  {
    while(blkCnt > 0u)
    {
      /* C = A << shiftBits */
      /* Shift, saturate, and then store the result in the destination buffer. */
      *pDst++ = __SSAT((*pSrc++ << nBits), 16);
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    while(blkCnt > 0u)
    {
      /* C = A >> -shiftBits */
      /* Shift the input and then store the result in the destination buffer. */
      *pDst++ = (*pSrc++ >> nBits);
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
 }
 /**    
 * @} end of shift group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_shift_q31.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_shift_q31.c
@@ -0,0 +1,203 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_shift_q31.c    
 *    
 * Description:	Shifts the elements of a Q31 vector by a specified number of bits.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupMath        
 */
 /**        
 * @defgroup shift Vector Shift        
 *        
 * Shifts the elements of a fixed-point vector by a specified number of bits.        
 * There are separate functions for Q7, Q15, and Q31 data types.        
 * The underlying algorithm used is:        
 *        
 * <pre>        
 *     pDst[n] = pSrc[n] << shift,   0 <= n < blockSize.        
 * </pre>        
 *        
 * If <code>shift</code> is positive then the elements of the vector are shifted to the left.        
 * If <code>shift</code> is negative then the elements of the vector are shifted to the right.        
 *
 * The functions support in-place computation allowing the source and destination
 * pointers to reference the same memory buffer.
 */
 /**        
 * @addtogroup shift        
 * @{        
 */
 /**
 * @brief  Shifts the elements of a Q31 vector a specified number of bits.
 * @param[in]  *pSrc points to the input vector
 * @param[in]  shiftBits number of bits to shift.  A positive value shifts left; a negative value shifts right.
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
 * \par
 * The shift count is capped internally at 31 bits in either direction.  A count of 31 gives
 * the same result as any larger count, and keeps the shift count valid for the 32-bit and
 * 64-bit intermediates used below.
 */
 void arm_shift_q31(
  q31_t * pSrc,
  int8_t shiftBits,
  q31_t * pDst,
  uint32_t blockSize)
 {
  uint32_t blkCnt;                               /* loop counter */
  /* Magnitude of shiftBits; negated in 32 bits so that -128 does not wrap */
  int32_t nBits = (shiftBits < 0) ? -(int32_t) shiftBits : (int32_t) shiftBits;
 #ifndef ARM_MATH_CM0_FAMILY
  q31_t in1, in2, in3, in4;                      /* Temporary input variables */
  q31_t out1, out2, out3, out4;                  /* Temporary output variables */
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  /* Left:  at 31 bits only 0 and -1 survive without saturating, exactly as for any larger count.
   * Right: at 31 bits only the sign (0 or -1) remains. */
  if(nBits > 31)
  {
    nBits = 31;
  }
 #ifndef ARM_MATH_CM0_FAMILY
  /*loop Unrolling */
  blkCnt = blockSize >> 2u;
  if(shiftBits >= 0)
  {
    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** The loop after the #endif computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
      /* C = A  << shiftBits */
      /* All four inputs are read before the first store. */
      in1 = *pSrc;
      in2 = *(pSrc + 1);
      in3 = *(pSrc + 2);
      in4 = *(pSrc + 3);
      /* Shift on the unsigned representation so that bits shifted out are simply
       * discarded; overflow is then detected by shifting back and comparing. */
      out1 = (q31_t) ((uint32_t) in1 << nBits);
      out2 = (q31_t) ((uint32_t) in2 << nBits);
      out3 = (q31_t) ((uint32_t) in3 << nBits);
      out4 = (q31_t) ((uint32_t) in4 << nBits);
      /* On overflow saturate towards the sign of the input:
       * 0x7FFFFFFF for non-negative inputs, 0x80000000 for negative inputs. */
      if(in1 != (out1 >> nBits))
        out1 = 0x7FFFFFFF ^ (in1 >> 31);
      if(in2 != (out2 >> nBits))
        out2 = 0x7FFFFFFF ^ (in2 >> 31);
      if(in3 != (out3 >> nBits))
        out3 = 0x7FFFFFFF ^ (in3 >> 31);
      if(in4 != (out4 >> nBits))
        out4 = 0x7FFFFFFF ^ (in4 >> 31);
      *pDst = out1;
      *(pDst + 1) = out2;
      *(pDst + 2) = out3;
      *(pDst + 3) = out4;
      /* Update pointers to process next samples */
      pSrc += 4u;
      pDst += 4u;
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** The loop after the #endif computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
      /* C = A >> -shiftBits */
      /* Shift the input and then store the results in the destination buffer. */
      in1 = *pSrc;
      in2 = *(pSrc + 1);
      in3 = *(pSrc + 2);
      in4 = *(pSrc + 3);
      *pDst = (in1 >> nBits);
      *(pDst + 1) = (in2 >> nBits);
      *(pDst + 2) = (in3 >> nBits);
      *(pDst + 3) = (in4 >> nBits);
      /* Update pointers to process next samples */
      pSrc += 4u;
      pDst += 4u;
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;
 #else
  /* Run the below code for Cortex-M0 */
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  while(blkCnt > 0u)
  {
    /* C = A (>> or <<) shiftBits */
    /* Shift the input and then store the result in the destination buffer.
     * With the count capped at 31 the 64-bit left-shifted value cannot overflow
     * before it is clipped to Q31. */
    *pDst++ = (shiftBits >= 0) ? clip_q63_to_q31((q63_t) * pSrc++ << nBits) :
      (*pSrc++ >> nBits);
    /* Decrement the loop counter */
    blkCnt--;
  }
 }
 /**        
 * @} end of shift group        
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_shift_q7.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_shift_q7.c
@@ -0,0 +1,220 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_shift_q7.c    
 *    
 * Description:	Processing function for the Q7 Shifting    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupMath        
 */
 /**        
 * @addtogroup shift        
 * @{        
 */
 /**
 * @brief  Shifts the elements of a Q7 vector a specified number of bits.
 * @param[in]  *pSrc points to the input vector
 * @param[in]  shiftBits number of bits to shift.  A positive value shifts left; a negative value shifts right.
 * @param[out]  *pDst points to the output vector
 * @param[in]  blockSize number of samples in the vector
 * @return none.
 *
 * \par Conditions for optimum performance
 *  Input and output buffers should be aligned by 32-bit
 *
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
 * \par
 * The shift count is capped internally (8 for left shifts, 7 for right shifts).  The capped
 * counts give the same result as any larger count, and keep the 32-bit intermediate from
 * overflowing before saturation is applied.
 */
 void arm_shift_q7(
  q7_t * pSrc,
  int8_t shiftBits,
  q7_t * pDst,
  uint32_t blockSize)
 {
  uint32_t blkCnt;                               /* loop counter */
  int32_t nBits;                                 /* capped magnitude of shiftBits */
 #ifndef ARM_MATH_CM0_FAMILY
  q7_t in1;                                      /* Input value1 */
  q7_t in2;                                      /* Input value2 */
  q7_t in3;                                      /* Input value3 */
  q7_t in4;                                      /* Input value4 */
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  if(shiftBits >= 0)
  {
    /* A left shift of 8 moves every non-zero Q7 value outside the Q7 range while
     * the intermediate still fits easily in 32 bits, so larger counts are capped at 8. */
    nBits = (shiftBits > 8) ? 8 : shiftBits;
  }
  else
  {
    /* A right shift of 7 already reduces a Q7 value to its sign (0 or -1).
     * The negation is done in int, so shiftBits == -128 is handled as well. */
    nBits = (shiftBits < -7) ? 7 : -shiftBits;
  }
 #ifndef ARM_MATH_CM0_FAMILY
 /* Run the below code for Cortex-M4 and Cortex-M3 */
  /*loop Unrolling */
  blkCnt = blockSize >> 2u;
  /* If the shift value is non-negative then do left shift else right shift */
  if(shiftBits >= 0)
  {
    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** The loop after the #endif computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
      /* C = A << shiftBits */
      /* Read 4 inputs */
      in1 = *pSrc;
      in2 = *(pSrc + 1);
      in3 = *(pSrc + 2);
      in4 = *(pSrc + 3);
      /* Saturate each shifted value, pack the four results and store them with one 32-bit write */
      *__SIMD32(pDst)++ = __PACKq7(__SSAT((in1 << nBits), 8),
                                   __SSAT((in2 << nBits), 8),
                                   __SSAT((in3 << nBits), 8),
                                   __SSAT((in4 << nBits), 8));
      /* Update source pointer to process next samples */
      pSrc += 4u;
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** The loop after the #endif computes the remaining 1 to 3 samples. */
    while(blkCnt > 0u)
    {
      /* C = A >> -shiftBits */
      /* Read 4 inputs */
      in1 = *pSrc;
      in2 = *(pSrc + 1);
      in3 = *(pSrc + 2);
      in4 = *(pSrc + 3);
      /* Pack the four shifted results and store them with one 32-bit write */
      *__SIMD32(pDst)++ = __PACKq7((in1 >> nBits), (in2 >> nBits),
                                   (in3 >> nBits), (in4 >> nBits));
      /* Update source pointer to process next samples */
      pSrc += 4u;
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  /* If the blockSize is not a multiple of 4, compute any remaining output samples below.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;
 #else
  /* Run the below code for Cortex-M0 */
  /* Initialize blkCnt with number of samples */
  blkCnt = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  if(shiftBits >= 0)
  {
    while(blkCnt > 0u)
    {
      /* C = A << shiftBits */
      /* Shift, saturate, and then store the result in the destination buffer. */
      *pDst++ = (q7_t) __SSAT((*pSrc++ << nBits), 8);
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    while(blkCnt > 0u)
    {
      /* C = A >> -shiftBits */
      /* Shift the input and then store the result in the destination buffer. */
      *pDst++ = (*pSrc++ >> nBits);
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
 }
 /**        
 * @} end of shift group        
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_sub_f32.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_sub_f32.c
@@ -0,0 +1,150 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_sub_f32.c    
 *    
 * Description:	Floating-point vector subtraction.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupMath        
 */
 /**        
 * @defgroup BasicSub Vector Subtraction        
 *        
 * Element-by-element subtraction of two vectors.        
 *        
 * <pre>        
 *     pDst[n] = pSrcA[n] - pSrcB[n],   0 <= n < blockSize.        
 * </pre>        
 *        
 * There are separate functions for floating-point, Q7, Q15, and Q31 data types.        
 */
 /**        
 * @addtogroup BasicSub        
 * @{        
 */
 /**
 * @brief Floating-point vector subtraction.
 * @param[in]       *pSrcA points to the first input vector (values subtracted from)
 * @param[in]       *pSrcB points to the second input vector (values being subtracted)
 * @param[out]      *pDst points to the output vector
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 */
 void arm_sub_f32(
  float32_t * pSrcA,
  float32_t * pSrcB,
  float32_t * pDst,
  uint32_t blockSize)
 {
  uint32_t remaining;                            /* samples (or groups of 4) still to process */
 #ifndef ARM_MATH_CM0_FAMILY
 /* Cortex-M4 and Cortex-M3: handle groups of four samples first */
  float32_t a0, a1, a2, a3;                      /* samples loaded from pSrcA */
  float32_t b0, b1, b2, b3;                      /* samples loaded from pSrcB */
  for(remaining = blockSize >> 2u; remaining > 0u; remaining--)
  {
    /* Load the whole group before writing any output */
    a0 = pSrcA[0];
    b0 = pSrcB[0];
    a1 = pSrcA[1];
    b1 = pSrcB[1];
    a2 = pSrcA[2];
    b2 = pSrcB[2];
    a3 = pSrcA[3];
    b3 = pSrcB[3];
    /* dst = srcA - srcB */
    pDst[0] = a0 - b0;
    pDst[1] = a1 - b1;
    pDst[2] = a2 - b2;
    pDst[3] = a3 - b3;
    /* Step all three pointers past the group just processed */
    pSrcA += 4u;
    pSrcB += 4u;
    pDst += 4u;
  }
  /* 0 to 3 samples are left when blockSize is not a multiple of 4 */
  remaining = blockSize & 0x3u;
 #else
  /* Cortex-M0: no unrolling, process every sample in the loop below */
  remaining = blockSize;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; remaining > 0u; remaining--)
  {
    /* C = A - B, one sample at a time */
    *pDst = *pSrcA - *pSrcB;
    pSrcA++;
    pSrcB++;
    pDst++;
  }
 }
 /**        
 * @} end of BasicSub group        
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_sub_q15.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_sub_q15.c
@@ -0,0 +1,140 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_sub_q15.c    
 *    
 * Description:	Q15 vector subtraction.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup BasicSub    
 * @{    
 */
 /**
 * @brief Q15 vector subtraction.
 * @param[in]       *pSrcA points to the first input vector (minuend)
 * @param[in]       *pSrcB points to the second input vector (subtrahend)
 * @param[out]      *pDst points to the output vector, pDst[n] = pSrcA[n] - pSrcB[n]
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
 */
 void arm_sub_q15(
   q15_t * pSrcA,
   q15_t * pSrcB,
   q15_t * pDst,
   uint32_t blockSize)
 {
   uint32_t remaining;                            /* iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
   /* Cortex-M4 / Cortex-M3 build */
   q31_t wordA0, wordA1;                          /* packed pairs of samples from pSrcA */
   q31_t wordB0, wordB1;                          /* packed pairs of samples from pSrcB */
   /* Unrolled part: blockSize / 4 passes, four samples per pass.
    ** Every __SIMD32 access moves two Q15 samples as one 32-bit word
    ** and advances the underlying pointer past them. */
   for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
   {
     /* Fetch four samples of A, then four samples of B */
     wordA0 = *__SIMD32(pSrcA)++;
     wordA1 = *__SIMD32(pSrcA)++;
     wordB0 = *__SIMD32(pSrcB)++;
     wordB1 = *__SIMD32(pSrcB)++;
     /* C = A - B on both halfwords of each word, then store */
     *__SIMD32(pDst)++ = __QSUB16(wordA0, wordB0);
     *__SIMD32(pDst)++ = __QSUB16(wordA1, wordB1);
   }
   /* Tail: the 0 to 3 samples left when blockSize is not a multiple of 4,
    ** handled one sample at a time. */
   for (remaining = blockSize % 0x4u; remaining > 0u; remaining--)
   {
     /* C = A - B; only the low halfword of the result is kept by the cast */
     *pDst++ = (q15_t) __QSUB16(*pSrcA++, *pSrcB++);
   }
 #else
   /* Cortex-M0 build: plain sample-by-sample loop over the whole block */
   for (remaining = blockSize; remaining > 0u; remaining--)
   {
     /* C = A - B, computed in 32 bits and saturated back to 16 bits */
     *pDst++ = (q15_t) __SSAT(((q31_t) * pSrcA++ - *pSrcB++), 16);
   }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of BasicSub group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_sub_q31.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_sub_q31.c
@@ -0,0 +1,146 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_sub_q31.c    
 *    
 * Description:	Q31 vector subtraction.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup BasicSub    
 * @{    
 */
 /**
 * @brief Q31 vector subtraction.
 * @param[in]       *pSrcA points to the first input vector (minuend)
 * @param[in]       *pSrcB points to the second input vector (subtrahend)
 * @param[out]      *pDst points to the output vector, pDst[n] = pSrcA[n] - pSrcB[n]
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
 */
 void arm_sub_q31(
   q31_t * pSrcA,
   q31_t * pSrcB,
   q31_t * pDst,
   uint32_t blockSize)
 {
   uint32_t remaining;                            /* iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
   /* Cortex-M4 / Cortex-M3 build */
   q31_t a0, a1, a2, a3;                          /* four samples from pSrcA */
   q31_t b0, b1, b2, b3;                          /* four samples from pSrcB */
   /* Unrolled part: blockSize / 4 passes, four outputs per pass. */
   for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
   {
     /* Load all eight operands before anything is written to pDst */
     a0 = pSrcA[0];
     a1 = pSrcA[1];
     a2 = pSrcA[2];
     a3 = pSrcA[3];
     b0 = pSrcB[0];
     b1 = pSrcB[1];
     b2 = pSrcB[2];
     b3 = pSrcB[3];
     /* C = A - B, results stored in ascending order */
     pDst[0] = __QSUB(a0, b0);
     pDst[1] = __QSUB(a1, b1);
     pDst[2] = __QSUB(a2, b2);
     pDst[3] = __QSUB(a3, b3);
     /* Step every vector past the four samples just handled */
     pSrcA += 4u;
     pSrcB += 4u;
     pDst += 4u;
   }
   /* Tail: the 0 to 3 samples left when blockSize is not a multiple of 4,
    ** handled one sample at a time. */
   for (remaining = blockSize % 0x4u; remaining > 0u; remaining--)
   {
     /* C = A - B */
     *pDst++ = __QSUB(*pSrcA++, *pSrcB++);
   }
 #else
   /* Cortex-M0 build: plain sample-by-sample loop over the whole block */
   for (remaining = blockSize; remaining > 0u; remaining--)
   {
     /* C = A - B, computed in 64 bits and clipped back to the Q31 range */
     *pDst++ = (q31_t) clip_q63_to_q31((q63_t) * pSrcA++ - *pSrcB++);
   }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of BasicSub group    
 */
--- a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_sub_q7.c
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_sub_q7.c
@@ -0,0 +1,131 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_sub_q7.c    
 *    
 * Description:	Q7 vector subtraction.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupMath    
 */
 /**    
 * @addtogroup BasicSub    
 * @{    
 */
 /**
 * @brief Q7 vector subtraction.
 * @param[in]       *pSrcA points to the first input vector (minuend)
 * @param[in]       *pSrcB points to the second input vector (subtrahend)
 * @param[out]      *pDst points to the output vector, pDst[n] = pSrcA[n] - pSrcB[n]
 * @param[in]       blockSize number of samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
 */
 void arm_sub_q7(
   q7_t * pSrcA,
   q7_t * pSrcB,
   q7_t * pDst,
   uint32_t blockSize)
 {
   uint32_t remaining;                            /* iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
   /* Cortex-M4 / Cortex-M3 build */
   /* Unrolled part: blockSize / 4 passes.  A single __SIMD32 access covers
    ** four Q7 samples, so one statement produces four outputs. */
   for (remaining = blockSize >> 2u; remaining > 0u; remaining--)
   {
     /* C = A - B on four packed samples, then store the packed result */
     *__SIMD32(pDst)++ = __QSUB8(*__SIMD32(pSrcA)++, *__SIMD32(pSrcB)++);
   }
   /* Tail: the 0 to 3 samples left when blockSize is not a multiple of 4,
    ** handled one sample at a time. */
   for (remaining = blockSize % 0x4u; remaining > 0u; remaining--)
   {
     /* C = A - B, saturated to 8 bits */
     *pDst++ = __SSAT(*pSrcA++ - *pSrcB++, 8);
   }
 #else
   /* Cortex-M0 build: plain sample-by-sample loop over the whole block */
   for (remaining = blockSize; remaining > 0u; remaining--)
   {
     /* C = A - B, saturated to 8 bits */
     *pDst++ = (q7_t) __SSAT((q15_t) * pSrcA++ - *pSrcB++, 8);
   }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of BasicSub group    
 */
--- a/CMSIS/DSP_Lib/Source/CommonTables/arm_common_tables.c
+++ b/CMSIS/DSP_Lib/Source/CommonTables/arm_common_tables.c
--- a/CMSIS/DSP_Lib/Source/CommonTables/arm_const_structs.c
+++ b/CMSIS/DSP_Lib/Source/CommonTables/arm_const_structs.c
@@ -0,0 +1,156 @@
 /* ---------------------------------------------------------------------- 
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved. 
 * 
 * $Date:        31. July 2014 
 * $Revision: 	V1.4.4
 * 
 * Project: 	    CMSIS DSP Library 
 * Title:	    arm_const_structs.c 
 * 
 * Description:	This file has constant structs that are initialized for
 *              user convenience.  For example, some can be given as 
 *              arguments to the arm_cfft_f32() function.
 * 
 * Target Processor: Cortex-M4/Cortex-M3
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * -------------------------------------------------------------------- */
 #include "arm_const_structs.h"
 /* Floating-point structs.
  * One constant arm_cfft_instance_f32 per supported length (16 ... 4096).
  * Each initializer lists four positional values: the length, the matching
  * twiddleCoef_<N> table, the matching armBitRevIndexTable<N> table and that
  * table's *_TABLE_LENGTH macro.
  * NOTE(review): the member names these values map to are declared in
  * arm_math.h, which is not visible here - confirm the order against the
  * struct declaration before reordering anything. */
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len16 = {
 	16, twiddleCoef_16, armBitRevIndexTable16, ARMBITREVINDEXTABLE__16_TABLE_LENGTH
 };
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len32 = {
 	32, twiddleCoef_32, armBitRevIndexTable32, ARMBITREVINDEXTABLE__32_TABLE_LENGTH
 };
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len64 = {
 	64, twiddleCoef_64, armBitRevIndexTable64, ARMBITREVINDEXTABLE__64_TABLE_LENGTH
 };
 /* From here on the length macros carry one underscore fewer per extra digit
  * (..._128_, ...1024_), so the macro names stay the same width. */
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len128 = {
 	128, twiddleCoef_128, armBitRevIndexTable128, ARMBITREVINDEXTABLE_128_TABLE_LENGTH
 };
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len256 = {
 	256, twiddleCoef_256, armBitRevIndexTable256, ARMBITREVINDEXTABLE_256_TABLE_LENGTH
 };
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len512 = {
 	512, twiddleCoef_512, armBitRevIndexTable512, ARMBITREVINDEXTABLE_512_TABLE_LENGTH
 };
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len1024 = {
 	1024, twiddleCoef_1024, armBitRevIndexTable1024, ARMBITREVINDEXTABLE1024_TABLE_LENGTH
 };
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len2048 = {
 	2048, twiddleCoef_2048, armBitRevIndexTable2048, ARMBITREVINDEXTABLE2048_TABLE_LENGTH
 };
 const arm_cfft_instance_f32 arm_cfft_sR_f32_len4096 = {
 	4096, twiddleCoef_4096, armBitRevIndexTable4096, ARMBITREVINDEXTABLE4096_TABLE_LENGTH
 };
 /* Fixed-point structs, Q31 variants.
  * One constant arm_cfft_instance_q31 per supported length (16 ... 4096).
  * Each initializer lists four positional values: the length, the matching
  * twiddleCoef_<N>_q31 table, the armBitRevIndexTable_fixed_<N> table and
  * that table's ARMBITREVINDEXTABLE_FIXED_*_TABLE_LENGTH macro.
  * NOTE(review): the member names these values map to are declared in
  * arm_math.h, which is not visible here - confirm against the struct
  * declaration. */
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len16 = {
 	16, twiddleCoef_16_q31, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED___16_TABLE_LENGTH
 };
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len32 = {
 	32, twiddleCoef_32_q31, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED___32_TABLE_LENGTH
 };
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len64 = {
 	64, twiddleCoef_64_q31, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED___64_TABLE_LENGTH
 };
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len128 = {
 	128, twiddleCoef_128_q31, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED__128_TABLE_LENGTH
 };
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len256 = {
 	256, twiddleCoef_256_q31, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED__256_TABLE_LENGTH
 };
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len512 = {
 	512, twiddleCoef_512_q31, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED__512_TABLE_LENGTH
 };
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len1024 = {
 	1024, twiddleCoef_1024_q31, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
 };
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len2048 = {
 	2048, twiddleCoef_2048_q31, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
 };
 const arm_cfft_instance_q31 arm_cfft_sR_q31_len4096 = {
 	4096, twiddleCoef_4096_q31, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
 };
 /* Fixed-point structs, Q15 variants.
  * One constant arm_cfft_instance_q15 per supported length (16 ... 4096).
  * Only the twiddle table differs from the Q31 instances of the same length
  * (twiddleCoef_<N>_q15); the armBitRevIndexTable_fixed_<N> table and its
  * ARMBITREVINDEXTABLE_FIXED_*_TABLE_LENGTH macro are the same ones the Q31
  * instances reference.
  * NOTE(review): the member names these values map to are declared in
  * arm_math.h, which is not visible here - confirm against the struct
  * declaration. */
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len16 = {
 	16, twiddleCoef_16_q15, armBitRevIndexTable_fixed_16, ARMBITREVINDEXTABLE_FIXED___16_TABLE_LENGTH
 };
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len32 = {
 	32, twiddleCoef_32_q15, armBitRevIndexTable_fixed_32, ARMBITREVINDEXTABLE_FIXED___32_TABLE_LENGTH
 };
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len64 = {
 	64, twiddleCoef_64_q15, armBitRevIndexTable_fixed_64, ARMBITREVINDEXTABLE_FIXED___64_TABLE_LENGTH
 };
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len128 = {
 	128, twiddleCoef_128_q15, armBitRevIndexTable_fixed_128, ARMBITREVINDEXTABLE_FIXED__128_TABLE_LENGTH
 };
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len256 = {
 	256, twiddleCoef_256_q15, armBitRevIndexTable_fixed_256, ARMBITREVINDEXTABLE_FIXED__256_TABLE_LENGTH
 };
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len512 = {
 	512, twiddleCoef_512_q15, armBitRevIndexTable_fixed_512, ARMBITREVINDEXTABLE_FIXED__512_TABLE_LENGTH
 };
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len1024 = {
 	1024, twiddleCoef_1024_q15, armBitRevIndexTable_fixed_1024, ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH
 };
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len2048 = {
 	2048, twiddleCoef_2048_q15, armBitRevIndexTable_fixed_2048, ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH
 };
 const arm_cfft_instance_q15 arm_cfft_sR_q15_len4096 = {
 	4096, twiddleCoef_4096_q15, armBitRevIndexTable_fixed_4096, ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH
 };
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
@@ -0,0 +1,182 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_conj_f32.c    
 *    
 * Description:	Floating-point complex conjugate.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupCmplxMath        
 */
 /**        
 * @defgroup cmplx_conj Complex Conjugate        
 *        
 * Conjugates the elements of a complex data vector.        
 *       
 * The <code>pSrc</code> points to the source data and        
 * <code>pDst</code> points to where the result should be written.
 * <code>numSamples</code> specifies the number of complex samples        
 * and the data in each array is stored in an interleaved fashion        
 * (real, imag, real, imag, ...).        
 * Each array has a total of <code>2*numSamples</code> values.        
 * The following algorithm is used:
 *        
 * <pre>        
 * for(n=0; n<numSamples; n++) {        
 *     pDst[(2*n)+0] = pSrc[(2*n)+0];     // real part
 *     pDst[(2*n)+1] = -pSrc[(2*n)+1];    // imag part
 * }        
 * </pre>        
 *        
 * There are separate functions for floating-point, Q15, and Q31 data types.        
 */
 /**        
 * @addtogroup cmplx_conj        
 * @{        
 */
 /**
 * @brief  Floating-point complex conjugate.
 * @param  *pSrc points to the input vector, stored as interleaved (real, imag) pairs
 * @param  *pDst points to the output vector, stored as interleaved (real, imag) pairs
 * @param  numSamples number of complex samples in each vector
 * @return none.
 */
 void arm_cmplx_conj_f32(
   float32_t * pSrc,
   float32_t * pDst,
   uint32_t numSamples)
 {
   uint32_t remaining;                            /* complex samples left for the scalar loop */
 #ifndef ARM_MATH_CM0_FAMILY
   /* Cortex-M4 / Cortex-M3 build */
   uint32_t quads;                                /* unrolled passes left, 4 complex samples each */
   float32_t negIm0, negIm1, negIm2, negIm3;      /* negated imaginary parts */
   /* Unrolled part: numSamples / 4 passes, four complex samples
    ** (eight floats) per pass. */
   for (quads = numSamples >> 2u; quads > 0u; quads--)
   {
     /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
     /* Real parts sit at the even indices and are copied unchanged */
     pDst[0] = pSrc[0];
     pDst[2] = pSrc[2];
     pDst[4] = pSrc[4];
     pDst[6] = pSrc[6];
     /* Imaginary parts sit at the odd indices: read and negate all four
      ** before any of them is written back */
     negIm0 = -pSrc[1];
     negIm1 = -pSrc[3];
     negIm2 = -pSrc[5];
     negIm3 = -pSrc[7];
     pDst[1] = negIm0;
     pDst[3] = negIm1;
     pDst[5] = negIm2;
     pDst[7] = negIm3;
     /* Step both vectors past the eight floats just handled */
     pSrc += 8u;
     pDst += 8u;
   }
   /* The 0 to 3 complex samples not covered above go through the scalar loop */
   remaining = numSamples % 0x4u;
 #else
   /* Cortex-M0 build: every complex sample goes through the scalar loop */
   remaining = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
   for (; remaining > 0u; remaining--)
   {
     /* realOut + j (imagOut) = realIn + j (-1) imagIn */
     *pDst++ = *pSrc++;
     *pDst++ = -*pSrc++;
   }
 }
 /**        
 * @} end of cmplx_conj group        
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
@@ -0,0 +1,161 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_conj_q15.c    
 *    
 * Description:	Q15 complex conjugate.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @addtogroup cmplx_conj    
 * @{    
 */
 /**
 * @brief  Q15 complex conjugate.
 * @param  *pSrc points to the input vector, stored as interleaved (real, imag) pairs
 * @param  *pDst points to the output vector, stored as interleaved (real, imag) pairs
 * @param  numSamples number of complex samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function uses saturating arithmetic.
 * The Q15 value -1 (0x8000) will be saturated to the maximum allowable positive value 0x7FFF.
 */
 void arm_cmplx_conj_q15(
   q15_t * pSrc,
   q15_t * pDst,
   uint32_t numSamples)
 {
 #ifndef ARM_MATH_CM0_FAMILY
   /* Cortex-M4 / Cortex-M3 build */
   uint32_t remaining;                            /* iterations left in the current loop */
   q31_t cplx0, cplx1, cplx2, cplx3;              /* one packed (real, imag) pair per word */
   q31_t zero = 0;                                /* first operand of the exchange intrinsics */
   /* Unrolled part: numSamples / 4 passes, four complex samples per pass.
    ** Each __SIMD32 access moves one complex sample (two Q15 values) as a
    ** 32-bit word and advances the underlying pointer past it. */
   for (remaining = numSamples >> 2u; remaining > 0u; remaining--)
   {
     /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
     cplx0 = *__SIMD32(pSrc)++;
     cplx1 = *__SIMD32(pSrc)++;
     cplx2 = *__SIMD32(pSrc)++;
     cplx3 = *__SIMD32(pSrc)++;
     /* Which halfword holds the imaginary part depends on the byte order,
      ** hence the two variants of the exchange intrinsic. */
 #ifndef ARM_MATH_BIG_ENDIAN
     cplx0 = __QASX(zero, cplx0);
     cplx1 = __QASX(zero, cplx1);
     cplx2 = __QASX(zero, cplx2);
     cplx3 = __QASX(zero, cplx3);
 #else
     cplx0 = __QSAX(zero, cplx0);
     cplx1 = __QSAX(zero, cplx1);
     cplx2 = __QSAX(zero, cplx2);
     cplx3 = __QSAX(zero, cplx3);
 #endif //       #ifndef ARM_MATH_BIG_ENDIAN
     /* Swap the two halfwords of every word (rotate by 16 bits) */
     cplx0 = ((uint32_t) cplx0 >> 16) | ((uint32_t) cplx0 << 16);
     cplx1 = ((uint32_t) cplx1 >> 16) | ((uint32_t) cplx1 << 16);
     cplx2 = ((uint32_t) cplx2 >> 16) | ((uint32_t) cplx2 << 16);
     cplx3 = ((uint32_t) cplx3 >> 16) | ((uint32_t) cplx3 << 16);
     /* Write the four conjugated samples back as packed words */
     *__SIMD32(pDst)++ = cplx0;
     *__SIMD32(pDst)++ = cplx1;
     *__SIMD32(pDst)++ = cplx2;
     *__SIMD32(pDst)++ = cplx3;
   }
   /* Tail: the 0 to 3 complex samples left when numSamples is not a
    ** multiple of 4, handled one sample at a time. */
   for (remaining = numSamples % 0x4u; remaining > 0u; remaining--)
   {
     /* C[0]+jC[1] = A[0]+ j (-1) A[1] */
     *pDst++ = *pSrc++;
     *pDst++ = __SSAT(-*pSrc++, 16);
   }
 #else
   /* Cortex-M0 build: one complex sample per pass, counting numSamples down */
   q15_t imag;                                    /* imaginary part of the current sample */
   for (; numSamples > 0u; numSamples--)
   {
     /* realOut + j (imagOut) = realIn+ j (-1) imagIn */
     *pDst++ = *pSrc++;
     imag = *pSrc++;
     /* 0x8000 has no positive counterpart in Q15, so it maps to 0x7fff */
     if (imag == (q15_t) 0x8000)
     {
       *pDst++ = 0x7fff;
     }
     else
     {
       *pDst++ = -imag;
     }
   }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of cmplx_conj group    
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
@@ -0,0 +1,180 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_conj_q31.c    
 *    
 * Description:	Q31 complex conjugate.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupCmplxMath        
 */
 /**        
 * @addtogroup cmplx_conj        
 * @{        
 */
 /**
 * @brief  Q31 complex conjugate.
 * @param  *pSrc points to the input vector
 * @param  *pDst points to the output vector
 * @param  numSamples number of complex samples in each vector
 * @return none.
 *
 * Every complex sample occupies two consecutive values (real, imag): the real
 * part is copied unchanged and the imaginary part is negated.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The negation saturates.
 * The Q31 value -1 (0x80000000) is mapped to the largest positive value 0x7FFFFFFF.
 */
 void arm_cmplx_conj_q31(
   q31_t * pSrc,
   q31_t * pDst,
   uint32_t numSamples)
 {
   uint32_t remaining;                            /* complex samples still to be processed */
   q31_t imag;                                    /* imaginary part handled by the scalar loop */
 #ifndef ARM_MATH_CM0_FAMILY
   /* Cortex-M4 / Cortex-M3 build: four complex samples per pass */
   q31_t im0, im1, im2, im3;                      /* negated imaginary parts awaiting storage */
   for(remaining = numSamples >> 2u; remaining > 0u; remaining--)
   {
     /* Real parts are forwarded as soon as they are read; the negated
      ** imaginary parts are written back only after all eight inputs of
      ** this pass have been loaded. __QSUB(0, x) yields -x with saturation. */
     pDst[0] = pSrc[0];
     im0 = __QSUB(0, pSrc[1]);
     pDst[2] = pSrc[2];
     im1 = __QSUB(0, pSrc[3]);
     pDst[4] = pSrc[4];
     im2 = __QSUB(0, pSrc[5]);
     pDst[6] = pSrc[6];
     im3 = __QSUB(0, pSrc[7]);
     pDst[1] = im0;
     pDst[3] = im1;
     pDst[5] = im2;
     pDst[7] = im3;
     /* step both vectors past the four samples just handled */
     pSrc += 8u;
     pDst += 8u;
   }
   /* 0 to 3 samples are left over when numSamples is not a multiple of 4 */
   remaining = numSamples & 0x3u;
 #else
   /* Cortex-M0 build: every sample goes through the scalar loop */
   remaining = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
   for(; remaining > 0u; remaining--)
   {
     /* out = real - j * imag */
     pDst[0] = pSrc[0];
     imag = pSrc[1];
     if(imag == INT32_MIN)
     {
       /* -(-1.0) is not representable in Q31: clamp to the maximum */
       pDst[1] = INT32_MAX;
     }
     else
     {
       pDst[1] = -imag;
     }
     pSrc += 2u;
     pDst += 2u;
   }
 }
 /**        
 * @} end of cmplx_conj group        
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
@@ -0,0 +1,203 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_dot_prod_f32.c    
 *    
 * Description:	Floating-point complex dot product    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @defgroup cmplx_dot_prod Complex Dot Product    
 *    
 * Computes the dot product of two complex vectors.    
 * The vectors are multiplied element-by-element and then summed.    
 *   
 * The <code>pSrcA</code> points to the first complex input vector and    
 * <code>pSrcB</code> points to the second complex input vector.    
 * <code>numSamples</code> specifies the number of complex samples    
 * and the data in each array is stored in an interleaved fashion    
 * (real, imag, real, imag, ...).    
 * Each array has a total of <code>2*numSamples</code> values.    
 *    
 * The following algorithm is used:
 * <pre>    
 * realResult=0;    
 * imagResult=0;    
 * for(n=0; n<numSamples; n++) {    
 *     realResult += pSrcA[(2*n)+0]*pSrcB[(2*n)+0] - pSrcA[(2*n)+1]*pSrcB[(2*n)+1];    
 *     imagResult += pSrcA[(2*n)+0]*pSrcB[(2*n)+1] + pSrcA[(2*n)+1]*pSrcB[(2*n)+0];    
 * }    
 * </pre>    
 *    
 * There are separate functions for floating-point, Q15, and Q31 data types.    
 */
 /**    
 * @addtogroup cmplx_dot_prod    
 * @{    
 */
 /**
 * @brief  Floating-point complex dot product
 * @param  *pSrcA points to the first input vector
 * @param  *pSrcB points to the second input vector
 * @param  numSamples number of complex samples in each vector
 * @param  *realResult real part of the result returned here
 * @param  *imagResult imaginary part of the result returned here
 * @return none.
 *
 * Both vectors are read as interleaved (real, imag) pairs. The products of
 * corresponding samples are accumulated in single precision and the two sums
 * are written out once, after the last sample has been consumed.
 */
 void arm_cmplx_dot_prod_f32(
   float32_t * pSrcA,
   float32_t * pSrcB,
   uint32_t numSamples,
   float32_t * realResult,
   float32_t * imagResult)
 {
   float32_t accRe = 0.0f, accIm = 0.0f;          /* running real / imaginary sums */
   float32_t aRe, aIm, bRe, bIm;                  /* current sample of A and of B */
   uint32_t count;                                /* complex samples still to be processed */
 #ifndef ARM_MATH_CM0_FAMILY
   /* Cortex-M4 / Cortex-M3 build: consume four complex samples per pass */
   for(count = numSamples >> 2u; count != 0u; count--)
   {
     /* sample 0: (aRe + j aIm) * (bRe + j bIm) */
     aRe = pSrcA[0];
     aIm = pSrcA[1];
     bRe = pSrcB[0];
     bIm = pSrcB[1];
     accRe += aRe * bRe;
     accIm += aRe * bIm;
     accRe -= aIm * bIm;
     accIm += aIm * bRe;
     /* sample 1 */
     aRe = pSrcA[2];
     aIm = pSrcA[3];
     bRe = pSrcB[2];
     bIm = pSrcB[3];
     accRe += aRe * bRe;
     accIm += aRe * bIm;
     accRe -= aIm * bIm;
     accIm += aIm * bRe;
     /* sample 2 */
     aRe = pSrcA[4];
     aIm = pSrcA[5];
     bRe = pSrcB[4];
     bIm = pSrcB[5];
     accRe += aRe * bRe;
     accIm += aRe * bIm;
     accRe -= aIm * bIm;
     accIm += aIm * bRe;
     /* sample 3 */
     aRe = pSrcA[6];
     aIm = pSrcA[7];
     bRe = pSrcB[6];
     bIm = pSrcB[7];
     accRe += aRe * bRe;
     accIm += aRe * bIm;
     accRe -= aIm * bIm;
     accIm += aIm * bRe;
     /* step both inputs past the four samples just consumed */
     pSrcA += 8u;
     pSrcB += 8u;
   }
   /* 0 to 3 samples are left over when numSamples is not a multiple of 4 */
   count = numSamples & 0x3u;
 #else
   /* Cortex-M0 build: every sample goes through the scalar loop */
   count = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
   for(; count != 0u; count--)
   {
     aRe = pSrcA[0];
     aIm = pSrcA[1];
     bRe = pSrcB[0];
     bIm = pSrcB[1];
     accRe += aRe * bRe;
     accIm += aRe * bIm;
     accRe -= aIm * bIm;
     accIm += aIm * bRe;
     pSrcA += 2u;
     pSrcB += 2u;
   }
   /* Hand the accumulated sums back to the caller */
   *realResult = accRe;
   *imagResult = accIm;
 }
 /**    
 * @} end of cmplx_dot_prod group    
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
@@ -0,0 +1,189 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_dot_prod_q15.c    
 *    
 * Description:	Processing function for the Q15 Complex Dot product    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @addtogroup cmplx_dot_prod    
 * @{    
 */
 /**
 * @brief  Q15 complex dot product
 * @param  *pSrcA points to the first input vector
 * @param  *pSrcB points to the second input vector
 * @param  numSamples number of complex samples in each vector
 * @param  *realResult real part of the result returned here
 * @param  *imagResult imaginary part of the result returned here
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function is implemented using an internal 64-bit accumulator.
 * The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
 * These are accumulated in a 64-bit accumulator with 34.30 precision.
 * As a final step, the accumulators are shifted right by 6 bits and narrowed to 32 bits
 * without saturation, giving 8.24 format.
 * The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
 */
 void arm_cmplx_dot_prod_q15(
   q15_t * pSrcA,
   q15_t * pSrcB,
   uint32_t numSamples,
   q31_t * realResult,
   q31_t * imagResult)
 {
   q63_t real_sum = 0, imag_sum = 0;              /* 34.30 accumulators */
   q15_t a0, b0, c0, d0;                          /* a0 + j b0 from A, c0 + j d0 from B */
   uint32_t blkCnt;                               /* loop counter */
 #ifndef ARM_MATH_CM0_FAMILY
   /* Run the below code for Cortex-M4 and Cortex-M3 */
   /* Loop unrolling: consume 4 complex samples per pass.
    ** The scalar loop below handles the remaining 0 to 3 samples. */
   blkCnt = numSamples >> 2u;
   while(blkCnt > 0u)
   {
     a0 = *pSrcA++;
     b0 = *pSrcA++;
     c0 = *pSrcB++;
     d0 = *pSrcB++;
     real_sum += (q31_t) a0 * c0;
     imag_sum += (q31_t) a0 * d0;
     real_sum -= (q31_t) b0 * d0;
     imag_sum += (q31_t) b0 * c0;
     a0 = *pSrcA++;
     b0 = *pSrcA++;
     c0 = *pSrcB++;
     d0 = *pSrcB++;
     real_sum += (q31_t) a0 * c0;
     imag_sum += (q31_t) a0 * d0;
     real_sum -= (q31_t) b0 * d0;
     imag_sum += (q31_t) b0 * c0;
     a0 = *pSrcA++;
     b0 = *pSrcA++;
     c0 = *pSrcB++;
     d0 = *pSrcB++;
     real_sum += (q31_t) a0 * c0;
     imag_sum += (q31_t) a0 * d0;
     real_sum -= (q31_t) b0 * d0;
     imag_sum += (q31_t) b0 * c0;
     a0 = *pSrcA++;
     b0 = *pSrcA++;
     c0 = *pSrcB++;
     d0 = *pSrcB++;
     real_sum += (q31_t) a0 * c0;
     imag_sum += (q31_t) a0 * d0;
     real_sum -= (q31_t) b0 * d0;
     imag_sum += (q31_t) b0 * c0;
     /* Decrement the loop counter */
     blkCnt--;
   }
   /* Samples left over when numSamples is not a multiple of 4 */
   blkCnt = numSamples % 0x4u;
 #else
   /* Run the below code for Cortex-M0: every sample uses the scalar loop */
   blkCnt = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
   /* Scalar loop shared by both builds. Each product is widened to q31_t
    ** explicitly so the 2.30 result never depends on the width of int. */
   while(blkCnt > 0u)
   {
     a0 = *pSrcA++;
     b0 = *pSrcA++;
     c0 = *pSrcB++;
     d0 = *pSrcB++;
     real_sum += (q31_t) a0 * c0;
     imag_sum += (q31_t) a0 * d0;
     real_sum -= (q31_t) b0 * d0;
     imag_sum += (q31_t) b0 * c0;
     /* Decrement the loop counter */
     blkCnt--;
   }
   /* Store the real and imaginary results in 8.24 format  */
   /* Convert real data in 34.30 to 8.24 by 6 right shifts */
   *realResult = (q31_t) (real_sum >> 6);
   /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
   *imagResult = (q31_t) (imag_sum >> 6);
 }
 /**    
 * @} end of cmplx_dot_prod group    
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
@@ -0,0 +1,187 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_dot_prod_q31.c    
 *    
 * Description:	Q31 complex dot product    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @addtogroup cmplx_dot_prod    
 * @{    
 */
 /**
 * @brief  Q31 complex dot product
 * @param  *pSrcA points to the first input vector
 * @param  *pSrcB points to the second input vector
 * @param  numSamples number of complex samples in each vector
 * @param  *realResult real part of the result returned here
 * @param  *imagResult imaginary part of the result returned here
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function is implemented using an internal 64-bit accumulator.
 * Each 1.31 by 1.31 product is formed with 64-bit precision and shifted right by 14 bits before it is accumulated.
 * The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
 * Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
 * The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
 * Input down scaling is not required.
 */
 void arm_cmplx_dot_prod_q31(
   q31_t * pSrcA,
   q31_t * pSrcB,
   uint32_t numSamples,
   q63_t * realResult,
   q63_t * imagResult)
 {
   q63_t accRe = 0, accIm = 0;                    /* 16.48 running sums */
   q31_t aRe, aIm, bRe, bIm;                      /* current sample of A and of B */
   uint32_t count;                                /* complex samples still to be processed */
 #ifndef ARM_MATH_CM0_FAMILY
   /* Cortex-M4 / Cortex-M3 build: consume four complex samples per pass */
   for(count = numSamples >> 2u; count != 0u; count--)
   {
     /* sample 0: (aRe + j aIm) * (bRe + j bIm) */
     aRe = pSrcA[0];
     aIm = pSrcA[1];
     bRe = pSrcB[0];
     bIm = pSrcB[1];
     accRe += ((q63_t) aRe * bRe) >> 14;
     accIm += ((q63_t) aRe * bIm) >> 14;
     accRe -= ((q63_t) aIm * bIm) >> 14;
     accIm += ((q63_t) aIm * bRe) >> 14;
     /* sample 1 */
     aRe = pSrcA[2];
     aIm = pSrcA[3];
     bRe = pSrcB[2];
     bIm = pSrcB[3];
     accRe += ((q63_t) aRe * bRe) >> 14;
     accIm += ((q63_t) aRe * bIm) >> 14;
     accRe -= ((q63_t) aIm * bIm) >> 14;
     accIm += ((q63_t) aIm * bRe) >> 14;
     /* sample 2 */
     aRe = pSrcA[4];
     aIm = pSrcA[5];
     bRe = pSrcB[4];
     bIm = pSrcB[5];
     accRe += ((q63_t) aRe * bRe) >> 14;
     accIm += ((q63_t) aRe * bIm) >> 14;
     accRe -= ((q63_t) aIm * bIm) >> 14;
     accIm += ((q63_t) aIm * bRe) >> 14;
     /* sample 3 */
     aRe = pSrcA[6];
     aIm = pSrcA[7];
     bRe = pSrcB[6];
     bIm = pSrcB[7];
     accRe += ((q63_t) aRe * bRe) >> 14;
     accIm += ((q63_t) aRe * bIm) >> 14;
     accRe -= ((q63_t) aIm * bIm) >> 14;
     accIm += ((q63_t) aIm * bRe) >> 14;
     /* step both inputs past the four samples just consumed */
     pSrcA += 8u;
     pSrcB += 8u;
   }
   /* 0 to 3 samples are left over when numSamples is not a multiple of 4 */
   count = numSamples & 0x3u;
 #else
   /* Cortex-M0 build: every sample goes through the scalar loop */
   count = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
   for(; count != 0u; count--)
   {
     aRe = pSrcA[0];
     aIm = pSrcA[1];
     bRe = pSrcB[0];
     bIm = pSrcB[1];
     accRe += ((q63_t) aRe * bRe) >> 14;
     accIm += ((q63_t) aRe * bIm) >> 14;
     accRe -= ((q63_t) aIm * bIm) >> 14;
     accIm += ((q63_t) aIm * bRe) >> 14;
     pSrcA += 2u;
     pSrcB += 2u;
   }
   /* Hand back both sums, still in 16.48 format */
   *realResult = accRe;
   *imagResult = accIm;
 }
 /**    
 * @} end of cmplx_dot_prod group    
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
@@ -0,0 +1,165 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_mag_f32.c    
 *    
 * Description:	Floating-point complex magnitude.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @defgroup cmplx_mag Complex Magnitude    
 *    
 * Computes the magnitude of the elements of a complex data vector.    
 *   
 * <code>pSrc</code> points to the source data and
 * <code>pDst</code> points to where the result should be written.
 * <code>numSamples</code> specifies the number of complex samples    
 * in the input array and the data is stored in an interleaved fashion    
 * (real, imag, real, imag, ...).    
 * The input array has a total of <code>2*numSamples</code> values;    
 * the output array has a total of <code>numSamples</code> values.    
 * The following algorithm is used:
 *    
 * <pre>    
 * for(n=0; n<numSamples; n++) {    
 *     pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);    
 * }    
 * </pre>    
 *    
 * There are separate functions for floating-point, Q15, and Q31 data types.    
 */
 /**    
 * @addtogroup cmplx_mag    
 * @{    
 */
 /**
 * @brief Floating-point complex magnitude.
 * @param[in]       *pSrc points to complex input buffer
 * @param[out]      *pDst points to real output buffer
 * @param[in]       numSamples number of complex samples in the input vector
 * @return none.
 *
 * Two input values (real, imag) are consumed for every output value written:
 * out = sqrt(real * real + imag * imag). The square root is delegated to
 * arm_sqrt_f32(); its return value is not inspected.
 */
 void arm_cmplx_mag_f32(
   float32_t * pSrc,
   float32_t * pDst,
   uint32_t numSamples)
 {
   float32_t re, im;                              /* parts of the sample being processed */
   uint32_t count;                                /* complex samples still to be processed */
 #ifndef ARM_MATH_CM0_FAMILY
   /* Cortex-M4 / Cortex-M3 build: produce four magnitudes per pass */
   for(count = numSamples >> 2u; count != 0u; count--)
   {
     re = pSrc[0];
     im = pSrc[1];
     arm_sqrt_f32((re * re) + (im * im), &pDst[0]);
     re = pSrc[2];
     im = pSrc[3];
     arm_sqrt_f32((re * re) + (im * im), &pDst[1]);
     re = pSrc[4];
     im = pSrc[5];
     arm_sqrt_f32((re * re) + (im * im), &pDst[2]);
     re = pSrc[6];
     im = pSrc[7];
     arm_sqrt_f32((re * re) + (im * im), &pDst[3]);
     /* eight inputs were consumed, four outputs produced */
     pSrc += 8u;
     pDst += 4u;
   }
   /* 0 to 3 samples are left over when numSamples is not a multiple of 4 */
   count = numSamples & 0x3u;
 #else
   /* Cortex-M0 build: every sample goes through the scalar loop */
   count = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
   for(; count != 0u; count--)
   {
     /* out = sqrt((real * real) + (imag * imag)) */
     re = pSrc[0];
     im = pSrc[1];
     arm_sqrt_f32((re * re) + (im * im), pDst);
     pSrc += 2u;
     pDst += 1u;
   }
 }
 /**    
 * @} end of cmplx_mag group    
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
@@ -0,0 +1,153 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_mag_q15.c    
 *    
 * Description:	Q15 complex magnitude.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @addtogroup cmplx_mag    
 * @{    
 */
 /**
 * @brief  Q15 complex magnitude
 * @param  *pSrc points to the complex input vector, stored as interleaved (real, imag) values
 * @param  *pDst points to the real output vector
 * @param  numSamples number of complex samples in the input vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
 * \par
 * real^2 + imag^2 is never negative and is at most 2^31, a value reached only when
 * both parts are -32768. On the Cortex-M4/Cortex-M3 path the 32-bit sum is therefore
 * shifted as an unsigned value, so that this largest sum is not misread as a negative
 * number before the square root is taken. The result then agrees with the Cortex-M0
 * path, which forms the sum in 64 bits before shifting.
 */
 void arm_cmplx_mag_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t numSamples)
 {
 #ifndef ARM_MATH_CM0_FAMILY
  /* Run the below code for Cortex-M4 and Cortex-M3 */
  uint32_t blkCnt;                               /* loop counter */
  q31_t in1, in2, in3, in4;                      /* packed (real, imag) input samples */
  uint32_t acc0, acc1, acc2, acc3;               /* real^2 + imag^2, kept unsigned (never negative) */
  /* loop unrolling: number of groups of 4 complex samples */
  blkCnt = numSamples >> 2u;
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* read four packed complex samples; each access also advances pSrc */
    in1 = *__SIMD32(pSrc)++;
    in2 = *__SIMD32(pSrc)++;
    in3 = *__SIMD32(pSrc)++;
    in4 = *__SIMD32(pSrc)++;
    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]): form the sum of squares first */
    acc0 = (uint32_t) __SMUAD(in1, in1);
    acc1 = (uint32_t) __SMUAD(in2, in2);
    acc2 = (uint32_t) __SMUAD(in3, in3);
    acc3 = (uint32_t) __SMUAD(in4, in4);
    /* store the result in 2.14 format in the destination buffer.
     ** The unsigned shift leaves at most 16384, which fits in q15_t. */
    arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
    arm_sqrt_q15((q15_t) (acc1 >> 17), pDst++);
    arm_sqrt_q15((q15_t) (acc2 >> 17), pDst++);
    arm_sqrt_q15((q15_t) (acc3 >> 17), pDst++);
    /* Decrement the loop counter */
    blkCnt--;
  }
  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = numSamples % 0x4u;
  while(blkCnt > 0u)
  {
    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
    in1 = *__SIMD32(pSrc)++;
    acc0 = (uint32_t) __SMUAD(in1, in1);
    /* store the result in 2.14 format in the destination buffer. */
    arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
    /* Decrement the loop counter */
    blkCnt--;
  }
 #else
  /* Run the below code for Cortex-M0 */
  q15_t real, imag;                              /* Temporary variables to hold input values */
  q31_t acc0, acc1;                              /* Squares of the real and imaginary parts */
  while(numSamples > 0u)
  {
    /* out = sqrt(real * real + imag * imag) */
    real = *pSrc++;
    imag = *pSrc++;
    acc0 = (real * real);
    acc1 = (imag * imag);
    /* add in 64 bits so the largest sum (2^31) is not lost, then store the
     ** result in 2.14 format in the destination buffer. */
    arm_sqrt_q15((q15_t) (((q63_t) acc0 + acc1) >> 17), pDst++);
    /* Decrement the loop counter */
    numSamples--;
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of cmplx_mag group    
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
@@ -0,0 +1,185 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_mag_q31.c    
 *    
 * Description:	Q31 complex magnitude    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupCmplxMath        
 */
 /**        
 * @addtogroup cmplx_mag        
 * @{        
 */
 /**
 * @brief  Q31 complex magnitude
 * @param  *pSrc points to the complex input vector; values are read as (real, imag) pairs
 * @param  *pDst points to the real output vector; one value is written per complex sample
 * @param  numSamples number of complex samples in the input vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function implements 1.31 by 1.31 multiplications and finally output is converted into 2.30 format.
 * Input down scaling is not required.
 * \par
 * Each square is formed in 64 bits and shifted right by 33 before the real and
 * imaginary parts are added; the 32-bit sum is then passed to arm_sqrt_q31().
 */
 void arm_cmplx_mag_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t numSamples)
 {
  q31_t real, imag;                              /* One complex input sample (remainder loop) */
  q31_t acc0, acc1;                              /* Shifted squares of real and imag (remainder loop) */
  uint32_t blkCnt;                               /* loop counter */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Run the below code for Cortex-M4 and Cortex-M3 */
  q31_t real1, real2, imag1, imag2;              /* Two complex input samples at a time */
  q31_t out1, out2, out3, out4;                  /* Shifted squares; out1 and out3 then hold the sums */
  q63_t mul1, mul2, mul3, mul4;                  /* 64-bit squares of the input values */
  /* Loop unrolling: number of groups of 4 complex samples */
  blkCnt = numSamples >> 2u;
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples.
   ** NOTE(review): loads, multiplies and square-root calls are interleaved below,
   ** presumably for instruction scheduling; all eight inputs of a group are read
   ** before the first output is written. Keep this order when editing. */
  while(blkCnt > 0u)
  {
    /* read the first two complex samples of the group from the source buffer */
    real1 = pSrc[0];
    imag1 = pSrc[1];
    real2 = pSrc[2];
    imag2 = pSrc[3];
    /* square each input value in 64 bits */
    mul1 = (q63_t) real1 *real1;
    mul2 = (q63_t) imag1 *imag1;
    mul3 = (q63_t) real2 *real2;
    mul4 = (q63_t) imag2 *imag2;
    /* get the result to 3.29 format */
    out1 = (q31_t) (mul1 >> 33);
    out2 = (q31_t) (mul2 >> 33);
    out3 = (q31_t) (mul3 >> 33);
    out4 = (q31_t) (mul4 >> 33);
    /* add the squared real and imaginary parts of each sample */
    out1 = out1 + out2;
    out3 = out3 + out4;
    /* read the last two complex samples of the group; the registers are reused */
    real1 = pSrc[4];
    imag1 = pSrc[5];
    real2 = pSrc[6];
    imag2 = pSrc[7];
    /* square root of the first sample's sum -> first output */
    arm_sqrt_q31(out1, &pDst[0]);
    /* square the third sample's real part */
    mul1 = (q63_t) real1 *real1;
    /* square root of the second sample's sum -> second output */
    arm_sqrt_q31(out3, &pDst[1]);
    /* square the remaining input values */
    mul2 = (q63_t) imag1 *imag1;
    mul3 = (q63_t) real2 *real2;
    mul4 = (q63_t) imag2 *imag2;
    /* get the result to 3.29 format */
    out1 = (q31_t) (mul1 >> 33);
    out2 = (q31_t) (mul2 >> 33);
    out3 = (q31_t) (mul3 >> 33);
    out4 = (q31_t) (mul4 >> 33);
    /* add the squared real and imaginary parts of each sample */
    out1 = out1 + out2;
    out3 = out3 + out4;
    /* square root of the third sample's sum -> third output */
    arm_sqrt_q31(out1, &pDst[2]);
    /* advance the source pointer by 8 values (4 complex samples) */
    pSrc += 8u;
    /* square root of the fourth sample's sum -> fourth output */
    arm_sqrt_q31(out3, &pDst[3]);
    /* advance the destination pointer by the 4 outputs just written */
    pDst += 4u;
    /* Decrement the loop counter */
    blkCnt--;
  }
  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = numSamples % 0x4u;
 #else
  /* Run the below code for Cortex-M0: every sample goes through the loop below */
  blkCnt = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  /* One complex sample per iteration (shared by both build variants) */
  while(blkCnt > 0u)
  {
    /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
    real = *pSrc++;
    imag = *pSrc++;
    /* square in 64 bits, then shift right by 33 as in the unrolled loop */
    acc0 = (q31_t) (((q63_t) real * real) >> 33);
    acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
    /* store the result in 2.30 format in the destination buffer. */
    arm_sqrt_q31(acc0 + acc1, pDst++);
    /* Decrement the loop counter */
    blkCnt--;
  }
 }
 /**        
 * @} end of cmplx_mag group        
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
@@ -0,0 +1,215 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_mag_squared_f32.c    
 *    
 * Description:	Floating-point complex magnitude squared.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupCmplxMath        
 */
 /**        
 * @defgroup cmplx_mag_squared Complex Magnitude Squared        
 *        
 * Computes the magnitude squared of the elements of a complex data vector.        
 *       
 * The <code>pSrc</code> points to the source data and        
 * <code>pDst</code> points to where the result should be written.
 * <code>numSamples</code> specifies the number of complex samples        
 * in the input array and the data is stored in an interleaved fashion        
 * (real, imag, real, imag, ...).        
 * The input array has a total of <code>2*numSamples</code> values;        
 * the output array has a total of <code>numSamples</code> values.        
 *        
 * The underlying algorithm is:
 *        
 * <pre>        
 * for(n=0; n<numSamples; n++) {        
 *     pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;        
 * }        
 * </pre>        
 *        
 * There are separate functions for floating-point, Q15, and Q31 data types.        
 */
 /**        
 * @addtogroup cmplx_mag_squared        
 * @{        
 */
 /**
 * @brief  Floating-point complex magnitude squared
 * @param[in]  *pSrc points to the complex input vector; values are read as (real, imag) pairs
 * @param[out]  *pDst points to the real output vector; one value is written per complex sample
 * @param[in]  numSamples number of complex samples in the input vector
 * @return none.
 */
 void arm_cmplx_mag_squared_f32(
  float32_t * pSrc,
  float32_t * pDst,
  uint32_t numSamples)
 {
  float32_t real, imag;                          /* One complex input sample (remainder loop) */
  uint32_t blkCnt;                               /* loop counter */
 #ifndef ARM_MATH_CM0_FAMILY
  float32_t real1, real2, real3, real4;          /* Real parts of the four samples in a group */
  float32_t imag1, imag2, imag3, imag4;          /* Imaginary parts of the four samples in a group */
  float32_t mul1, mul2, mul3, mul4;              /* Squares of the parts of samples 1 and 2 */
  float32_t mul5, mul6, mul7, mul8;              /* Squares of the parts of samples 3 and 4 */
  float32_t out1, out2, out3, out4;              /* Temporary variables to hold output values */
  /* Loop unrolling: number of groups of 4 complex samples */
  blkCnt = numSamples >> 2u;
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples.
   ** NOTE(review): loads, multiplies, adds and stores are interleaved below,
   ** presumably for instruction scheduling; all eight inputs of a group are read
   ** before the first output is stored. Keep this order when editing. */
  while(blkCnt > 0u)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
    /* read real part of sample 1 */
    real1 = pSrc[0];
    /* read imaginary part of sample 1 */
    imag1 = pSrc[1];
    /* square the real part of sample 1 */
    mul1 = real1 * real1;
    /* read real part of sample 2 */
    real2 = pSrc[2];
    /* square the imaginary part of sample 1 */
    mul2 = imag1 * imag1;
    /* read imaginary part of sample 2 */
    imag2 = pSrc[3];
    /* square the real part of sample 2 */
    mul3 = real2 * real2;
    /* read real part of sample 3 */
    real3 = pSrc[4];
    /* square the imaginary part of sample 2 */
    mul4 = imag2 * imag2;
    /* read imaginary part of sample 3 */
    imag3 = pSrc[5];
    /* square the real part of sample 3 */
    mul5 = real3 * real3;
    /* square the imaginary part of sample 3 */
    mul6 = imag3 * imag3;
    /* read real part of sample 4 */
    real4 = pSrc[6];
    /* sample 1: add the squared real and imaginary parts */
    out1 = mul1 + mul2;
    /* read imaginary part of sample 4 */
    imag4 = pSrc[7];
    /* sample 2: add the squared real and imaginary parts */
    out2 = mul3 + mul4;
    /* square the real part of sample 4 */
    mul7 = real4 * real4;
    /* square the imaginary part of sample 4 */
    mul8 = imag4 * imag4;
    /* store output 1 to destination */
    pDst[0] = out1;
    /* sample 3: add the squared real and imaginary parts */
    out3 = mul5 + mul6;
    /* store output 2 to destination */
    pDst[1] = out2;
    /* sample 4: add the squared real and imaginary parts */
    out4 = mul7 + mul8;
    /* store output 3 to destination */
    pDst[2] = out3;
    /* advance the source pointer by 8 values (4 complex samples) */
    pSrc += 8u;
    /* store output 4 to destination */
    pDst[3] = out4;
    /* advance the destination pointer by the 4 outputs just written */
    pDst += 4u;
    /* Decrement the loop counter */
    blkCnt--;
  }
  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = numSamples % 0x4u;
 #else
  /* Run the below code for Cortex-M0: every sample goes through the loop below */
  blkCnt = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  /* One complex sample per iteration (shared by both build variants) */
  while(blkCnt > 0u)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
    real = *pSrc++;
    imag = *pSrc++;
    /* out = (real * real) + (imag * imag) */
    /* store the result in the destination buffer. */
    *pDst++ = (real * real) + (imag * imag);
    /* Decrement the loop counter */
    blkCnt--;
  }
 }
 /**        
 * @} end of cmplx_mag_squared group        
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
@@ -0,0 +1,148 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_mag_squared_q15.c    
 *    
 * Description:	Q15 complex magnitude squared.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @addtogroup cmplx_mag_squared    
 * @{    
 */
 /**
 * @brief  Q15 complex magnitude squared
 * @param  *pSrc points to the complex input vector, stored as interleaved (real, imag) values
 * @param  *pDst points to the real output vector
 * @param  numSamples number of complex samples in the input vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
 * \par
 * real^2 + imag^2 is never negative and is at most 2^31, a value reached only when
 * both parts are -32768. On the Cortex-M4/Cortex-M3 path the 32-bit sum is therefore
 * shifted as an unsigned value, so that this largest sum is stored as +16384 rather
 * than as a negative number. The result then agrees with the Cortex-M0 path, which
 * forms the sum in 64 bits before shifting.
 */
 void arm_cmplx_mag_squared_q15(
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t numSamples)
 {
 #ifndef ARM_MATH_CM0_FAMILY
  /* Run the below code for Cortex-M4 and Cortex-M3 */
  uint32_t blkCnt;                               /* loop counter */
  q31_t in1, in2, in3, in4;                      /* packed (real, imag) input samples */
  uint32_t acc0, acc1, acc2, acc3;               /* real^2 + imag^2, kept unsigned (never negative) */
  /* loop unrolling: number of groups of 4 complex samples */
  blkCnt = numSamples >> 2u;
  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** a second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* read four packed complex samples; each access also advances pSrc */
    in1 = *__SIMD32(pSrc)++;
    in2 = *__SIMD32(pSrc)++;
    in3 = *__SIMD32(pSrc)++;
    in4 = *__SIMD32(pSrc)++;
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
    acc0 = (uint32_t) __SMUAD(in1, in1);
    acc1 = (uint32_t) __SMUAD(in2, in2);
    acc2 = (uint32_t) __SMUAD(in3, in3);
    acc3 = (uint32_t) __SMUAD(in4, in4);
    /* store the result in 3.13 format in the destination buffer.
     ** The unsigned shift leaves at most 16384, which fits in q15_t. */
    *pDst++ = (q15_t) (acc0 >> 17);
    *pDst++ = (q15_t) (acc1 >> 17);
    *pDst++ = (q15_t) (acc2 >> 17);
    *pDst++ = (q15_t) (acc3 >> 17);
    /* Decrement the loop counter */
    blkCnt--;
  }
  /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = numSamples % 0x4u;
  while(blkCnt > 0u)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
    in1 = *__SIMD32(pSrc)++;
    acc0 = (uint32_t) __SMUAD(in1, in1);
    /* store the result in 3.13 format in the destination buffer. */
    *pDst++ = (q15_t) (acc0 >> 17);
    /* Decrement the loop counter */
    blkCnt--;
  }
 #else
  /* Run the below code for Cortex-M0 */
  q15_t real, imag;                              /* Temporary variables to store real and imaginary values */
  q31_t acc0, acc1;                              /* Squares of the real and imaginary parts */
  while(numSamples > 0u)
  {
    /* out = ((real * real) + (imag * imag)) */
    real = *pSrc++;
    imag = *pSrc++;
    acc0 = (real * real);
    acc1 = (imag * imag);
    /* add in 64 bits so the largest sum (2^31) is not lost, then store the
     ** result in 3.13 format in the destination buffer. */
    *pDst++ = (q15_t) (((q63_t) acc0 + acc1) >> 17);
    /* Decrement the loop counter */
    numSamples--;
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of cmplx_mag_squared group    
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
@@ -0,0 +1,161 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cmplx_mag_squared_q31.c    
 *    
 * Description:	Q31 complex magnitude squared.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * ---------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @addtogroup cmplx_mag_squared    
 * @{    
 */
 /**
 * @brief  Q31 complex magnitude squared
 * @param  *pSrc points to the complex input vector; values are read as (real, imag) pairs
 * @param  *pDst points to the real output vector; one value is written per complex sample
 * @param  numSamples number of complex samples in the input vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
 * Input down scaling is not required.
 */
 void arm_cmplx_mag_squared_q31(
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t numSamples)
 {
  q31_t re, im;                                  /* One complex input sample */
  q31_t sqRe, sqIm;                              /* Squares of re and im after the shift by 33 */
  uint32_t remaining;                            /* Samples left for the one-at-a-time loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 and Cortex-M3: process groups of 4 complex samples first */
  uint32_t groups;                               /* Number of 4-sample groups still to do */
  for(groups = numSamples >> 2u; groups > 0u; groups--)
  {
    /* C[0] = (A[0] * A[0] + A[1] * A[1]), result stored in 3.29 format */
    re = pSrc[0];
    im = pSrc[1];
    sqRe = (q31_t) (((q63_t) re * re) >> 33);
    sqIm = (q31_t) (((q63_t) im * im) >> 33);
    pDst[0] = sqRe + sqIm;
    /* second sample of the group */
    re = pSrc[2];
    im = pSrc[3];
    sqRe = (q31_t) (((q63_t) re * re) >> 33);
    sqIm = (q31_t) (((q63_t) im * im) >> 33);
    pDst[1] = sqRe + sqIm;
    /* third sample of the group */
    re = pSrc[4];
    im = pSrc[5];
    sqRe = (q31_t) (((q63_t) re * re) >> 33);
    sqIm = (q31_t) (((q63_t) im * im) >> 33);
    pDst[2] = sqRe + sqIm;
    /* fourth sample of the group */
    re = pSrc[6];
    im = pSrc[7];
    sqRe = (q31_t) (((q63_t) re * re) >> 33);
    sqIm = (q31_t) (((q63_t) im * im) >> 33);
    pDst[3] = sqRe + sqIm;
    /* step past the 8 input values read and the 4 outputs written */
    pSrc += 8u;
    pDst += 4u;
  }
  /* 0 to 3 samples are left when numSamples is not a multiple of 4 */
  remaining = numSamples % 0x4u;
 #else
  /* Cortex-M0: no unrolling, every sample goes through the loop below */
  remaining = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  while(remaining > 0u)
  {
    /* out = ((real * real) + (imag * imag)) */
    re = *pSrc++;
    im = *pSrc++;
    sqRe = (q31_t) (((q63_t) re * re) >> 33);
    sqIm = (q31_t) (((q63_t) im * im) >> 33);
    /* store the result in 3.29 format in the destination buffer. */
    *pDst++ = sqRe + sqIm;
    remaining--;
  }
 }
 /**    
 * @} end of cmplx_mag_squared group    
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
@@ -0,0 +1,207 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_cmplx_mult_cmplx_f32.c    
 *    
 * Description:	Floating-point complex-by-complex multiplication    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupCmplxMath        
 */
 /**        
 * @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication        
 *        
 * Multiplies a complex vector by another complex vector and generates a complex result.        
 * The data in the complex arrays is stored in an interleaved fashion        
 * (real, imag, real, imag, ...).        
 * The parameter <code>numSamples</code> represents the number of complex        
 * samples processed.  The complex arrays have a total of <code>2*numSamples</code>        
 * real values.        
 *        
 * The following algorithm is used:
 *        
 * <pre>        
 * for(n=0; n<numSamples; n++) {        
 *     pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];        
 *     pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];        
 * }        
 * </pre>        
 *        
 * There are separate functions for floating-point, Q15, and Q31 data types.        
 */
 /**        
 * @addtogroup CmplxByCmplxMult        
 * @{        
 */
 /**
 * @brief  Floating-point complex-by-complex multiplication
 * @param[in]  *pSrcA points to the first input vector
 * @param[in]  *pSrcB points to the second input vector
 * @param[out]  *pDst  points to the output vector
 * @param[in]  numSamples number of complex samples in each vector
 * @return none.
 *
 * For every complex sample n the function writes
 *   pDst[2n]     = pSrcA[2n] * pSrcB[2n]     - pSrcA[2n + 1] * pSrcB[2n + 1]
 *   pDst[2n + 1] = pSrcA[2n] * pSrcB[2n + 1] + pSrcA[2n + 1] * pSrcB[2n]
 */
 void arm_cmplx_mult_cmplx_f32(
  float32_t * pSrcA,
  float32_t * pSrcB,
  float32_t * pDst,
  uint32_t numSamples)
 {
  float32_t aRe0, aIm0, bRe0, bIm0;              /* Operands of the first sample slot (also used by the tail loop) */
  uint32_t remaining;                            /* Iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3: handle four complex samples per iteration */
  float32_t aRe1, aIm1, bRe1, bIm1;              /* Operands of the second sample slot */
  float32_t re0, im0, re1, im1;                  /* Real / imaginary results of the two slots */
  for(remaining = numSamples >> 2u; remaining > 0u; remaining--)
  {
    /* Fetch complex samples 0 and 1 of this group from both inputs */
    aRe0 = pSrcA[0];
    bRe0 = pSrcB[0];
    aIm0 = pSrcA[1];
    aRe1 = pSrcA[2];
    bIm0 = pSrcB[1];
    bRe1 = pSrcB[2];
    bIm1 = pSrcB[3];
    aIm1 = pSrcA[3];
    /* Sample 0: real = aRe*bRe - aIm*bIm, imag = aIm*bRe + aRe*bIm */
    re0 = aRe0 * bRe0;
    re0 -= aIm0 * bIm0;
    im0 = aIm0 * bRe0;
    im0 += aRe0 * bIm0;
    /* Sample 1: real = aRe*bRe - aIm*bIm, imag = aRe*bIm + aIm*bRe */
    re1 = aRe1 * bRe1;
    re1 -= aIm1 * bIm1;
    im1 = aRe1 * bIm1;
    im1 += aIm1 * bRe1;
    /* Store samples 0 and 1 while fetching sample 2; the relative order of
     ** these loads and stores is kept as it was. */
    aRe0 = pSrcA[4];
    bRe0 = pSrcB[4];
    pDst[0] = re0;
    aIm0 = pSrcA[5];
    pDst[1] = im0;
    bIm0 = pSrcB[5];
    pDst[2] = re1;
    pDst[3] = im1;
    /* Fetch sample 3 */
    aRe1 = pSrcA[6];
    bRe1 = pSrcB[6];
    aIm1 = pSrcA[7];
    bIm1 = pSrcB[7];
    /* Sample 2 */
    re0 = aRe0 * bRe0;
    re0 -= aIm0 * bIm0;
    im0 = aIm0 * bRe0;
    im0 += aRe0 * bIm0;
    /* Sample 3 */
    re1 = aRe1 * bRe1;
    re1 -= aIm1 * bIm1;
    im1 = aIm1 * bRe1;
    im1 += aRe1 * bIm1;
    /* Store samples 2 and 3 */
    pDst[4] = re0;
    pDst[5] = im0;
    pDst[6] = re1;
    pDst[7] = im1;
    /* Advance all three buffers past the four complex samples just handled */
    pSrcA += 8u;
    pSrcB += 8u;
    pDst += 8u;
  }
  /* 0 to 3 samples are left over when numSamples is not a multiple of 4 */
  remaining = numSamples % 0x4u;
 #else
  /* Cortex-M0: every sample goes through the one-at-a-time loop below */
  remaining = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; remaining > 0u; remaining--)
  {
    /* Read one complex sample from each input */
    aRe0 = pSrcA[0];
    aIm0 = pSrcA[1];
    pSrcA += 2u;
    bRe0 = pSrcB[0];
    bIm0 = pSrcB[1];
    pSrcB += 2u;
    /* Real part first, then imaginary part */
    pDst[0] = (aRe0 * bRe0) - (aIm0 * bIm0);
    pDst[1] = (aRe0 * bIm0) + (aIm0 * bRe0);
    pDst += 2u;
  }
 }
 /**        
 * @} end of CmplxByCmplxMult group        
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
@@ -0,0 +1,193 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_cmplx_mult_cmplx_q15.c    
 *    
 * Description:	Q15 complex-by-complex multiplication    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @addtogroup CmplxByCmplxMult    
 * @{    
 */
 /**
 * @brief  Q15 complex-by-complex multiplication
 * @param[in]  *pSrcA points to the first input vector
 * @param[in]  *pSrcB points to the second input vector
 * @param[out]  *pDst  points to the output vector
 * @param[in]  numSamples number of complex samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
 * Each 32-bit product is shifted right by 17 bits before the two products of
 * an output value are combined and narrowed to q15_t.
 */
 void arm_cmplx_mult_cmplx_q15(
  q15_t * pSrcA,
  q15_t * pSrcB,
  q15_t * pDst,
  uint32_t numSamples)
 {
  q15_t reA, imA, reB, imB;                      /* Real / imaginary parts of the current A and B samples */
  uint32_t remaining;                            /* Iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3: handle four complex samples per iteration */
  for(remaining = numSamples >> 2u; remaining > 0u; remaining--)
  {
    /* Sample 0: C = (reA*reB - imA*imB) + j(reA*imB + imA*reB) */
    reA = pSrcA[0];
    imA = pSrcA[1];
    reB = pSrcB[0];
    imB = pSrcB[1];
    pDst[0] = (q15_t) ((((q31_t) reA * reB) >> 17) - (((q31_t) imA * imB) >> 17));
    pDst[1] = (q15_t) ((((q31_t) reA * imB) >> 17) + (((q31_t) imA * reB) >> 17));
    /* Sample 1 */
    reA = pSrcA[2];
    imA = pSrcA[3];
    reB = pSrcB[2];
    imB = pSrcB[3];
    pDst[2] = (q15_t) ((((q31_t) reA * reB) >> 17) - (((q31_t) imA * imB) >> 17));
    pDst[3] = (q15_t) ((((q31_t) reA * imB) >> 17) + (((q31_t) imA * reB) >> 17));
    /* Sample 2 */
    reA = pSrcA[4];
    imA = pSrcA[5];
    reB = pSrcB[4];
    imB = pSrcB[5];
    pDst[4] = (q15_t) ((((q31_t) reA * reB) >> 17) - (((q31_t) imA * imB) >> 17));
    pDst[5] = (q15_t) ((((q31_t) reA * imB) >> 17) + (((q31_t) imA * reB) >> 17));
    /* Sample 3 */
    reA = pSrcA[6];
    imA = pSrcA[7];
    reB = pSrcB[6];
    imB = pSrcB[7];
    pDst[6] = (q15_t) ((((q31_t) reA * reB) >> 17) - (((q31_t) imA * imB) >> 17));
    pDst[7] = (q15_t) ((((q31_t) reA * imB) >> 17) + (((q31_t) imA * reB) >> 17));
    /* Advance all three buffers past the four complex samples just handled */
    pSrcA += 8u;
    pSrcB += 8u;
    pDst += 8u;
  }
  /* 0 to 3 samples are left over when numSamples is not a multiple of 4 */
  remaining = numSamples % 0x4u;
 #else
  /* Cortex-M0: every sample goes through the one-at-a-time loop below */
  remaining = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; remaining > 0u; remaining--)
  {
    /* Read one complex sample from each input */
    reA = pSrcA[0];
    imA = pSrcA[1];
    pSrcA += 2u;
    reB = pSrcB[0];
    imB = pSrcB[1];
    pSrcB += 2u;
    /* Real part first, then imaginary part, both in 3.13 format */
    pDst[0] = (q15_t) ((((q31_t) reA * reB) >> 17) - (((q31_t) imA * imB) >> 17));
    pDst[1] = (q15_t) ((((q31_t) reA * imB) >> 17) + (((q31_t) imA * reB) >> 17));
    pDst += 2u;
  }
 }
 /**    
 * @} end of CmplxByCmplxMult group    
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
@@ -0,0 +1,326 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_cmplx_mult_cmplx_q31.c    
 *    
 * Description:	Q31 complex-by-complex multiplication    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @addtogroup CmplxByCmplxMult    
 * @{    
 */
 /**
 * @brief  Q31 complex-by-complex multiplication
 * @param[in]  *pSrcA points to the first input vector
 * @param[in]  *pSrcB points to the second input vector
 * @param[out]  *pDst  points to the output vector
 * @param[in]  numSamples number of complex samples in each vector
 * @return none.
 *
 * <b>Scaling and Overflow Behavior:</b>
 * \par
 * The function implements 1.31 by 1.31 multiplications and finally output is converted into 3.29 format.
 * Input down scaling is not required.
 * Each 64-bit product is shifted right by 32 bits, narrowed to q31_t and
 * shifted right by one more bit before the products are combined.
 */
 void arm_cmplx_mult_cmplx_q31(
  q31_t * pSrcA,
  q31_t * pSrcB,
  q31_t * pDst,
  uint32_t numSamples)
 {
  q31_t reA, imA, reB, imB;                      /* Real / imaginary parts of the current A and B samples */
  q31_t prodRR, prodII, prodRI, prodIR;          /* Scaled partial products (re*re, im*im, re*im, im*re) */
  uint32_t remaining;                            /* Iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3: handle four complex samples per iteration */
  for(remaining = numSamples >> 2u; remaining > 0u; remaining--)
  {
    /* Sample 0: C = (reA*reB - imA*imB) + j(reA*imB + imA*reB) */
    reA = pSrcA[0];
    imA = pSrcA[1];
    reB = pSrcB[0];
    imB = pSrcB[1];
    prodRR = ((q31_t) (((q63_t) reA * reB) >> 32)) >> 1;
    prodII = ((q31_t) (((q63_t) imA * imB) >> 32)) >> 1;
    prodRI = ((q31_t) (((q63_t) reA * imB) >> 32)) >> 1;
    prodIR = ((q31_t) (((q63_t) imA * reB) >> 32)) >> 1;
    pDst[0] = prodRR - prodII;
    pDst[1] = prodRI + prodIR;
    /* Sample 1 */
    reA = pSrcA[2];
    imA = pSrcA[3];
    reB = pSrcB[2];
    imB = pSrcB[3];
    prodRR = ((q31_t) (((q63_t) reA * reB) >> 32)) >> 1;
    prodII = ((q31_t) (((q63_t) imA * imB) >> 32)) >> 1;
    prodRI = ((q31_t) (((q63_t) reA * imB) >> 32)) >> 1;
    prodIR = ((q31_t) (((q63_t) imA * reB) >> 32)) >> 1;
    pDst[2] = prodRR - prodII;
    pDst[3] = prodRI + prodIR;
    /* Sample 2 */
    reA = pSrcA[4];
    imA = pSrcA[5];
    reB = pSrcB[4];
    imB = pSrcB[5];
    prodRR = ((q31_t) (((q63_t) reA * reB) >> 32)) >> 1;
    prodII = ((q31_t) (((q63_t) imA * imB) >> 32)) >> 1;
    prodRI = ((q31_t) (((q63_t) reA * imB) >> 32)) >> 1;
    prodIR = ((q31_t) (((q63_t) imA * reB) >> 32)) >> 1;
    pDst[4] = prodRR - prodII;
    pDst[5] = prodRI + prodIR;
    /* Sample 3 */
    reA = pSrcA[6];
    imA = pSrcA[7];
    reB = pSrcB[6];
    imB = pSrcB[7];
    prodRR = ((q31_t) (((q63_t) reA * reB) >> 32)) >> 1;
    prodII = ((q31_t) (((q63_t) imA * imB) >> 32)) >> 1;
    prodRI = ((q31_t) (((q63_t) reA * imB) >> 32)) >> 1;
    prodIR = ((q31_t) (((q63_t) imA * reB) >> 32)) >> 1;
    pDst[6] = prodRR - prodII;
    pDst[7] = prodRI + prodIR;
    /* Advance all three buffers past the four complex samples just handled */
    pSrcA += 8u;
    pSrcB += 8u;
    pDst += 8u;
  }
  /* 0 to 3 samples are left over when numSamples is not a multiple of 4 */
  remaining = numSamples % 0x4u;
 #else
  /* Cortex-M0: handle two complex samples per iteration */
  for(remaining = numSamples >> 1u; remaining > 0u; remaining--)
  {
    /* Sample 0: C = (reA*reB - imA*imB) + j(reA*imB + imA*reB) */
    reA = pSrcA[0];
    imA = pSrcA[1];
    reB = pSrcB[0];
    imB = pSrcB[1];
    prodRR = ((q31_t) (((q63_t) reA * reB) >> 32)) >> 1;
    prodII = ((q31_t) (((q63_t) imA * imB) >> 32)) >> 1;
    prodRI = ((q31_t) (((q63_t) reA * imB) >> 32)) >> 1;
    prodIR = ((q31_t) (((q63_t) imA * reB) >> 32)) >> 1;
    pDst[0] = prodRR - prodII;
    pDst[1] = prodRI + prodIR;
    /* Sample 1 */
    reA = pSrcA[2];
    imA = pSrcA[3];
    reB = pSrcB[2];
    imB = pSrcB[3];
    prodRR = ((q31_t) (((q63_t) reA * reB) >> 32)) >> 1;
    prodII = ((q31_t) (((q63_t) imA * imB) >> 32)) >> 1;
    prodRI = ((q31_t) (((q63_t) reA * imB) >> 32)) >> 1;
    prodIR = ((q31_t) (((q63_t) imA * reB) >> 32)) >> 1;
    pDst[2] = prodRR - prodII;
    pDst[3] = prodRI + prodIR;
    /* Advance all three buffers past the two complex samples just handled */
    pSrcA += 4u;
    pSrcB += 4u;
    pDst += 4u;
  }
  /* At most one sample is left over when numSamples is odd */
  remaining = numSamples % 0x2u;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; remaining > 0u; remaining--)
  {
    /* Read one complex sample from each input */
    reA = pSrcA[0];
    imA = pSrcA[1];
    pSrcA += 2u;
    reB = pSrcB[0];
    imB = pSrcB[1];
    pSrcB += 2u;
    prodRR = ((q31_t) (((q63_t) reA * reB) >> 32)) >> 1;
    prodII = ((q31_t) (((q63_t) imA * imB) >> 32)) >> 1;
    prodRI = ((q31_t) (((q63_t) reA * imB) >> 32)) >> 1;
    prodIR = ((q31_t) (((q63_t) imA * reB) >> 32)) >> 1;
    /* Real part first, then imaginary part, both in 3.29 format */
    pDst[0] = prodRR - prodII;
    pDst[1] = prodRI + prodIR;
    pDst += 2u;
  }
 }
 /**    
 * @} end of CmplxByCmplxMult group    
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
@@ -0,0 +1,225 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_cmplx_mult_real_f32.c    
 *    
 * Description:	Floating-point complex by real multiplication    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**        
 * @ingroup groupCmplxMath        
 */
 /**        
 * @defgroup CmplxByRealMult Complex-by-Real Multiplication        
 *        
 * Multiplies a complex vector by a real vector and generates a complex result.        
 * The data in the complex arrays is stored in an interleaved fashion        
 * (real, imag, real, imag, ...).        
 * The parameter <code>numSamples</code> represents the number of complex        
 * samples processed.  The complex arrays have a total of <code>2*numSamples</code>        
 * real values while the real array has a total of <code>numSamples</code>        
 * real values.        
 *        
 * The following algorithm is used:
 *        
 * <pre>        
 * for(n=0; n<numSamples; n++) {        
 *     pCmplxDst[(2*n)+0] = pSrcCmplx[(2*n)+0] * pSrcReal[n];        
 *     pCmplxDst[(2*n)+1] = pSrcCmplx[(2*n)+1] * pSrcReal[n];        
 * }        
 * </pre>        
 *        
 * There are separate functions for floating-point, Q15, and Q31 data types.        
 */
 /**        
 * @addtogroup CmplxByRealMult        
 * @{        
 */
 /**
 * @brief  Floating-point complex-by-real multiplication
 * @param[in]  *pSrcCmplx points to the complex input vector
 * @param[in]  *pSrcReal points to the real input vector
 * @param[out]  *pCmplxDst points to the complex output vector
 * @param[in]  numSamples number of samples in each vector
 * @return none.
 *
 * For every sample n the function writes
 *   pCmplxDst[2n]     = pSrcCmplx[2n]     * pSrcReal[n]
 *   pCmplxDst[2n + 1] = pSrcCmplx[2n + 1] * pSrcReal[n]
 */
 void arm_cmplx_mult_real_f32(
  float32_t * pSrcCmplx,
  float32_t * pSrcReal,
  float32_t * pCmplxDst,
  uint32_t numSamples)
 {
  float32_t scale;                               /* Real factor used by the one-at-a-time loop */
  uint32_t remaining;                            /* Iterations left in the current loop */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3: handle four complex samples per iteration */
  float32_t re0, im0, re1, im1;                  /* Complex inputs, samples 0 and 1 */
  float32_t re2, im2, re3, im3;                  /* Complex inputs, samples 2 and 3 */
  float32_t gain0, gain1, gain2, gain3;          /* Real inputs, samples 0 to 3 */
  for(remaining = numSamples >> 2u; remaining > 0u; remaining--)
  {
    /* Read four complex samples and their four real factors before any store */
    re0 = pSrcCmplx[0];
    im0 = pSrcCmplx[1];
    gain0 = pSrcReal[0];
    re1 = pSrcCmplx[2];
    im1 = pSrcCmplx[3];
    gain1 = pSrcReal[1];
    re2 = pSrcCmplx[4];
    im2 = pSrcCmplx[5];
    gain2 = pSrcReal[2];
    re3 = pSrcCmplx[6];
    im3 = pSrcCmplx[7];
    gain3 = pSrcReal[3];
    /* Scale the real and imaginary part of each sample by its real factor */
    pCmplxDst[0] = re0 * gain0;
    pCmplxDst[1] = im0 * gain0;
    pCmplxDst[2] = re1 * gain1;
    pCmplxDst[3] = im1 * gain1;
    pCmplxDst[4] = re2 * gain2;
    pCmplxDst[5] = im2 * gain2;
    pCmplxDst[6] = re3 * gain3;
    pCmplxDst[7] = im3 * gain3;
    /* Complex buffers advance by 8 values, the real buffer by 4 */
    pSrcCmplx += 8u;
    pSrcReal += 4u;
    pCmplxDst += 8u;
  }
  /* 0 to 3 samples are left over when numSamples is not a multiple of 4 */
  remaining = numSamples % 0x4u;
 #else
  /* Cortex-M0: every sample goes through the one-at-a-time loop below */
  remaining = numSamples;
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  for(; remaining > 0u; remaining--)
  {
    /* One real factor scales both parts of one complex sample */
    scale = *pSrcReal++;
    pCmplxDst[0] = pSrcCmplx[0] * scale;
    pCmplxDst[1] = pSrcCmplx[1] * scale;
    pSrcCmplx += 2u;
    pCmplxDst += 2u;
  }
 }
 /**        
 * @} end of CmplxByRealMult group        
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
@@ -0,0 +1,203 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_cmplx_mult_real_q15.c    
 *    
 * Description:	Q15 complex by real multiplication    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @addtogroup CmplxByRealMult    
 * @{    
 */
 /**    
 * @brief  Q15 complex-by-real multiplication    
 * @param[in]  *pSrcCmplx points to the complex input vector    
 * @param[in]  *pSrcReal points to the real input vector    
 * @param[out]  *pCmplxDst points to the complex output vector    
 * @param[in]  numSamples number of samples in each vector    
 * @return none.    
 *    
 * <b>Scaling and Overflow Behavior:</b>    
 * \par    
 * The function uses saturating arithmetic.    
 * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.    
 */
 void arm_cmplx_mult_real_q15(
  q15_t * pSrcCmplx,
  q15_t * pSrcReal,
  q15_t * pCmplxDst,
  uint32_t numSamples)
 {
  /* Multiplies each complex sample (interleaved real, imag) of pSrcCmplx by the
   * matching real sample of pSrcReal and writes the interleaved result to pCmplxDst.
   * Every product is shifted right by 15 bits and saturated to 16 bits. */
  q15_t in;                                      /* Real-vector sample used by the one-sample-per-iteration loops */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 path: unrolled main loop followed by a scalar tail loop */
  uint32_t blkCnt;                               /* Loop counter */
  q31_t inA1, inA2;                              /* Words read from the complex input through __SIMD32 */
  q31_t inB1;                                    /* Word read from the real input through __SIMD32 */
  q15_t out1, out2, out3, out4;                  /* Scaled and saturated results */
  q31_t mul1, mul2, mul3, mul4;                  /* Full-width products before scaling */
  /* Number of unrolled iterations: numSamples / 4 */
  blkCnt = numSamples >> 2u;
  /* Unrolled part: each iteration handles 4 complex samples as two identical halves.
   ** The tail loop further down handles the remaining 0 to 3 samples. */
  /* NOTE(review): __SIMD32 is defined outside this file; it presumably reinterprets the
   ** q15_t pointer as a 32-bit pointer so that two q15 values are read or written at once
   ** and the pointer advances by two elements - confirm against arm_math.h. */
  while(blkCnt > 0u)
  {
    /* C[2 * i] = A[2 * i] * B[i].            */
    /* C[2 * i + 1] = A[2 * i + 1] * B[i].        */
    /* first complex sample (real and imaginary part) */
    inA1 = *__SIMD32(pSrcCmplx)++;
    /* two consecutive real samples */
    inB1 = *__SIMD32(pSrcReal)++;
    /* second complex sample (real and imaginary part) */
    inA2 = *__SIMD32(pSrcCmplx)++;
    /* Multiply both halves of inA1 by one half of inB1 and both halves of inA2 by the
     * other half; which half of inB1 pairs with which word depends on endianness */
 #ifndef ARM_MATH_BIG_ENDIAN
    mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
    mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
    mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
    mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
 #else
    mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
    mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
    mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
    mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
    /* drop 15 fractional bits and saturate each product to 16 bits */
    out1 = (q15_t) __SSAT(mul1 >> 15u, 16);
    out2 = (q15_t) __SSAT(mul2 >> 15u, 16);
    out3 = (q15_t) __SSAT(mul3 >> 15u, 16);
    out4 = (q15_t) __SSAT(mul4 >> 15u, 16);
    /* pack each result pair into one word and store it to the destination */
    *__SIMD32(pCmplxDst)++ = __PKHBT(out1, out2, 16);
    *__SIMD32(pCmplxDst)++ = __PKHBT(out3, out4, 16);
    /* second half of the unrolled iteration: same sequence for the next two complex samples */
    inA1 = *__SIMD32(pSrcCmplx)++;
    inB1 = *__SIMD32(pSrcReal)++;
    inA2 = *__SIMD32(pSrcCmplx)++;
 #ifndef ARM_MATH_BIG_ENDIAN
    mul1 = (q31_t) ((q15_t) (inA1) * (q15_t) (inB1));
    mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1));
    mul3 = (q31_t) ((q15_t) (inA2) * (q15_t) (inB1 >> 16));
    mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB1 >> 16));
 #else
    mul2 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
    mul1 = (q31_t) ((q15_t) inA1 * (q15_t) (inB1 >> 16));
    mul4 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) inB1);
    mul3 = (q31_t) ((q15_t) inA2 * (q15_t) inB1);
 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
    /* scale, saturate, pack and store as above */
    out1 = (q15_t) __SSAT(mul1 >> 15u, 16);
    out2 = (q15_t) __SSAT(mul2 >> 15u, 16);
    out3 = (q15_t) __SSAT(mul3 >> 15u, 16);
    out4 = (q15_t) __SSAT(mul4 >> 15u, 16);
    *__SIMD32(pCmplxDst)++ = __PKHBT(out1, out2, 16);
    *__SIMD32(pCmplxDst)++ = __PKHBT(out3, out4, 16);
    /* one block of 4 complex samples done */
    blkCnt--;
  }
  /* Tail: if numSamples is not a multiple of 4, process the remaining samples
   ** one at a time without unrolling. */
  blkCnt = numSamples % 0x4u;
  while(blkCnt > 0u)
  {
    /* C[2 * i] = A[2 * i] * B[i].            */
    /* C[2 * i + 1] = A[2 * i + 1] * B[i].        */
    in = *pSrcReal++;
    /* real part, then imaginary part: multiply, drop 15 bits, saturate to 16 bits */
    *pCmplxDst++ =
      (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
    *pCmplxDst++ =
      (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
    /* one complex sample done */
    blkCnt--;
  }
 #else
  /* Cortex-M0 path: plain scalar loop over all samples; numSamples itself is the counter */
  while(numSamples > 0u)
  {
    /* realOut = realA * realB.            */
    /* imagOut = imagA * realB.                */
    in = *pSrcReal++;
    /* real part, then imaginary part: multiply, drop 15 bits, saturate to 16 bits */
    *pCmplxDst++ =
      (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
    *pCmplxDst++ =
      (q15_t) __SSAT((((q31_t) (*pSrcCmplx++) * (in)) >> 15), 16);
    /* one complex sample done */
    numSamples--;
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of CmplxByRealMult group    
 */
--- a/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
+++ b/CMSIS/DSP_Lib/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
@@ -0,0 +1,223 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_cmplx_mult_real_q31.c    
 *    
 * Description:	Q31 complex by real multiplication    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupCmplxMath    
 */
 /**    
 * @addtogroup CmplxByRealMult    
 * @{    
 */
 /**    
 * @brief  Q31 complex-by-real multiplication    
 * @param[in]  *pSrcCmplx points to the complex input vector    
 * @param[in]  *pSrcReal points to the real input vector    
 * @param[out]  *pCmplxDst points to the complex output vector    
 * @param[in]  numSamples number of samples in each vector    
 * @return none.    
 *    
 * <b>Scaling and Overflow Behavior:</b>    
 * \par    
 * The function uses saturating arithmetic.    
 * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.    
 */
 void arm_cmplx_mult_real_q31(
  q31_t * pSrcCmplx,
  q31_t * pSrcReal,
  q31_t * pCmplxDst,
  uint32_t numSamples)
 {
  /* Multiplies each complex sample (interleaved real, imag) of pSrcCmplx by the
   * matching real sample of pSrcReal and writes the interleaved, saturated
   * result to pCmplxDst.
   *
   * Note: the Cortex-M4/M3 path keeps the upper 32 bits of the 64-bit product,
   * saturates to 31 bits and doubles, so bit 0 of its results is always zero.
   * The Cortex-M0 path shifts the product right by 31 and clips, so it keeps
   * that bit. */
  q31_t inA1;                                    /* Input sample (the real-vector sample on the Cortex-M0 path) */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 path: unrolled main loop followed by a scalar tail loop */
  uint32_t blkCnt;                               /* Loop counter */
  q31_t inA2, inA3, inA4;                        /* Complex-vector inputs */
  q31_t inB1, inB2;                              /* Real-vector inputs */
  q31_t out1, out2, out3, out4;                  /* Results */
  /* Number of unrolled iterations: numSamples / 4 */
  blkCnt = numSamples >> 2u;
  /* Unrolled part: each iteration handles 4 complex samples as two identical halves.
   ** The tail loop further down handles the remaining 0 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* C[2 * i] = A[2 * i] * B[i].            */
    /* C[2 * i + 1] = A[2 * i + 1] * B[i].        */
    /* first complex sample (real, imaginary) */
    inA1 = *pSrcCmplx++;
    inA2 = *pSrcCmplx++;
    /* two real samples */
    inB1 = *pSrcReal++;
    inB2 = *pSrcReal++;
    /* second complex sample (real, imaginary) */
    inA3 = *pSrcCmplx++;
    inA4 = *pSrcCmplx++;
    /* 64-bit product, keep the upper 32 bits */
    out1 = ((q63_t) inA1 * inB1) >> 32;
    out2 = ((q63_t) inA2 * inB1) >> 32;
    out3 = ((q63_t) inA3 * inB2) >> 32;
    out4 = ((q63_t) inA4 * inB2) >> 32;
    /* saturate to 31 bits so that the doubling below cannot overflow */
    out1 = __SSAT(out1, 31);
    out2 = __SSAT(out2, 31);
    out3 = __SSAT(out3, 31);
    out4 = __SSAT(out4, 31);
    /* Bring the result to 1.31 format.  A multiplication is used instead of
     * "<< 1" because left-shifting a negative signed value is undefined
     * behaviour in C; the value produced is the same. */
    out1 = out1 * 2;
    out2 = out2 * 2;
    out3 = out3 * 2;
    out4 = out4 * 2;
    /* store the results to the destination buffer */
    *pCmplxDst++ = out1;
    *pCmplxDst++ = out2;
    *pCmplxDst++ = out3;
    *pCmplxDst++ = out4;
    /* second half of the unrolled iteration: same sequence for the next two complex samples */
    inA1 = *pSrcCmplx++;
    inA2 = *pSrcCmplx++;
    inB1 = *pSrcReal++;
    inB2 = *pSrcReal++;
    inA3 = *pSrcCmplx++;
    inA4 = *pSrcCmplx++;
    /* 64-bit product, keep the upper 32 bits */
    out1 = ((q63_t) inA1 * inB1) >> 32;
    out2 = ((q63_t) inA2 * inB1) >> 32;
    out3 = ((q63_t) inA3 * inB2) >> 32;
    out4 = ((q63_t) inA4 * inB2) >> 32;
    /* saturate to 31 bits */
    out1 = __SSAT(out1, 31);
    out2 = __SSAT(out2, 31);
    out3 = __SSAT(out3, 31);
    out4 = __SSAT(out4, 31);
    /* bring the result to 1.31 format (see above for why this is not a shift) */
    out1 = out1 * 2;
    out2 = out2 * 2;
    out3 = out3 * 2;
    out4 = out4 * 2;
    /* store the results to the destination buffer */
    *pCmplxDst++ = out1;
    *pCmplxDst++ = out2;
    *pCmplxDst++ = out3;
    *pCmplxDst++ = out4;
    /* one block of 4 complex samples done */
    blkCnt--;
  }
  /* Tail: if numSamples is not a multiple of 4, process the remaining samples
   ** one at a time without unrolling. */
  blkCnt = numSamples % 0x4u;
  while(blkCnt > 0u)
  {
    /* C[2 * i] = A[2 * i] * B[i].            */
    /* C[2 * i + 1] = A[2 * i + 1] * B[i].        */
    /* one complex sample (real, imaginary) */
    inA1 = *pSrcCmplx++;
    inA2 = *pSrcCmplx++;
    /* one real sample */
    inB1 = *pSrcReal++;
    /* 64-bit product, keep the upper 32 bits */
    out1 = ((q63_t) inA1 * inB1) >> 32;
    out2 = ((q63_t) inA2 * inB1) >> 32;
    /* saturate to 31 bits */
    out1 = __SSAT(out1, 31);
    out2 = __SSAT(out2, 31);
    /* bring the result to 1.31 format without shifting a negative value */
    out1 = out1 * 2;
    out2 = out2 * 2;
    /* store the results to the destination buffer */
    *pCmplxDst++ = out1;
    *pCmplxDst++ = out2;
    /* one complex sample done */
    blkCnt--;
  }
 #else
  /* Cortex-M0 path: plain scalar loop over all samples; numSamples itself is the counter */
  while(numSamples > 0u)
  {
    /* realOut = realA * realB.            */
    /* imagOut = imagA * realB.            */
    inA1 = *pSrcReal++;
    /* real part, then imaginary part: 64-bit product, shift right by 31, clip to 32 bits */
    *pCmplxDst++ =
      (q31_t) clip_q63_to_q31(((q63_t) (*pSrcCmplx++) * inA1) >> 31);
    *pCmplxDst++ =
      (q31_t) clip_q63_to_q31(((q63_t) (*pSrcCmplx++) * inA1) >> 31);
    /* one complex sample done */
    numSamples--;
  }
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of CmplxByRealMult group    
 */
--- a/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_init_f32.c
+++ b/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_init_f32.c
@@ -0,0 +1,87 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_pid_init_f32.c    
 *    
 * Description:	Floating-point PID Control initialization function    
 *				   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * ------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @addtogroup PID    
 * @{    
 */
 /**    
 * @brief  Initialization function for the floating-point PID Control.   
 * @param[in,out] *S points to an instance of the PID structure.   
 * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state & 1 = reset the state.   
 * @return none.   
 * \par Description:   
 * \par    
 * The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n   
 * The function computes the structure fields: <code>A0</code>, <code>A1</code> and <code>A2</code>
 * using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd).
 * The state variables are set to all zeros only when <code>resetStateFlag</code> is non-zero.
 */
 void arm_pid_init_f32(
  arm_pid_instance_f32 * S,
  int32_t resetStateFlag)
 {
  /* The three derived coefficients depend only on the gains Kp, Ki and Kd */
  /* A2 = Kd */
  S->A2 = S->Kd;
  /* A1 = -Kp - 2 * Kd */
  S->A1 = (-S->Kp) - ((float32_t) 2.0 * S->Kd);
  /* A0 = Kp + Ki + Kd */
  S->A0 = S->Kp + S->Ki + S->Kd;
  /* Leave the state untouched unless the caller asked for a reset */
  if(!resetStateFlag)
  {
    return;
  }
  /* Zero the state buffer, which always holds 3 samples */
  memset(&S->state[0], 0, sizeof(float32_t) * 3u);
 }
 /**    
 * @} end of PID group    
 */
--- a/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_init_q15.c
+++ b/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_init_q15.c
@@ -0,0 +1,122 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_pid_init_q15.c    
 *    
 * Description:	Q15 PID Control initialization function    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @addtogroup PID    
 * @{    
 */
 /**    
 * @brief  Initialization function for the Q15 PID Control.
 * @param[in,out] *S points to an instance of the Q15 PID structure.    
 * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.    
 * @return none.    
 * \par Description:   
 * \par    
 * The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n   
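 * The function computes the structure fields <code>A0</code> and <code>A1</code> (and <code>A2</code> on Cortex-M0 builds; otherwise \c Kd is packed into <code>A1</code>)
 * using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd).
 * The state variables are set to all zeros only when <code>resetStateFlag</code> is non-zero.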
 */
 void arm_pid_init_q15(
  arm_pid_instance_q15 * S,
  int32_t resetStateFlag)
 {
 #ifdef ARM_MATH_CM0_FAMILY
  /* Cortex-M0 path: sum in 32 bits, then saturate to 16 bits */
  q31_t acc;                                     /* 32-bit accumulator for the sums */
  /* A0 = sat16(Kp + Ki + Kd) */
  acc = S->Kp + S->Ki + S->Kd;
  S->A0 = (q15_t) __SSAT(acc, 16);
  /* A1 = sat16(-(2 * Kd + Kp)) */
  acc = -(S->Kd + S->Kd + S->Kp);
  S->A1 = (q15_t) __SSAT(acc, 16);
  /* A2 = Kd */
  S->A2 = S->Kd;
 #else
  /* Cortex-M4 / Cortex-M3 path: the sums are formed with __QADD16 */
  /* A0 from Kp, Ki and Kd */
  S->A0 = __QADD16(__QADD16(S->Kp, S->Ki), S->Kd);
  /* -(2 * Kd + Kp) and Kd are packed together into A1; the argument order
   * handed to __PKHBT depends on endianness */
 #ifdef  ARM_MATH_BIG_ENDIAN
  S->A1 = __PKHBT(S->Kd, -__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), 16);
 #else
  S->A1 = __PKHBT(-__QADD16(__QADD16(S->Kd, S->Kd), S->Kp), S->Kd, 16);
 #endif /* #ifdef  ARM_MATH_BIG_ENDIAN */
 #endif /* #ifdef ARM_MATH_CM0_FAMILY */
  /* Common to both paths: zero the 3-sample state buffer only on request */
  if(resetStateFlag)
  {
    memset(&S->state[0], 0, sizeof(q15_t) * 3u);
  }
 }
 /**    
 * @} end of PID group    
 */
--- a/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_init_q31.c
+++ b/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_init_q31.c
@@ -0,0 +1,107 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_pid_init_q31.c    
 *    
 * Description:	Q31 PID Control initialization function     
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * ------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @addtogroup PID    
 * @{    
 */
 /**    
 * @brief  Initialization function for the Q31 PID Control.   
 * @param[in,out] *S points to an instance of the Q31 PID structure.   
 * @param[in]     resetStateFlag  flag to reset the state. 0 = no change in state 1 = reset the state.   
 * @return none.    
 * \par Description:   
 * \par    
 * The <code>resetStateFlag</code> specifies whether to set state to zero or not. \n   
 * The function computes the structure fields: <code>A0</code>, <code>A1</code> and <code>A2</code>
 * using the proportional gain( \c Kp), integral gain( \c Ki) and derivative gain( \c Kd).
 * The state variables are set to all zeros only when <code>resetStateFlag</code> is non-zero.
 */
 void arm_pid_init_q31(
  arm_pid_instance_q31 * S,
  int32_t resetStateFlag)
 {
  /* Derives A0, A1 and A2 from the gains Kp, Ki and Kd using saturating
   * arithmetic and optionally clears the 3-sample state buffer. */
  q31_t acc;                                     /* Saturated partial sums */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 path: sums formed with __QADD */
  /* A0 = Kp + Ki + Kd */
  S->A0 = __QADD(__QADD(S->Kp, S->Ki), S->Kd);
  /* acc = 2 * Kd + Kp */
  acc = __QADD(__QADD(S->Kd, S->Kd), S->Kp);
 #else
  /* Cortex-M0 path: sums formed in 64 bits and clipped to 32 bits */
  /* A0 = Kp + Ki + Kd */
  acc = clip_q63_to_q31((q63_t) S->Kp + S->Ki);
  S->A0 = clip_q63_to_q31((q63_t) acc + S->Kd);
  /* acc = 2 * Kd + Kp */
  acc = clip_q63_to_q31((q63_t) S->Kd + S->Kd);
  acc = clip_q63_to_q31((q63_t) acc + S->Kp);
 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
  /* A1 = -(2 * Kd + Kp), negated with saturation.  The most negative q31
   * value has no positive counterpart: a plain "-acc" would overflow
   * (undefined behaviour for signed types) and in practice leave A1 at the
   * most negative value instead of the largest positive one. */
  S->A1 = (acc < -0x7FFFFFFF) ? 0x7FFFFFFF : -acc;
  /* A2 = Kd */
  S->A2 = S->Kd;
  /* Zero the state buffer (always 3 samples) only when requested */
  if(resetStateFlag)
  {
    memset(S->state, 0, 3u * sizeof(q31_t));
  }
 }
 /**    
 * @} end of PID group    
 */
--- a/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_reset_f32.c
+++ b/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_reset_f32.c
@@ -0,0 +1,65 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_pid_reset_f32.c    
 *    
 * Description:	Floating-point PID Control reset function   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * ------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @addtogroup PID    
 * @{    
 */
 /**    
 * @brief  Reset function for the floating-point PID Control.   
 * @param[in] *S	Instance pointer of PID control data structure.   
 * @return none.    
 * \par Description:   
 * The function resets the state buffer to zeros.    
 */
 void arm_pid_reset_f32(
  arm_pid_instance_f32 * S)
 {
  /* Zero the whole state buffer, which always holds 3 samples */
  memset(&S->state[0], 0, sizeof(float32_t) * 3u);
 }
 /**    
 * @} end of PID group    
 */
--- a/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_reset_q15.c
+++ b/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_reset_q15.c
@@ -0,0 +1,64 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_pid_reset_q15.c    
 *    
 * Description:	Q15 PID Control reset function   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @addtogroup PID    
 * @{    
 */
 /**    
 * @brief  Reset function for the Q15 PID Control.   
 * @param[in] *S		Instance pointer of PID control data structure.   
 * @return none.    
 * \par Description:   
 * The function resets the state buffer to zeros.    
 */
 void arm_pid_reset_q15(
  arm_pid_instance_q15 * S)
 {
  /* Zero the whole state buffer, which always holds 3 samples */
  memset(&S->state[0], 0, sizeof(q15_t) * 3u);
 }
 /**    
 * @} end of PID group    
 */
--- a/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_reset_q31.c
+++ b/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_pid_reset_q31.c
@@ -0,0 +1,65 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_pid_reset_q31.c    
 *    
 * Description:	Q31 PID Control reset function   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * ------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @addtogroup PID    
 * @{    
 */
 /**    
 * @brief  Reset function for the Q31 PID Control.   
 * @param[in] *S	Instance pointer of PID control data structure.   
 * @return none.    
 * \par Description:   
 * The function resets the state buffer to zeros.    
 */
 void arm_pid_reset_q31(
  arm_pid_instance_q31 * S)
 {
  /* The PID state buffer always holds exactly 3 samples; wipe all of them. */
  const size_t stateBytes = sizeof(q31_t) * 3u;
  memset(S->state, 0, stateBytes);
 }
 /**    
 * @} end of PID group    
 */
--- a/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_sin_cos_f32.c
+++ b/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_sin_cos_f32.c
@@ -0,0 +1,149 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_sin_cos_f32.c    
 *    
 * Description:	Sine and Cosine calculation for floating-point values.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_common_tables.h"
 /**    
 * @ingroup groupController    
 */
 /**    
 * @defgroup SinCos Sine Cosine   
 *    
 * Computes the trigonometric sine and cosine values using a combination of table lookup
 * and cubic interpolation.
 * There are separate functions for Q31 and floating-point data types.   
 * The input to the floating-point version is in degrees while the   
 * fixed-point Q31 have a scaled input with the range   
 * [-1 0.9999] mapping to [-180 +180] degrees.   
 *
 * The floating point function also allows values that are out of the usual range. When this happens, the function will
 * take extra time to adjust the input value to the range of [-180 180].
 *   
 * The implementation is based on table lookup using 512 values (FAST_MATH_TABLE_SIZE) together with cubic interpolation.
 * The steps used are:
 *  -# Calculation of the nearest integer table index for the sine (\c indexS) and, a quarter of the table further on, for the cosine (\c indexC).
 *  -# Compute the fractional portion (fract) of the input.
 *  -# For each output, fetch the function values \c f1 and \c f2 at \c index and \c index+1 and the derivative values \c d1 and \c d2 at the same two points (the cosine table values for the sine output, the negated sine table values for the cosine output).
 *  -# With <code>Df = f2 - f1</code> and <code>Dn = 2*pi/FAST_MATH_TABLE_SIZE</code>, evaluate the cubic
 *     <code>f1 + fract*(d1*Dn + fract*((3*Df - (d2 + 2*d1)*Dn) + fract*(Dn*(d1 + d2) - 2*Df)))</code>.
 *  -# Store the two results to <code>*pSinVal</code> and <code>*pCosVal</code>.
 */
 /**    
 * @addtogroup SinCos    
 * @{    
 */
 /**    
 * @brief  Floating-point sin_cos function.   
 * @param[in]  theta    input value in degrees    
 * @param[out] *pSinVal points to the processed sine output.    
 * @param[out] *pCosVal points to the processed cos output.    
 * @return none.   
 */
 void arm_sin_cos_f32(
  float32_t theta,
  float32_t * pSinVal,
  float32_t * pCosVal)
 {
  /*
   * Computes sin(theta) and cos(theta), theta in degrees, by cubic
   * interpolation between neighbouring entries of sinTable_f32.  Each output
   * uses the function values (f1, f2) and derivative values (d1, d2) at the
   * two surrounding table points.
   *
   * Limitation: theta/360 is truncated through an int32_t, so the magnitude
   * of theta must be small enough for that conversion to be defined.
   */
  float32_t fract, in;                             /* Fractional table offset, normalised input */
  uint16_t indexS, indexC;                         /* Table indices for sine and cosine */
  float32_t f1, f2, d1, d2;                        /* Values and derivatives at the two nearest table points */
  int32_t n;                                       /* floor() of the normalised input */
  float32_t findex, Dn, Df, temp;
  /* Scale degrees to turns: multiply by 1/360 */
  in = theta * 0.00277777777778f;
  /* The cast truncates toward zero ... */
  n = (int32_t) in;
  /* ... so step negative inputs one further down, toward -infinity */
  if(in < 0.0f)
  {
    n--;
  }
  /* Map input value to [0 1] */
  in = in - (float32_t) n;
  /* Calculation of index of the table */
  findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
  indexS = (uint16_t) findex;
  /* The subtraction above can round up to exactly 1.0 (tiny negative
   * fractions, exact negative whole turns).  Wrap index AND findex together
   * back to the table start; masking only the index would leave fract equal
   * to the table size and blow up the interpolation. */
  if(indexS >= FAST_MATH_TABLE_SIZE)
  {
    indexS = 0;
    findex -= (float32_t) FAST_MATH_TABLE_SIZE;
  }
  /* Cosine is read from the sine table a quarter period further on */
  indexC = (indexS + (FAST_MATH_TABLE_SIZE / 4)) & 0x1ff;
  /* fractional value calculation */
  fract = findex - (float32_t) indexS;
  /* Cosine: values from the cosine position, derivative is -sin */
  f1 = sinTable_f32[indexC+0];
  f2 = sinTable_f32[indexC+1];
  d1 = -sinTable_f32[indexS+0];
  d2 = -sinTable_f32[indexS+1];
  Dn = 0.0122718463030f; /* fixed spacing between table points: 2*pi/FAST_MATH_TABLE_SIZE */
  Df = f2 - f1;          /* change of the function value across the interval */
  /* Horner evaluation of the cubic, highest-order coefficient first */
  temp = Dn*(d1 + d2) - 2*Df;
  temp = fract*temp + (3*Df - (d2 + 2*d1)*Dn);
  temp = fract*temp + d1*Dn;
  /* Calculation of cosine value */
  *pCosVal = fract*temp + f1;
  /* Sine: values from the sine position, derivative is +cos */
  f1 = sinTable_f32[indexS+0];
  f2 = sinTable_f32[indexS+1];
  d1 = sinTable_f32[indexC+0];
  d2 = sinTable_f32[indexC+1];
  Df = f2 - f1;          /* change of the function value across the interval */
  temp = Dn*(d1 + d2) - 2*Df;
  temp = fract*temp + (3*Df - (d2 + 2*d1)*Dn);
  temp = fract*temp + d1*Dn;
  /* Calculation of sine value */
  *pSinVal = fract*temp + f1;
 }
 /**    
 * @} end of SinCos group    
 */
--- a/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_sin_cos_q31.c
+++ b/CMSIS/DSP_Lib/Source/ControllerFunctions/arm_sin_cos_q31.c
@@ -0,0 +1,122 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_sin_cos_q31.c    
 *    
 * Description:	Cosine & Sine calculation for Q31 values.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_common_tables.h"
 /**    
 * @ingroup groupController    
 */
 /**    
 * @addtogroup SinCos    
 * @{    
 */
 /**    
 * @brief  Q31 sin_cos function.   
 * @param[in]  theta    scaled input value in degrees    
 * @param[out] *pSinVal points to the processed sine output.    
 * @param[out] *pCosVal points to the processed cosine output.    
 * @return none.   
 *    
 * The Q31 input value is in the range [-1 0.999999] and is mapped to a degree value in the range [-180 179].   
 *    
 */
 void arm_sin_cos_q31(
  q31_t theta,
  q31_t * pSinVal,
  q31_t * pCosVal)
 {
  /*
   * Computes sine and cosine of the scaled angle theta by cubic interpolation
   * between neighbouring entries of sinTable_q31.  Each output uses the
   * function values (f1, f2) and the derivative values (d1, d2) at the two
   * surrounding table points; the polynomial is evaluated Horner-style in a
   * 64-bit accumulator and the final sum is clipped to Q31.
   *
   * NOTE(review): several steps left-shift signed values that may be negative
   * or exceed the signed range, and right-shift negative 64-bit values; this
   * relies on two's-complement/arithmetic-shift compiler behaviour - confirm
   * for the toolchains in use.
   */
  q31_t fract;                                 /* Fractional position between the two table points */
  uint16_t indexS, indexC;                     /* Table indices for the sine and cosine positions */
  q31_t f1, f2, d1, d2;                        /* Function values and derivative values at the two nearest table points */
  q31_t Dn, Df;                                /* Table spacing and function-value difference */
  q63_t temp;                                  /* 64-bit accumulator for the polynomial evaluation */
  /* Calculate the nearest index: the top bits of theta, reinterpreted as unsigned, select the table entry */
  indexS = (uint32_t)theta >> CONTROLLER_Q31_SHIFT;
  /* Cosine position: 128 entries further on, wrapped with a 9-bit mask */
  indexC = (indexS + 128) & 0x1ff;
  /* Calculation of fractional value: remove the index bits, then scale up by 8 bits */
  /* (presumably CONTROLLER_Q31_SHIFT is 23 so that fract spans the positive Q31 range - TODO confirm) */
  fract = (theta - (indexS << CONTROLLER_Q31_SHIFT)) << 8;
  /* Cosine: function values from the cosine position, derivative values are the negated sine entries */
  f1 = sinTable_q31[indexC+0];
  f2 = sinTable_q31[indexC+1];
  d1 = -sinTable_q31[indexS+0];
  d2 = -sinTable_q31[indexS+1];
  Dn = 0x1921FB5; // delta between the two points (fixed), in this case 2*pi/FAST_MATH_TABLE_SIZE
  Df = f2 - f1; // delta between the values of the functions
  /* Cubic coefficient: Dn*(d1 + d2) - 2*Df */
  temp = Dn*((q63_t)d1 + d2);
  temp = temp - ((q63_t)Df << 32);
  temp = (q63_t)fract*(temp >> 31);
  /* Add quadratic coefficient: 3*Df - (d2 + 2*d1)*Dn */
  temp = temp + ((3*(q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1))*Dn);
  temp = (q63_t)fract*(temp >> 31);
  /* Add linear coefficient: d1*Dn */
  temp = temp + (q63_t)d1*Dn;
  temp = (q63_t)fract*(temp >> 31);
  /* Calculation of cosine value: add the constant term f1 and saturate to Q31 */
  *pCosVal = clip_q63_to_q31((temp >> 31) + (q63_t)f1);
  /* Sine: function values from the sine position, derivative values are the cosine-position entries */
  f1 = sinTable_q31[indexS+0];
  f2 = sinTable_q31[indexS+1];
  d1 = sinTable_q31[indexC+0];
  d2 = sinTable_q31[indexC+1];
  Df = f2 - f1; // delta between the values of the functions
  /* Same Horner evaluation as for the cosine above */
  temp = Dn*((q63_t)d1 + d2);
  temp = temp - ((q63_t)Df << 32);
  temp = (q63_t)fract*(temp >> 31);
  temp = temp + ((3*(q63_t)Df << 31) - (d2 + ((q63_t)d1 << 1))*Dn);
  temp = (q63_t)fract*(temp >> 31);
  temp = temp + (q63_t)d1*Dn;
  temp = (q63_t)fract*(temp >> 31);
  /* Calculation of sine value: add the constant term f1 and saturate to Q31 */
  *pSinVal = clip_q63_to_q31((temp >> 31) + (q63_t)f1);
 }
 /**    
 * @} end of SinCos group    
 */
--- a/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_cos_f32.c
+++ b/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_cos_f32.c
@@ -0,0 +1,138 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cos_f32.c    
 *    
 * Description:	Fast cosine calculation for floating-point values.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_common_tables.h"
 /**    
 * @ingroup groupFastMath    
 */
 /**    
 * @defgroup cos Cosine    
 *    
 * Computes the trigonometric cosine function using a combination of table lookup   
 * and linear interpolation.  There are separate functions for
 * Q15, Q31, and floating-point data types.   
 * The input to the floating-point version is in radians while the   
 * fixed-point Q15 and Q31 have a scaled input with the range   
 * [0 +0.9999] mapping to [0 2*pi).  The fixed-point range is chosen so that a
 * value of 2*pi wraps around to 0.
 *   
 * The implementation is based on table lookup using 512 values together with linear interpolation.
 * The cosine is read from the sine table with the input advanced by a quarter period.
 * The steps used are:
 *  -# Calculation of the nearest integer table index
 *  -# Compute the fractional portion (fract) of the table index.
 *  -# Fetch the two table values a and b
 *  -# The final result equals <code>(1-fract)*a + fract*b</code>
 *
 * where
 * <pre>
 *    a=Table[index+0];
 *    b=Table[index+1];
 * </pre>
 */
 */
 /**    
 * @addtogroup cos    
 * @{    
 */
 /**   
 * @brief  Fast approximation to the trigonometric cosine function for floating-point data.   
 * @param[in] x input value in radians.   
 * @return cos(x).   
 */
 float32_t arm_cos_f32(
  float32_t x)
 {
  /*
   * Returns an approximation of cos(x), x in radians, by linear interpolation
   * between two neighbouring entries of sinTable_f32 (the input is advanced
   * by a quarter period so the sine table yields the cosine).
   *
   * Limitation: x/(2*pi) is truncated through an int32_t, so the magnitude of
   * x must be small enough for that conversion to be defined.
   */
  float32_t cosVal, fract, in;                   /* Result, fractional table offset, normalised input */
  uint16_t index;                                /* Index variable */
  float32_t a, b;                                /* Two nearest table values */
  int32_t n;                                     /* floor() of the normalised input */
  float32_t findex;
  /* Scale radians to turns (divide by 2*pi) and add 0.25 (pi/2) to read the sine table */
  in = x * 0.159154943092f + 0.25f;
  /* The cast truncates toward zero ... */
  n = (int32_t) in;
  /* ... so step negative inputs one further down, toward -infinity */
  if(in < 0.0f)
  {
    n--;
  }
  /* Map input value to [0 1] */
  in = in - (float32_t) n;
  /* Calculation of index of the table */
  findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
  index = (uint16_t) findex;
  /* The subtraction above can round up to exactly 1.0 (tiny negative
   * fractions, exact negative integers).  Wrap index AND findex together back
   * to the table start; masking only the index would leave fract equal to the
   * table size and produce a wildly wrong result. */
  if(index >= FAST_MATH_TABLE_SIZE)
  {
    index = 0;
    findex -= (float32_t) FAST_MATH_TABLE_SIZE;
  }
  /* fractional value calculation */
  fract = findex - (float32_t) index;
  /* Read two nearest values of input value from the sine table */
  a = sinTable_f32[index];
  b = sinTable_f32[index+1];
  /* Linear interpolation process */
  cosVal = (1.0f-fract)*a + fract*b;
  /* Return the output value */
  return (cosVal);
 }
 /**    
 * @} end of cos group    
 */
--- a/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_cos_q15.c
+++ b/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_cos_q15.c
@@ -0,0 +1,96 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cos_q15.c    
 *    
 * Description:	Fast cosine calculation for Q15 values.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_common_tables.h"
 /**    
 * @ingroup groupFastMath    
 */
 /**    
 * @addtogroup cos    
 * @{    
 */
 /**   
 * @brief Fast approximation to the trigonometric cosine function for Q15 data.   
 * @param[in] x Scaled input value in radians.   
 * @return  cos(x).   
 *   
 * The Q15 input value is in the range [0 +0.9999] and is mapped to a radian
 * value in the range [0 2*pi).
 */
 q15_t arm_cos_q15(
  q15_t x)
 {
  /*
   * Approximates cos() of the scaled Q15 angle by linear interpolation between
   * two neighbouring sinTable_q15 entries, after advancing the input by a
   * quarter period so that the sine table yields the cosine.
   */
  q15_t result;                                  /* Interpolated value, built up in two steps */
  int32_t tableIdx;                              /* Index of the lower table sample */
  q15_t lower, upper;                            /* Two nearest table values */
  q15_t frac;                                    /* Fractional position between them */
  q31_t lowerTerm, upperTerm;                    /* Weighted contributions of the two samples */
  /* quarter-period offset: 0.25 corresponds to pi/2 */
  x = x + 0x2000;
  /* fold a negative (wrapped) value back into the positive range */
  if(x < 0)
  {
    x += 0x8000;
  }
  /* upper bits pick the table entry, the remaining low bits form the fraction */
  tableIdx = (uint32_t)x >> FAST_MATH_Q15_SHIFT;
  frac = (x - (tableIdx << FAST_MATH_Q15_SHIFT)) << 9;
  lower = sinTable_q15[tableIdx];
  upper = sinTable_q15[tableIdx + 1];
  /* weight of the lower sample is (1 - frac), truncated to 16 bits as before */
  lowerTerm = (q31_t)(0x8000 - frac) * lower;
  result = lowerTerm >> 16;
  /* add the upper sample weighted by frac */
  upperTerm = (q31_t)frac * upper;
  result = (q15_t)((((q31_t)result << 16) + upperTerm) >> 16);
  /* the products carry one redundant sign bit; scale back up */
  return result << 1;
 }
 /**    
 * @} end of cos group    
 */
--- a/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_cos_q31.c
+++ b/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_cos_q31.c
@@ -0,0 +1,96 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_cos_q31.c    
 *    
 * Description:	Fast cosine calculation for Q31 values.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_common_tables.h"
 /**    
 * @ingroup groupFastMath    
 */
 /**    
 * @addtogroup cos    
 * @{    
 */
 /**   
 * @brief Fast approximation to the trigonometric cosine function for Q31 data.   
 * @param[in] x Scaled input value in radians.   
 * @return  cos(x).   
 *   
 * The Q31 input value is in the range [0 +0.9999] and is mapped to a radian
 * value in the range [0 2*pi).
 */
 q31_t arm_cos_q31(
  q31_t x)
 {
  /*
   * Approximates cos() of the scaled Q31 angle by linear interpolation between
   * two neighbouring sinTable_q31 entries, after advancing the input by a
   * quarter period so that the sine table yields the cosine.
   *
   * NOTE(review): the interpolation below still shifts signed values and
   * relies on two's-complement/arithmetic-shift compiler behaviour.
   */
  q31_t cosVal;                                  /* Interpolated result */
  int32_t index;                                 /* Index variables */
  q31_t a, b;                                    /* Two nearest table values */
  q31_t fract;                                   /* Fractional position between them */
  uint32_t phase;                                /* Offset and wrapped input, always below 2^31 */
  /* Add 0.25 (pi/2) to read the sine table.  The sum is formed in unsigned
   * arithmetic because a signed "x += 0x20000000" overflows for inputs of
   * 0x60000000 and above.  Clearing bit 31 gives the same value the former
   * "add 0x80000000 when negative" step produced. */
  phase = ((uint32_t) x + 0x20000000u) & 0x7FFFFFFFu;
  /* Calculate the nearest index */
  index = (int32_t) (phase >> FAST_MATH_Q31_SHIFT);
  /* Calculation of fractional value: drop the index bits, then scale up by 9 bits */
  fract = (q31_t) ((phase - ((uint32_t) index << FAST_MATH_Q31_SHIFT)) << 9);
  /* Read two nearest values of input value from the sin table */
  a = sinTable_q31[index];
  b = sinTable_q31[index+1];
  /* Linear interpolation process: (1 - fract)*a, then add fract*b */
  cosVal = (q63_t)(0x80000000-fract)*a >> 32;
  cosVal = (q31_t)((((q63_t)cosVal << 32) + ((q63_t)fract*b)) >> 32);
  /* the products carry one redundant sign bit; scale back up */
  return cosVal << 1;
 }
 /**    
 * @} end of cos group    
 */
--- a/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_sin_f32.c
+++ b/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_sin_f32.c
@@ -0,0 +1,139 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_sin_f32.c    
 *    
 * Description:	Fast sine calculation for floating-point values.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_common_tables.h"
 /**    
 * @ingroup groupFastMath    
 */
 /**    
 * @defgroup sin Sine    
 *    
 * Computes the trigonometric sine function using a combination of table lookup   
 * and linear interpolation.  There are separate functions for
 * Q15, Q31, and floating-point data types.   
 * The input to the floating-point version is in radians while the   
 * fixed-point Q15 and Q31 have a scaled input with the range   
 * [0 +0.9999] mapping to [0 2*pi).  The fixed-point range is chosen so that a
 * value of 2*pi wraps around to 0.
 *   
 * The implementation is based on table lookup using 512 values together with linear interpolation.
 * The steps used are:
 *  -# Calculation of the nearest integer table index
 *  -# Compute the fractional portion (fract) of the table index.
 *  -# Fetch the two table values a and b
 *  -# The final result equals <code>(1-fract)*a + fract*b</code>
 *
 * where
 * <pre>
 *    a=Table[index+0];
 *    b=Table[index+1];
 * </pre>
 */
 */
 /**    
 * @addtogroup sin    
 * @{    
 */
 /**   
 * @brief  Fast approximation to the trigonometric sine function for floating-point data.   
 * @param[in] x input value in radians.   
 * @return  sin(x).   
 */
 float32_t arm_sin_f32(
  float32_t x)
 {
  /*
   * Returns an approximation of sin(x), x in radians, by linear interpolation
   * between two neighbouring entries of sinTable_f32.
   *
   * Limitation: x/(2*pi) is truncated through an int32_t, so the magnitude of
   * x must be small enough for that conversion to be defined.
   */
  float32_t sinVal, fract, in;                           /* Result, fractional table offset, normalised input */
  uint16_t index;                                        /* Index variable */
  float32_t a, b;                                        /* Two nearest table values */
  int32_t n;                                             /* floor() of the normalised input */
  float32_t findex;
  /* Scale radians to turns: divide input by 2*pi */
  in = x * 0.159154943092f;
  /* The cast truncates toward zero ... */
  n = (int32_t) in;
  /* ... so step negative inputs one further down, toward -infinity.
   * The scaled value is tested (as in arm_cos_f32) rather than x. */
  if(in < 0.0f)
  {
    n--;
  }
  /* Map input value to [0 1] */
  in = in - (float32_t) n;
  /* Calculation of index of the table */
  findex = (float32_t) FAST_MATH_TABLE_SIZE * in;
  index = (uint16_t) findex;
  /* The subtraction above can round up to exactly 1.0 (e.g. for very small
   * negative x, or exact negative multiples of 2*pi).  Wrap index AND findex
   * together back to the table start; masking only the index would leave
   * fract equal to the table size and produce a wildly wrong result. */
  if(index >= FAST_MATH_TABLE_SIZE)
  {
    index = 0;
    findex -= (float32_t) FAST_MATH_TABLE_SIZE;
  }
  /* fractional value calculation */
  fract = findex - (float32_t) index;
  /* Read two nearest values of input value from the sin table */
  a = sinTable_f32[index];
  b = sinTable_f32[index+1];
  /* Linear interpolation process */
  sinVal = (1.0f-fract)*a + fract*b;
  /* Return the output value */
  return (sinVal);
 }
 /**    
 * @} end of sin group    
 */
--- a/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_sin_q15.c
+++ b/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_sin_q15.c
@@ -0,0 +1,88 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_sin_q15.c    
 *    
 * Description:	Fast sine calculation for Q15 values.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_common_tables.h"
 /**    
 * @ingroup groupFastMath    
 */
 /**    
 * @addtogroup sin    
 * @{    
 */
 /**
 * @brief Fast approximation to the trigonometric sine function for Q15 data.
 * @param[in] x Scaled input value in radians.
 * @return  sin(x).
 *
 * The Q15 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*pi).
 * A negative input is wrapped by one full period (1.0, i.e. 0x8000) before the
 * table lookup, which leaves the sine unchanged and keeps the table index non-negative.
 */
 q15_t arm_sin_q15(
  q15_t x)
 {
  q15_t sinVal;                                  /* Temporary variables for input, output */
  int32_t index;                                 /* Index variables */
  q15_t a, b;                                    /* Two nearest output values */
  q15_t fract;                                   /* Temporary values for fractional values */
  /* A negative x would convert to a very large unsigned value below and index
   * far beyond the table, so shift it up by one period first. */
  if(x < 0)
  {
    x = (q15_t) (x + 0x8000);
  }
  /* Calculate the nearest index */
  index = (uint32_t)x >> FAST_MATH_Q15_SHIFT;
  /* Calculation of fractional value */
  fract = (x - (index << FAST_MATH_Q15_SHIFT)) << 9;
  /* Read two nearest values of input value from the sin table */
  a = sinTable_q15[index];
  b = sinTable_q15[index+1];
  /* Linear interpolation process: (1 - fract) * a + fract * b, at half scale */
  sinVal = (q31_t)(0x8000-fract)*a >> 16;
  sinVal = (q15_t)((((q31_t)sinVal << 16) + ((q31_t)fract*b)) >> 16);
  /* Restore the scale of the interpolated result */
  return sinVal << 1;
 }
 /**    
 * @} end of sin group    
 */
--- a/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_sin_q31.c
+++ b/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_sin_q31.c
@@ -0,0 +1,87 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_sin_q31.c    
 *    
 * Description:	Fast sine calculation for Q31 values.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_common_tables.h"
 /**    
 * @ingroup groupFastMath    
 */
 /**    
 * @addtogroup sin    
 * @{    
 */
 /**
 * @brief Fast approximation to the trigonometric sine function for Q31 data.
 * @param[in] x Scaled input value in radians.
 * @return  sin(x).
 *
 * The Q31 input value is in the range [0 +0.9999] and is mapped to a radian value in the range [0 2*pi).
 * A negative input is wrapped by one full period (1.0, i.e. 0x80000000) before the
 * table lookup, which leaves the sine unchanged and keeps the table index non-negative.
 */
 q31_t arm_sin_q31(
  q31_t x)
 {
  q31_t sinVal;                                  /* Temporary variables for input, output */
  int32_t index;                                 /* Index variables */
  q31_t a, b;                                    /* Two nearest output values */
  q31_t fract;                                   /* Temporary values for fractional values */
  /* A negative x would convert to a large unsigned value below and index
   * beyond the table, so shift it up by one period first.  The addition is
   * done in unsigned arithmetic, where wrap-around is well defined. */
  if(x < 0)
  {
    x = (q31_t) ((uint32_t) x + 0x80000000u);
  }
  /* Calculate the nearest index */
  index = (uint32_t)x >> FAST_MATH_Q31_SHIFT;
  /* Calculation of fractional value */
  fract = (x - (index << FAST_MATH_Q31_SHIFT)) << 9;
  /* Read two nearest values of input value from the sin table */
  a = sinTable_q31[index];
  b = sinTable_q31[index+1];
  /* Linear interpolation process: (1 - fract) * a + fract * b, at half scale */
  sinVal = (q63_t)(0x80000000-fract)*a >> 32;
  sinVal = (q31_t)((((q63_t)sinVal << 32) + ((q63_t)fract*b)) >> 32);
  /* Restore the scale of the interpolated result */
  return sinVal << 1;
 }
 /**    
 * @} end of sin group    
 */
--- a/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_sqrt_q15.c
+++ b/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_sqrt_q15.c
@@ -0,0 +1,155 @@
 /* ----------------------------------------------------------------------     
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.  
 *     
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *     
 * Project:      CMSIS DSP Library  
 * Title:		arm_sqrt_q15.c     
 *     
 * Description:	Q15 square root function.    
 *     
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_common_tables.h"
 /**     
 * @ingroup groupFastMath     
 */
 /**     
 * @addtogroup SQRT     
 * @{     
 */
  /**
   * @brief  Q15 square root function.
   * @param[in]   in     input value.  The range of the input value is [0 +1) or 0x0000 to 0x7FFF.
   * @param[out]  *pOut  square root of input value.
   * @return ARM_MATH_SUCCESS when the input is greater than zero.
   * For a zero or negative input the function stores *pOut = 0 and
   * returns ARM_MATH_ARGUMENT_ERROR.
   *
   * The input is normalised by an even left shift, an inverse square root is
   * estimated from the float bit pattern and refined with three Newton steps,
   * and the result is multiplied by the normalised input and shifted back.
   */
 arm_status arm_sqrt_q15(
  q15_t in,
  q15_t * pOut)
 {
  q15_t normIn, halfNorm;                        /* Normalised input and half of it */
  q15_t estimate, estSq, corr;                   /* Inverse-root estimate and Newton step terms */
  q15_t lead, evenShift;                         /* Redundant leading bits, rounded down to even */
  q31_t floatBits;                               /* Float bit pattern handled as an integer */
  float32_t asFloat;                             /* Normalised input as a float */
  uint32_t pass;                                 /* Newton iteration counter */
  union
  {
    q31_t fracval;
    float32_t floatval;
  } pun;
  /* Anything that is not strictly positive is rejected with a zero result */
  if(in <= 0)
  {
    *pOut = 0;
    return (ARM_MATH_ARGUMENT_ERROR);
  }
  /* Count the redundant leading bits of the input */
  lead = __CLZ(in) - 17;
  /* Only an even shift can be undone after the root by shifting half as far */
  evenShift = ((lead % 2) == 0) ? lead : (lead - 1);
  normIn = in << evenShift;
  /* Half of the normalised input, used by every Newton step */
  halfNorm = normIn >> 1;
  /* Convert to float */
  asFloat = normIn * 3.051757812500000e-005f;
  /* Reinterpret the float bits and subtract their half from the magic number
   * to obtain the initial inverse square root guess */
  pun.floatval = asFloat;
  floatBits = 0x5f3759df - (pun.fracval >> 1);
  /* Read the guess back as a float and convert it to integer format */
  pun.fracval = floatBits;
  estimate = (q31_t) (pun.floatval * 16384);
  /* Three identical refinement iterations */
  for(pass = 0u; pass < 3u; pass++)
  {
    estSq = (q15_t) (((q31_t) estimate * estimate) >> 15);
    corr = (q15_t) ((estSq * (q31_t) halfNorm) >> 15);
    estimate = ((q15_t) (((q31_t) estimate * (0x3000 - corr)) >> 15)) << 2;
  }
  /* Multiply the inverse square root with the normalised value */
  estimate = ((q15_t) (((q31_t) normIn * estimate) >> 15)) << 1;
  /* Undo the normalisation: the root of an even shift is half that shift */
  *pOut = estimate >> (evenShift / 2);
  return (ARM_MATH_SUCCESS);
 }
 /**     
 * @} end of SQRT group     
 */
--- a/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_sqrt_q31.c
+++ b/CMSIS/DSP_Lib/Source/FastMathFunctions/arm_sqrt_q31.c
@@ -0,0 +1,153 @@
 /* ----------------------------------------------------------------------     
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.  
 *     
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *     
 * Project:      CMSIS DSP Library  
 * Title:		arm_sqrt_q31.c     
 *     
 * Description:	Q31 square root function.    
 *     
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 #include "arm_common_tables.h"
 /**     
 * @ingroup groupFastMath     
 */
 /**     
 * @addtogroup SQRT     
 * @{     
 */
 /**
 * @brief Q31 square root function.
 * @param[in]   in    input value.  The range of the input value is [0 +1) or 0x00000000 to 0x7FFFFFFF.
 * @param[out]  *pOut square root of input value.
 * @return ARM_MATH_SUCCESS when the input is greater than zero.
 * For a zero or negative input the function stores *pOut = 0 and
 * returns ARM_MATH_ARGUMENT_ERROR.
 *
 * The input is normalised by an even left shift, an inverse square root is
 * estimated from the float bit pattern and refined with three Newton steps,
 * and the result is multiplied by the normalised input and shifted back.
 */
 arm_status arm_sqrt_q31(
  q31_t in,
  q31_t * pOut)
 {
  q31_t normIn, halfNorm;                        /* Normalised input and half of it */
  q31_t estimate, estSq, corr;                   /* Inverse-root estimate and Newton step terms */
  q31_t lead, evenShift;                         /* Redundant leading bits, rounded down to even */
  q31_t floatBits;                               /* Float bit pattern handled as an integer */
  float32_t asFloat;                             /* Normalised input as a float */
  uint32_t pass;                                 /* Newton iteration counter */
  union
  {
      q31_t fracval;
      float32_t floatval;
  } pun;
  /* Anything that is not strictly positive is rejected with a zero result */
  if(in <= 0)
  {
    *pOut = 0;
    return (ARM_MATH_ARGUMENT_ERROR);
  }
  /* Count the redundant leading bits of the input */
  lead = __CLZ(in) - 1;
  /* Only an even shift can be undone after the root by shifting half as far */
  evenShift = ((lead % 2) == 0) ? lead : (lead - 1);
  normIn = in << evenShift;
  /* Half of the normalised input, used by every Newton step */
  halfNorm = normIn >> 1;
  /* Convert to float */
  asFloat = normIn * 4.6566128731e-010f;
  /* Reinterpret the float bits and subtract their half from the magic number
   * to obtain the initial inverse square root guess */
  pun.floatval = asFloat;
  floatBits = 0x5f3759df - (pun.fracval >> 1);
  /* Read the guess back as a float and convert it to integer format */
  pun.fracval = floatBits;
  estimate = (q31_t) (pun.floatval * 1073741824);
  /* Three identical refinement iterations */
  for(pass = 0u; pass < 3u; pass++)
  {
    estSq = (q31_t) (((q63_t) estimate * estimate) >> 31);
    corr = (q31_t) ((estSq * (q63_t) halfNorm) >> 31);
    estimate = ((q31_t) (((q63_t) estimate * (0x30000000 - corr)) >> 31)) << 2;
  }
  /* Multiply the inverse square root with the normalised value */
  estimate = ((q31_t) (((q63_t) normIn * estimate) >> 31)) << 1;
  /* Undo the normalisation: the root of an even shift is half that shift */
  *pOut = estimate >> (evenShift / 2);
  return (ARM_MATH_SUCCESS);
 }
 /**     
 * @} end of SQRT group     
 */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_init_q31.c
@@ -0,0 +1,110 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_biquad_cascade_df1_32x64_init_q31.c    
 *    
 * Description:	High precision Q31 Biquad cascade filter initialization function.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup BiquadCascadeDF1_32x64    
 * @{    
 */
 /**    
 * @details    
 *    
 * @param[in,out] *S           	points to an instance of the high precision Q31 Biquad cascade filter structure.    
 * @param[in]     numStages     number of 2nd order stages in the filter.    
 * @param[in]     *pCoeffs      points to the filter coefficients.    
 * @param[in]     *pState       points to the state buffer.    
 * @param[in]     postShift     Shift to be applied after the accumulator.  Varies according to the coefficients format.    
 * @return        none    
 *    
 * <b>Coefficient and State Ordering:</b>    
 *    
 * \par    
 * The coefficients are stored in the array <code>pCoeffs</code> in the following order:    
 * <pre>    
 *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}    
 * </pre>    
 * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,    
 * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,    
 * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.    
 *    
 * \par    
 * The <code>pState</code> points to state variables array and size of each state variable is 1.63 format.    
 * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.    
 * The state variables are arranged in the state array as:    
 * <pre>    
 *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
 * </pre>    
 * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.    
 * The state array has a total length of <code>4*numStages</code> values.    
 * The state variables are updated after each block of data is processed; the coefficients are untouched.    
 */
 void arm_biquad_cas_df1_32x64_init_q31(
  arm_biquad_cas_df1_32x64_ins_q31 * S,
  uint8_t numStages,
  q31_t * pCoeffs,
  q63_t * pState,
  uint8_t postShift)
 {
  /* Four state words are kept per stage */
  const uint32_t stateCount = 4u * (uint32_t) numStages;
  /* Record where the coefficients live */
  S->pCoeffs = pCoeffs;
  /* Record the shift applied to the accumulator output */
  S->postShift = postShift;
  /* Record how many second order stages the filter has */
  S->numStages = numStages;
  /* Start from an all-zero filter history */
  memset(pState, 0, stateCount * sizeof(*pState));
  /* Attach the cleared state buffer to the instance */
  S->pState = pState;
 }
 /**    
 * @} end of BiquadCascadeDF1_32x64 group    
 */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
@@ -0,0 +1,561 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_biquad_cascade_df1_32x64_q31.c    
 *    
 * Description:	High precision Q31 Biquad cascade filter processing function    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter    
 *    
 * This function implements a high precision Biquad cascade filter which operates on    
 * Q31 data values.  The filter coefficients are in 1.31 format and the state variables    
 * are in 1.63 format.  The double precision state variables reduce quantization noise    
 * in the filter and provide a cleaner output.    
 * These filters are particularly useful when implementing filters in which the    
 * singularities are close to the unit circle.  This is common for low pass or high    
 * pass filters with very low cutoff frequencies.    
 *    
 * The function operates on blocks of input and output data    
 * and each call to the function processes <code>blockSize</code> samples through    
 * the filter. <code>pSrc</code> and <code>pDst</code> points to input and output arrays    
 * containing <code>blockSize</code> Q31 values.    
 *    
 * \par Algorithm    
 * Each Biquad stage implements a second order filter using the difference equation:    
 * <pre>    
 *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]    
 * </pre>    
 * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.    
 * \image html Biquad.gif "Single Biquad filter stage"    
 * Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.    
 * Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.    
 * Pay careful attention to the sign of the feedback coefficients.    
 * Some design tools use the difference equation    
 * <pre>    
 *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]    
 * </pre>    
 * In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.    
 *    
 * \par    
 * Higher order filters are realized as a cascade of second order sections.    
 * <code>numStages</code> refers to the number of second order stages used.    
 * For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.    
 * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"    
 * A 9th order filter would be realized with <code>numStages=5</code> second order stages with the coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).    
 *    
 * \par    
 * <code>pState</code> points to the state variable array.
 * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>, and each state variable is in 1.63 format to improve precision.
 * The state variables are arranged in the array as:    
 * <pre>    
 *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
 * </pre>    
 *    
 * \par    
 * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.    
 * The state array has a total length of <code>4*numStages</code> values of data in 1.63 format.    
 * The state variables are updated after each block of data is processed; the coefficients are untouched.    
 *    
 * \par Instance Structure    
 * The coefficients and state variables for a filter are stored together in an instance data structure.    
 * A separate instance structure must be defined for each filter.    
 * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.    
 *    
 * \par Init Function    
 * There is also an associated initialization function which performs the following operations:    
 * - Sets the values of the internal structure fields.    
 * - Zeros out the values in the state buffer.    
 * To do this manually without calling the init function, assign the following subfields of the instance structure:
 * numStages, pCoeffs, postShift, pState. Also set all of the values in pState to zero. 
 *
 * \par    
 * Use of the initialization function is optional.    
 * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.    
 * To place an instance structure into a const data section, the instance structure must be manually initialized.    
 * Set the values in the state buffer to zeros before static initialization.    
 * For example, to statically initialize the filter instance structure use    
 * <pre>    
 *     arm_biquad_cas_df1_32x64_ins_q31 S1 = {numStages, pState, pCoeffs, postShift};    
 * </pre>    
 * where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer;    
 * <code>pCoeffs</code> is the address of the coefficient buffer; <code>postShift</code> shift to be applied which is described in detail below.    
 * \par Fixed-Point Behavior    
 * Care must be taken while using Biquad Cascade 32x64 filter function.    
 * Following issues must be considered:    
 * - Scaling of coefficients    
 * - Filter gain    
 * - Overflow and saturation    
 *    
 * \par    
 * Filter coefficients are represented as fractional values and    
 * restricted to lie in the range <code>[-1 +1)</code>.    
 * The processing function has an additional scaling parameter <code>postShift</code>    
 * which allows the filter coefficients to exceed the range <code>[-1 +1)</code>.
 * At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.    
 * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"    
 * This essentially scales the filter coefficients by <code>2^postShift</code>.    
 * For example, to realize the coefficients    
 * <pre>    
 *    {1.5, -0.8, 1.2, 1.6, -0.9}    
 * </pre>    
 * set the Coefficient array to:    
 * <pre>    
 *    {0.75, -0.4, 0.6, 0.8, -0.45}    
 * </pre>    
 * and set <code>postShift=1</code>    
 *    
 * \par    
 * The second thing to keep in mind is the gain through the filter.    
 * The frequency response of a Biquad filter is a function of its coefficients.    
 * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.    
 * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.    
 * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.    
 *    
 * \par    
 * The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.    
 * This is described in the function specific documentation below.    
 */
 /**    
 * @addtogroup BiquadCascadeDF1_32x64    
 * @{    
 */
 /**    
 * @details    
 * @param[in]  *S points to an instance of the high precision Q31 Biquad cascade filter.    
 * @param[in]  *pSrc points to the block of input data.    
 * @param[out] *pDst points to the block of output data.    
 * @param[in]  blockSize number of samples to process.    
 * @return none.    
 *    
 * \par    
 * The function is implemented using an internal 64-bit accumulator.    
 * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.    
 * Thus, if the accumulator result overflows it wraps around rather than clipping.
 * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).    
 * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to    
 * 1.31 format by discarding the low 32 bits.    
 *    
 * \par    
 * Two related functions are provided in the CMSIS DSP library.    
 * <code>arm_biquad_cascade_df1_q31()</code> implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.    
 * <code>arm_biquad_cascade_df1_fast_q31()</code> implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.    
 */
 void arm_biquad_cas_df1_32x64_q31(
  const arm_biquad_cas_df1_32x64_ins_q31 * S,
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
 {
  /*
   * Runs blockSize samples through S->numStages cascaded Direct Form I
   * biquad sections.  The first stage reads pSrc and writes pDst; every
   * later stage reads and writes pDst in place.  Per stage, five
   * coefficients are consumed from S->pCoeffs and four q63_t state words
   * are loaded from / stored back to S->pState.
   */
  q31_t *pIn = pSrc;                             /*  read pointer; re-aimed at pDst after each stage */
  q31_t *pOut = pDst;                            /*  write pointer; rewound to pDst after each stage */
  q63_t *pState = S->pState;                     /*  state cursor: {x[n-1], x[n-2], y[n-1], y[n-2]} per stage */
  q31_t *pCoeffs = S->pCoeffs;                   /*  coefficient cursor: {b0, b1, b2, a1, a2} per stage */
  q63_t acc;                                     /*  64-bit accumulator                   */
  q31_t Xn1, Xn2;                                /*  input history, held as 32-bit values  */
  q63_t Yn1, Yn2;                                /*  output history, held as 64-bit values */
  q31_t b0, b1, b2, a1, a2;                      /*  coefficients of the current stage     */
  q31_t Xn;                                      /*  current input sample                  */
  int32_t shift = (int32_t) S->postShift + 1;    /*  left shift turning acc into the 64-bit feedback value */
  uint32_t sample, stage = S->numStages;         /*  loop counters                     */
  q31_t acc_l, acc_h;                            /*  low / high 32-bit words of acc     */
  uint32_t uShift = ((uint32_t) S->postShift + 1u);  /*  left shift applied to the high word of acc */
  uint32_t lShift = 32u - uShift;                /*  right shift applied to the low word of acc */
  /* NOTE(review): uShift and lShift are used as shift counts on 32-bit values,
   * so this presumably assumes postShift <= 30 -- TODO confirm against the init function. */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Run the below code for Cortex-M4 and Cortex-M3: inner loop unrolled by four */
  /* NOTE(review): the do/while body executes before the stage count is tested;
   * with numStages == 0 the body still runs and --stage wraps around.
   * Callers presumably guarantee numStages >= 1. */
  do
  {
    /* Load the five coefficients of this stage and advance to the next stage's set */
    b0 = *pCoeffs++;
    b1 = *pCoeffs++;
    b2 = *pCoeffs++;
    a1 = *pCoeffs++;
    a2 = *pCoeffs++;
    /* Load this stage's history; the two input entries are narrowed to 32 bits */
    Xn1 = (q31_t) (pState[0]);
    Xn2 = (q31_t) (pState[1]);
    Yn1 = pState[2];
    Yn2 = pState[3];
    /* Each output is formed as
     *   acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     * Four outputs are produced per pass of the unrolled loop.  Instead of
     * copying history between the four steps, the roles of (Xn, Xn1, Xn2)
     * and of (Yn1, Yn2) rotate from one step to the next; the comment on
     * each step states which variable currently plays which role.
     */
    sample = blockSize >> 2u;
    /* Unrolled part: blockSize / 4 passes.  The loop after this one handles
     ** the remaining 0 to 3 samples. */
    while(sample > 0u)
    {
      /* ---- Step 1: x[n] = Xn, x[n-1] = Xn1, x[n-2] = Xn2, y[n-1] = Yn1, y[n-2] = Yn2 ---- */
      Xn = *pIn++;
      /* acc =  b0 * x[n] */
      acc = (q63_t) Xn *b0;
      /* acc +=  b1 * x[n-1] */
      acc += (q63_t) Xn1 *b1;
      /* acc +=  b2 * x[n-2] */
      acc += (q63_t) Xn2 *b2;
      /* Feedback terms use the 64-bit history.  mult32x64 is defined elsewhere;
       * presumably a 64x32-bit multiply whose result is scaled to fit the accumulator -- TODO confirm */
      /* acc +=  a1 * y[n-1] */
      acc += mult32x64(Yn1, a1);
      /* acc +=  a2 * y[n-2] */
      acc += mult32x64(Yn2, a2);
      /* New 64-bit output overwrites Yn2 (the oldest entry), which now plays y[n-1].
       * NOTE(review): acc is signed; left-shifting a negative value is undefined in
       * ISO C, so this presumably relies on the target compilers' behaviour. */
      Yn2 = acc << shift;
      /* Split acc into its low and high 32-bit words */
      acc_l = acc & 0xffffffff;
      acc_h = (acc >> 32) & 0xffffffff;
      /* Splice the words: top uShift bits of the low word below the high word shifted
       * left by uShift, i.e. the 32 bits of acc starting at bit lShift */
      acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
      /* Store the output in the destination buffer in 1.31 format. */
      *pOut = acc_h;
      /* ---- Step 2: x[n] = Xn2, x[n-1] = Xn, x[n-2] = Xn1, y[n-1] = Yn2, y[n-2] = Yn1 ---- */
      Xn2 = *pIn++;
      /* acc =  b1 * x[n-1] */
      acc = (q63_t) Xn *b1;
      /* acc +=  b0 * x[n] */
      acc += (q63_t) Xn2 *b0;
      /* acc +=  b2 * x[n-2] */
      acc += (q63_t) Xn1 *b2;
      /* acc +=  a1 * y[n-1] */
      acc += mult32x64(Yn2, a1);
      /* acc +=  a2 * y[n-2] */
      acc += mult32x64(Yn1, a2);
      /* New 64-bit output overwrites Yn1, which now plays y[n-1] */
      Yn1 = acc << shift;
      /* Split acc into its low and high 32-bit words */
      acc_l = acc & 0xffffffff;
      acc_h = (acc >> 32) & 0xffffffff;
      /* Splice the words into the 32-bit output (same extraction as step 1) */
      acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
      /* The input for step 3 is fetched here, before the step 2 output is stored */
      Xn1 = *pIn++;
      /* Store the output in the destination buffer in 1.31 format. */
      *(pOut + 1u) = acc_h;
      /* ---- Step 3: x[n] = Xn1, x[n-1] = Xn2, x[n-2] = Xn, y[n-1] = Yn1, y[n-2] = Yn2 ---- */
      /* acc =  b0 * x[n] */
      acc = (q63_t) Xn1 *b0;
      /* acc +=  b1 * x[n-1] */
      acc += (q63_t) Xn2 *b1;
      /* acc +=  b2 * x[n-2] */
      acc += (q63_t) Xn *b2;
      /* acc +=  a1 * y[n-1] */
      acc += mult32x64(Yn1, a1);
      /* acc +=  a2 * y[n-2] */
      acc += mult32x64(Yn2, a2);
      /* New 64-bit output overwrites Yn2, which now plays y[n-1] */
      Yn2 = acc << shift;
      /* Split acc into its low and high 32-bit words */
      acc_l = acc & 0xffffffff;
      acc_h = (acc >> 32) & 0xffffffff;
      /* Splice the words into the 32-bit output (same extraction as step 1) */
      acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
      /* Store the output in the destination buffer in 1.31 format. */
      *(pOut + 2u) = acc_h;
      /* ---- Step 4: x[n] = Xn, x[n-1] = Xn1, x[n-2] = Xn2, y[n-1] = Yn2, y[n-2] = Yn1 ---- */
      Xn = *pIn++;
      /* acc =  b0 * x[n] */
      acc = (q63_t) Xn *b0;
      /* acc +=  b1 * x[n-1] */
      acc += (q63_t) Xn1 *b1;
      /* acc +=  b2 * x[n-2] */
      acc += (q63_t) Xn2 *b2;
      /* acc +=  a1 * y[n-1] */
      acc += mult32x64(Yn2, a1);
      /* acc +=  a2 * y[n-2] */
      acc += mult32x64(Yn1, a2);
      /* New 64-bit output overwrites Yn1, which now plays y[n-1] */
      Yn1 = acc << shift;
      /* Split acc into its low and high 32-bit words */
      acc_l = acc & 0xffffffff;
      acc_h = (acc >> 32) & 0xffffffff;
      /* Splice the words into the 32-bit output (same extraction as step 1) */
      acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
      /* Store the output in the destination buffer in 1.31 format. */
      *(pOut + 3u) = acc_h;
      /* Bring the input history back to its canonical names for the next pass.
       * After four steps Yn1 / Yn2 already hold y[n-1] / y[n-2], so only the
       * input history needs moving. */
      Xn2 = Xn1;
      Xn1 = Xn;
      /* Four outputs were written through fixed offsets; advance the write pointer */
      pOut += 4u;
      /* decrement the loop counter */
      sample--;
    }
    /* Remaining blockSize % 4 samples, one per pass, with a plain history shift. */
    sample = (blockSize & 0x3u);
    while(sample > 0u)
    {
      /* Read the input */
      Xn = *pIn++;
      /* acc =  b0 * x[n] */
      acc = (q63_t) Xn *b0;
      /* acc +=  b1 * x[n-1] */
      acc += (q63_t) Xn1 *b1;
      /* acc +=  b2 * x[n-2] */
      acc += (q63_t) Xn2 *b2;
      /* acc +=  a1 * y[n-1] */
      acc += mult32x64(Yn1, a1);
      /* acc +=  a2 * y[n-2] */
      acc += mult32x64(Yn2, a2);
      /* Shift the history by one sample: */
      /* x[n-2] <- x[n-1], x[n-1] <- x[n], y[n-2] <- y[n-1] */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
      /* y[n-1] <- new output, kept as a 64-bit value */
      Yn1 = acc << shift;
      /* Split acc into its low and high 32-bit words */
      acc_l = acc & 0xffffffff;
      acc_h = (acc >> 32) & 0xffffffff;
      /* Splice the words: the 32 bits of acc starting at bit lShift */
      acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
      /* Store the output in the destination buffer in 1.31 format. */
      *pOut++ = acc_h;
      /* decrement the loop counter */
      sample--;
    }
    /* The output of this stage is the input of the next one: read from pDst */
    pIn = pDst;
    /* ... and overwrite pDst in place */
    pOut = pDst;
    /* Store the updated history back and advance to the next stage's state;
     * the 32-bit input history is widened to q63_t */
    *pState++ = (q63_t) Xn1;
    *pState++ = (q63_t) Xn2;
    *pState++ = Yn1;
    *pState++ = Yn2;
  } while(--stage);
 #else
  /* Run the below code for Cortex-M0: same computation, one sample per pass */
  /* NOTE(review): as above, the body runs once before the stage count is
   * tested; callers presumably guarantee numStages >= 1. */
  do
  {
    /* Load the five coefficients of this stage and advance to the next stage's set */
    b0 = *pCoeffs++;
    b1 = *pCoeffs++;
    b2 = *pCoeffs++;
    a1 = *pCoeffs++;
    a2 = *pCoeffs++;
    /* Load this stage's history; the q63_t input entries are implicitly
     * narrowed on assignment to the q31_t variables */
    Xn1 = pState[0];
    Xn2 = pState[1];
    Yn1 = pState[2];
    Yn2 = pState[3];
    /* Each output is formed as
     *   acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */
    sample = blockSize;
    while(sample > 0u)
    {
      /* Read the input */
      Xn = *pIn++;
      /* acc =  b0 * x[n] */
      acc = (q63_t) Xn *b0;
      /* acc +=  b1 * x[n-1] */
      acc += (q63_t) Xn1 *b1;
      /* acc +=  b2 * x[n-2] */
      acc += (q63_t) Xn2 *b2;
      /* Feedback terms use the 64-bit history.  mult32x64 is defined elsewhere;
       * presumably a 64x32-bit multiply whose result is scaled to fit the accumulator -- TODO confirm */
      /* acc +=  a1 * y[n-1] */
      acc += mult32x64(Yn1, a1);
      /* acc +=  a2 * y[n-2] */
      acc += mult32x64(Yn2, a2);
      /* Shift the history by one sample: */
      /* x[n-2] <- x[n-1], x[n-1] <- x[n], y[n-2] <- y[n-1] */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
      /* y[n-1] <- new output, kept as a 64-bit value.
       * NOTE(review): acc is signed; left-shifting a negative value is undefined in
       * ISO C, so this presumably relies on the target compilers' behaviour. */
      Yn1 = acc << shift;
      /* Split acc into its low and high 32-bit words */
      acc_l = acc & 0xffffffff;
      acc_h = (acc >> 32) & 0xffffffff;
      /* Splice the words: top uShift bits of the low word below the high word shifted
       * left by uShift, i.e. the 32 bits of acc starting at bit lShift */
      acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
      /* Store the output in the destination buffer in 1.31 format. */
      *pOut++ = acc_h;
      /* decrement the loop counter */
      sample--;
    }
    /* The output of this stage is the input of the next one: read from pDst */
    pIn = pDst;
    /* ... and overwrite pDst in place */
    pOut = pDst;
    /* Store the updated history back and advance to the next stage's state */
    *pState++ = (q63_t) Xn1;
    *pState++ = (q63_t) Xn2;
    *pState++ = Yn1;
    *pState++ = Yn2;
  } while(--stage);
 #endif /*    #ifndef ARM_MATH_CM0_FAMILY     */
 }
  /**    
   * @} end of BiquadCascadeDF1_32x64 group    
   */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
@@ -0,0 +1,425 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_biquad_cascade_df1_f32.c    
 *    
 * Description:	Processing function for the    
 *               floating-point Biquad cascade DirectFormI(DF1) filter.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure    
 *    
 * This set of functions implements arbitrary order recursive (IIR) filters.    
 * The filters are implemented as a cascade of second order Biquad sections.    
 * The functions support Q15, Q31 and floating-point data types.  
 * Fast versions of the Q15 and Q31 functions are also supported on Cortex-M4 and Cortex-M3.
 *    
 * The functions operate on blocks of input and output data and each call to the function    
 * processes <code>blockSize</code> samples through the filter.    
 * <code>pSrc</code> points to the array of input data and    
 * <code>pDst</code> points to the array of output data.    
 * Both arrays contain <code>blockSize</code> values.    
 *    
 * \par Algorithm    
 * Each Biquad stage implements a second order filter using the difference equation:    
 * <pre>    
 *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]    
 * </pre>    
 * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.    
 * \image html Biquad.gif "Single Biquad filter stage"    
 * Coefficients <code>b0, b1 and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.    
 * Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.    
 * Pay careful attention to the sign of the feedback coefficients.    
 * Some design tools use the difference equation    
 * <pre>    
 *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] - a1 * y[n-1] - a2 * y[n-2]    
 * </pre>    
 * In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.    
 *    
 * \par    
 * Higher order filters are realized as a cascade of second order sections.    
 * <code>numStages</code> refers to the number of second order stages used.    
 * For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.    
 * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"    
 * A 9th order filter would be realized with <code>numStages=5</code> second order stages with the coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).    
 *    
 * \par    
 * The <code>pState</code> points to state variables array.    
 * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.    
 * The state variables are arranged in the <code>pState</code> array as:    
 * <pre>    
 *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
 * </pre>    
 *    
 * \par    
 * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.    
 * The state array has a total length of <code>4*numStages</code> values.    
 * The state variables are updated after each block of data is processed, the coefficients are untouched.    
 *    
 * \par Instance Structure    
 * The coefficients and state variables for a filter are stored together in an instance data structure.    
 * A separate instance structure must be defined for each filter.    
 * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.    
 * There are separate instance structure declarations for each of the 3 supported data types.    
 *    
 * \par Init Functions    
 * There is also an associated initialization function for each data type.    
 * The initialization function performs the following operations:
 * - Sets the values of the internal structure fields.    
 * - Zeros out the values in the state buffer.    
 * To do this manually without calling the init function, assign the following subfields of the instance structure:
 * numStages, pCoeffs, pState. Also set all of the values in pState to zero. 
 *    
 * \par    
 * Use of the initialization function is optional.    
 * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.    
 * To place an instance structure into a const data section, the instance structure must be manually initialized.    
 * Set the values in the state buffer to zeros before static initialization.    
 * The code below statically initializes each of the 3 different data type filter instance structures    
 * <pre>    
 *     arm_biquad_casd_df1_inst_f32 S1 = {numStages, pState, pCoeffs};    
 *     arm_biquad_casd_df1_inst_q15 S2 = {numStages, pState, pCoeffs, postShift};    
 *     arm_biquad_casd_df1_inst_q31 S3 = {numStages, pState, pCoeffs, postShift};    
 * </pre>    
 * where <code>numStages</code> is the number of Biquad stages in the filter; <code>pState</code> is the address of the state buffer;    
 * <code>pCoeffs</code> is the address of the coefficient buffer; <code>postShift</code> shift to be applied.    
 *    
 * \par Fixed-Point Behavior    
 * Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions.    
 * The following issues must be considered:
 * - Scaling of coefficients    
 * - Filter gain    
 * - Overflow and saturation    
 *    
 * \par    
 * <b>Scaling of coefficients: </b>    
 * Filter coefficients are represented as fractional values and    
 * coefficients are restricted to lie in the range <code>[-1 +1)</code>.    
 * The fixed-point functions have an additional scaling parameter <code>postShift</code>
 * which allows the filter coefficients to exceed the range <code>[-1 +1)</code>.
 * At the output of the filter's accumulator is a shift register which shifts the result by <code>postShift</code> bits.    
 * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"    
 * This essentially scales the filter coefficients by <code>2^postShift</code>.    
 * For example, to realize the coefficients    
 * <pre>    
 *    {1.5, -0.8, 1.2, 1.6, -0.9}    
 * </pre>    
 * set the pCoeffs array to:    
 * <pre>    
 *    {0.75, -0.4, 0.6, 0.8, -0.45}    
 * </pre>    
 * and set <code>postShift=1</code>    
 *    
 * \par    
 * <b>Filter gain: </b>    
 * The frequency response of a Biquad filter is a function of its coefficients.    
 * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.    
 * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.    
 * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.    
 *    
 * \par    
 * <b>Overflow and saturation: </b>    
 * For Q15 and Q31 versions, it is described separately as part of the function specific documentation below.    
 */
 /**    
 * @addtogroup BiquadCascadeDF1    
 * @{    
 */
 /**    
 * @param[in]  *S         points to an instance of the floating-point Biquad cascade structure.    
 * @param[in]  *pSrc      points to the block of input data.    
 * @param[out] *pDst      points to the block of output data.    
 * @param[in]  blockSize  number of samples to process per call.    
 * @return     none.    
 *    
 */
 void arm_biquad_cascade_df1_f32(
  const arm_biquad_casd_df1_inst_f32 * S,
  float32_t * pSrc,
  float32_t * pDst,
  uint32_t blockSize)
 {
  /*
   * Floating-point Direct Form I biquad cascade.
   *
   * Every stage evaluates
   *     y[n] = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
   * over the whole block.  Stage 1 reads pSrc and writes pDst; all later
   * stages filter pDst in place.  The history of each stage is loaded from
   * and stored back to S->pState, so consecutive calls continue the stream.
   */
  float32_t *pRead = pSrc;              /* input of the stage being processed      */
  float32_t *pWrite = pDst;             /* output of the stage being processed     */
  float32_t *pHistory = S->pState;      /* per stage: x[n-1], x[n-2], y[n-1], y[n-2] */
  float32_t *pTaps = S->pCoeffs;        /* per stage: b0, b1, b2, a1, a2           */
  float32_t b0, b1, b2, a1, a2;         /* coefficients of the current stage       */
  float32_t xPrev1, xPrev2;             /* x[n-1], x[n-2]                          */
  float32_t yPrev1, yPrev2;             /* y[n-1], y[n-2]                          */
  float32_t xCur, yCur;                 /* x[n], y[n]                              */
  uint32_t remaining;                   /* inner loop counter                      */
  uint32_t stagesLeft = S->numStages;   /* outer loop counter                      */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Cortex-M4 / Cortex-M3 build: the sample loop is unrolled by four. */
  do
  {
    /* Fetch this stage's coefficients and step to the next stage's set. */
    b0 = pTaps[0];
    b1 = pTaps[1];
    b2 = pTaps[2];
    a1 = pTaps[3];
    a2 = pTaps[4];
    pTaps += 5;
    /* Fetch this stage's history. */
    xPrev1 = pHistory[0];
    xPrev2 = pHistory[1];
    yPrev1 = pHistory[2];
    yPrev2 = pHistory[3];
    /* Groups of four samples.  Each sample is read, filtered and written
     * before the next one is read, so in-place operation stays valid. */
    for(remaining = blockSize >> 2u; remaining > 0u; remaining--)
    {
      /* 1st sample of the group */
      xCur = *pRead++;
      yCur = (b0 * xCur) + (b1 * xPrev1) + (b2 * xPrev2) + (a1 * yPrev1) + (a2 * yPrev2);
      *pWrite++ = yCur;
      xPrev2 = xPrev1;
      xPrev1 = xCur;
      yPrev2 = yPrev1;
      yPrev1 = yCur;
      /* 2nd sample of the group */
      xCur = *pRead++;
      yCur = (b0 * xCur) + (b1 * xPrev1) + (b2 * xPrev2) + (a1 * yPrev1) + (a2 * yPrev2);
      *pWrite++ = yCur;
      xPrev2 = xPrev1;
      xPrev1 = xCur;
      yPrev2 = yPrev1;
      yPrev1 = yCur;
      /* 3rd sample of the group */
      xCur = *pRead++;
      yCur = (b0 * xCur) + (b1 * xPrev1) + (b2 * xPrev2) + (a1 * yPrev1) + (a2 * yPrev2);
      *pWrite++ = yCur;
      xPrev2 = xPrev1;
      xPrev1 = xCur;
      yPrev2 = yPrev1;
      yPrev1 = yCur;
      /* 4th sample of the group */
      xCur = *pRead++;
      yCur = (b0 * xCur) + (b1 * xPrev1) + (b2 * xPrev2) + (a1 * yPrev1) + (a2 * yPrev2);
      *pWrite++ = yCur;
      xPrev2 = xPrev1;
      xPrev1 = xCur;
      yPrev2 = yPrev1;
      yPrev1 = yCur;
    }
    /* Left-over samples when blockSize is not a multiple of four. */
    for(remaining = blockSize & 0x3u; remaining > 0u; remaining--)
    {
      xCur = *pRead++;
      yCur = (b0 * xCur) + (b1 * xPrev1) + (b2 * xPrev2) + (a1 * yPrev1) + (a2 * yPrev2);
      *pWrite++ = yCur;
      xPrev2 = xPrev1;
      xPrev1 = xCur;
      yPrev2 = yPrev1;
      yPrev1 = yCur;
    }
    /* Save the history for the next call and step to the next stage's state. */
    pHistory[0] = xPrev1;
    pHistory[1] = xPrev2;
    pHistory[2] = yPrev1;
    pHistory[3] = yPrev2;
    pHistory += 4;
    /* From the second stage on, the filter runs in place on the output buffer. */
    pRead = pDst;
    pWrite = pDst;
  } while(--stagesLeft > 0u);
 #else
  /* Cortex-M0 build: one sample per loop pass, no unrolling. */
  do
  {
    /* Fetch this stage's coefficients and step to the next stage's set. */
    b0 = pTaps[0];
    b1 = pTaps[1];
    b2 = pTaps[2];
    a1 = pTaps[3];
    a2 = pTaps[4];
    pTaps += 5;
    /* Fetch this stage's history. */
    xPrev1 = pHistory[0];
    xPrev2 = pHistory[1];
    yPrev1 = pHistory[2];
    yPrev2 = pHistory[3];
    for(remaining = blockSize; remaining > 0u; remaining--)
    {
      xCur = *pRead++;
      yCur = (b0 * xCur) + (b1 * xPrev1) + (b2 * xPrev2) + (a1 * yPrev1) + (a2 * yPrev2);
      *pWrite++ = yCur;
      /* Age the history by one sample. */
      xPrev2 = xPrev1;
      xPrev1 = xCur;
      yPrev2 = yPrev1;
      yPrev1 = yCur;
    }
    /* Save the history for the next call and step to the next stage's state. */
    pHistory[0] = xPrev1;
    pHistory[1] = xPrev2;
    pHistory[2] = yPrev1;
    pHistory[3] = yPrev2;
    pHistory += 4;
    /* From the second stage on, the filter runs in place on the output buffer. */
    pRead = pDst;
    pWrite = pDst;
  } while(--stagesLeft > 0u);
 #endif /*   #ifndef ARM_MATH_CM0_FAMILY         */
 }
  /**    
   * @} end of BiquadCascadeDF1 group    
   */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
@@ -0,0 +1,286 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_biquad_cascade_df1_fast_q15.c    
 *    
 * Description:	Fast processing function for the    
 *				Q15 Biquad cascade filter.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup BiquadCascadeDF1    
 * @{    
 */
 /**    
 * @details    
 * @param[in]  *S points to an instance of the Q15 Biquad cascade structure.    
 * @param[in]  *pSrc points to the block of input data.    
 * @param[out] *pDst points to the block of output data.    
 * @param[in]  blockSize number of samples to process per call.    
 * @return none.    
 *    
 * <b>Scaling and Overflow Behavior:</b>    
 * \par    
 * This fast version uses a 32-bit accumulator with 2.30 format.    
 * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.    
 * Thus, if the accumulator result overflows it wraps around and distorts the result.    
 * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25).    
 * The 2.30 accumulator is then shifted by <code>postShift</code> bits and the result truncated to 1.15 format by discarding the low 16 bits.    
 *    
 * \par    
 * Refer to the function <code>arm_biquad_cascade_df1_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.  Both the slow and the fast versions use the same instance structure.    
 * Use the function <code>arm_biquad_cascade_df1_init_q15()</code> to initialize the filter structure.    
 *    
 */
 void arm_biquad_cascade_df1_fast_q15(
  const arm_biquad_casd_df1_inst_q15 * S,
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
 {
  /* Runs blockSize samples through every biquad stage of S in turn.           */
  /* Stage 1 reads pSrc and writes pDst; later stages filter pDst in place.    */
  /* Coefficients, inputs and states are handled as packed pairs of q15 values */
  /* held in 32-bit words so that the dual 16-bit multiply helpers can be used.*/
  q15_t *pIn = pSrc;                             /*  Read pointer: pSrc for stage 1, pDst afterwards */
  q15_t *pOut = pDst;                            /*  Write pointer, rewound to pDst after each stage */
  q31_t in;                                      /*  Packed pair of input samples (or one sample in the tail) */
  q31_t out;                                     /*  Partial sum b0 * x[n], then the saturated output */
  q31_t b0;                                      /*  Packed {b0, 0} coefficient pair              */
  q31_t b1, a1;                                  /*  Packed {b1, b2} and {a1, a2} coefficient pairs */
  q31_t state_in, state_out;                     /*  Packed {x[n-1], x[n-2]} and {y[n-1], y[n-2]} */
  q31_t acc;                                     /*  32-bit accumulator                           */
  int32_t shift = (int32_t) (15 - S->postShift); /*  Right shift applied to acc before saturation */
  q15_t *pState = S->pState;                     /*  State pointer                                */
  q15_t *pCoeffs = S->pCoeffs;                   /*  Coefficient pointer                          */
  uint32_t sample, stage = S->numStages;         /*  Sample-pair counter and stage counter        */
  /* NOTE: this is a do/while whose counter is tested after the decrement, so  */
  /* the body runs once even when numStages is 0 and the unsigned counter then */
  /* wraps. Callers must configure at least one stage.                         */
  do
  {
    /* Load the six q15 coefficients of this stage as three packed words.     */
    /* First word: b0 together with its zero pad */
    b0 = *__SIMD32(pCoeffs)++;
    /* Second word: b1 and b2 */
    b1 = *__SIMD32(pCoeffs)++;
    /* Third word: a1 and a2 */
    a1 = *__SIMD32(pCoeffs)++;
    /* Read the input state values from the state buffer:  x[n-1], x[n-2] */
    state_in = *__SIMD32(pState)++;
    /* Read the output state values from the state buffer:  y[n-1], y[n-2].   */
    /* The post-decrement presumably steps pState back by one packed word so  */
    /* that the two write-backs at the bottom of the stage loop land on this  */
    /* stage's four state entries (depends on __SIMD32 from arm_math.h;       */
    /* TODO confirm).                                                         */
    state_out = *__SIMD32(pState)--;
    /* The sample loop is unrolled by two. Each output is computed as:
     *
     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */
    sample = blockSize >> 1u;
    /* Unrolled part: two outputs per iteration.
     ** A single odd trailing sample is handled after this loop. */
    while(sample > 0u)
    {
      /* Read two input samples as one packed word */
      in = *__SIMD32(pIn)++;
      /* ---- first output of the pair ---- */
      /* out =  b0 * x[n] + 0 * 0 */
      out = __SMUAD(b0, in);
      /* acc =  b1 * x[n-1] + b2 * x[n-2] + out */
      acc = __SMLAD(b1, state_in, out);
      /* acc +=  a1 * y[n-1] + a2 * y[n-2] */
      acc = __SMLAD(a1, state_out, acc);
      /* Shift the accumulator right by (15 - postShift) and saturate to 16 bits. */
      /* The function header describes this step as 2.30 -> 1.15.                 */
      out = __SSAT((acc >> shift), 16);
      /* Advance the filter state by one sample:  */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      /* Yn2 = Yn1    */
      /* Yn1 = out    */
      /* The new x pair is re-packed into state_in and the new y pair into state_out; */
      /* the argument order differs per endianness because the halfword layout of     */
      /* the packed words differs.                                                    */
 #ifndef  ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
 #else
      state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
      state_out = __PKHBT(state_out >> 16, (out), 16);
 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* ---- second output of the pair ---- */
      /* out =  b0 * x[n] + 0 * 0, using the other halfword of the input pair */
      out = __SMUADX(b0, in);
      /* acc =  b1 * x[n-1] + b2 * x[n-2] + out */
      acc = __SMLAD(b1, state_in, out);
      /* acc +=  a1 * y[n-1] + a2 * y[n-2] */
      acc = __SMLAD(a1, state_out, acc);
      /* Same shift-and-saturate step as for the first output */
      out = __SSAT((acc >> shift), 16);
      /* Store both outputs with one packed write: the first output is taken */
      /* from state_out (where it was just packed), the second from out.     */
 #ifndef  ARM_MATH_BIG_ENDIAN
      *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
 #else
      *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* Advance the filter state by one more sample:  */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      /* Yn2 = Yn1    */
      /* Yn1 = out    */
      /* Again re-pack the x pair into state_in and the y pair into state_out */
 #ifndef  ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in >> 16, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
 #else
      state_in = __PKHBT(state_in >> 16, in, 16);
      state_out = __PKHBT(state_out >> 16, out, 16);
 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* Decrement the loop counter */
      sample--;
    }
    /* If blockSize is odd, one sample is left over; process it here
     ** without loop unrolling. */
    if((blockSize & 0x1u) != 0u)
    {
      /* Read a single q15 input sample (not a packed pair) */
      in = *pIn++;
      /* out =  b0 * x[n] + 0 * 0; the multiply variant is chosen per endianness */
 #ifndef  ARM_MATH_BIG_ENDIAN
      out = __SMUAD(b0, in);
 #else
      out = __SMUADX(b0, in);
 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* acc =  b1 * x[n-1] + b2 * x[n-2] + out */
      acc = __SMLAD(b1, state_in, out);
      /* acc +=  a1 * y[n-1] + a2 * y[n-2] */
      acc = __SMLAD(a1, state_out, acc);
      /* Shift the accumulator right by (15 - postShift) and saturate to 16 bits */
      out = __SSAT((acc >> shift), 16);
      /* Store the single output in the destination buffer. */
      *pOut++ = (q15_t) out;
      /* Advance the filter state by one sample:  */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      /* Yn2 = Yn1    */
      /* Yn1 = out    */
      /* Re-pack the x pair into state_in and the y pair into state_out */
 #ifndef  ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
 #else
      state_in = __PKHBT(state_in >> 16, in, 16);
      state_out = __PKHBT(state_out >> 16, out, 16);
 #endif /*   #ifndef  ARM_MATH_BIG_ENDIAN    */
    }
    /*  The first stage goes from the input buffer to the output buffer.  */
    /*  Subsequent (numStages - 1) occur in-place in the output buffer  */
    pIn = pDst;
    /* Reset the output pointer */
    pOut = pDst;
    /*  Store the updated state variables back into the state array;      */
    /*  afterwards pState has moved on to the next stage's state entries. */
    *__SIMD32(pState)++ = state_in;
    *__SIMD32(pState)++ = state_out;
    /* Decrement the loop counter */
    stage--;
  } while(stage > 0u);
 }
 /**    
 * @} end of BiquadCascadeDF1 group    
 */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
@@ -0,0 +1,305 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_biquad_cascade_df1_fast_q31.c    
 *    
 * Description:	Processing function for the    
 *				Q31 Fast Biquad cascade DirectFormI(DF1) filter.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE. 
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup BiquadCascadeDF1    
 * @{    
 */
 /**    
 * @details    
 *    
 * @param[in]  *S        points to an instance of the Q31 Biquad cascade structure.    
 * @param[in]  *pSrc     points to the block of input data.    
 * @param[out] *pDst     points to the block of output data.    
 * @param[in]  blockSize number of samples to process per call.    
 * @return 	   none.    
 *    
 * <b>Scaling and Overflow Behavior:</b>    
 * \par    
 * This function is optimized for speed at the expense of fixed-point precision and overflow protection.    
 * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.    
 * These intermediate results are added to a 2.30 accumulator.    
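 * Finally, the accumulator is shifted left by <code>postShift + 1</code> bits to give the 1.31 result; note that the code below performs this shift without an explicit saturation step.    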
 * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.    
 * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the initialization function    
 * arm_biquad_cascade_df1_init_q31() to initialize filter structure.    
 *    
 * \par    
 * Refer to the function <code>arm_biquad_cascade_df1_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.  Both the slow and the fast versions use the same instance structure.    
 * Use the function <code>arm_biquad_cascade_df1_init_q31()</code> to initialize the filter structure.    
 */
 void arm_biquad_cascade_df1_fast_q31(
  const arm_biquad_casd_df1_inst_q31 * S,
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
 {
  /* Runs blockSize samples through every biquad stage of S in turn.           */
  /* Stage 1 reads pSrc and writes pDst; later stages filter pDst in place.    */
  /*                                                                           */
  /* Each output is                                                            */
  /*    acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
  /* built with two macros supplied by arm_math.h:                             */
  /*    mult_32x32_keep32_R(acc, c, v)     starts a new sum,                   */
  /*    multAcc_32x32_keep32_R(acc, c, v)  adds one more product to it.        */
  /* Both keep only the upper 32 bits of the 64-bit product, i.e. the          */
  /* reference form is acc = (q31_t) (((q63_t) c * v) >> 32); the _R suffix    */
  /* presumably adds rounding (see arm_math.h).                                */
  /*                                                                           */
  /* NOTE: the stage loop is a do/while, so S->numStages must be at least 1.   */
  q31_t acc = 0;                                 /*  accumulator                   */
  q31_t Xn1, Xn2, Yn1, Yn2;                      /*  Filter state variables        */
  q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
  q31_t *pIn = pSrc;                             /*  input pointer initialization  */
  q31_t *pOut = pDst;                            /*  output pointer initialization */
  q31_t *pState = S->pState;                     /*  pState pointer initialization */
  q31_t *pCoeffs = S->pCoeffs;                   /*  coeff pointer initialization  */
  q31_t Xn;                                      /*  temporary input               */
  int32_t shift = (int32_t) S->postShift + 1;    /*  Left shift applied to each accumulator result */
  uint32_t sample, stage = S->numStages;         /*  loop counters                     */
  do
  {
    /* Reading the coefficients of this stage */
    b0 = *pCoeffs++;
    b1 = *pCoeffs++;
    b2 = *pCoeffs++;
    a1 = *pCoeffs++;
    a2 = *pCoeffs++;
    /* Reading the state values of this stage */
    Xn1 = pState[0];
    Xn2 = pState[1];
    Yn1 = pState[2];
    Yn2 = pState[3];
    /* The sample loop is unrolled by four. Instead of shuffling the state    */
    /* after every output, the roles of Xn/Xn1/Xn2 and Yn1/Yn2 rotate from    */
    /* one output to the next; the comments below name the role of each       */
    /* variable at that point.                                                */
    sample = blockSize >> 2u;
    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** A second loop below computes the remaining 1 to 3 samples. */
    while(sample > 0u)
    {
      /* ---- output 1: x[n] = Xn, x[n-1] = Xn1, x[n-2] = Xn2, y[n-1] = Yn1, y[n-2] = Yn2 ---- */
      /* Read the first input */
      Xn = *pIn;
      /* acc =  b1 * x[n-1] */
      mult_32x32_keep32_R(acc, b1, Xn1);
      /* acc +=  b0 * x[n] */
      multAcc_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b2 * x[n-2] */
      multAcc_32x32_keep32_R(acc, b2, Xn2);
      /* acc +=  a1 * y[n-1] */
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
      multAcc_32x32_keep32_R(acc, a2, Yn2);
      /* Convert to 1.31; Yn2 is reused to hold this newest output.            */
      /* The shift is done on the unsigned bit pattern because left-shifting   */
      /* a negative signed value is undefined behaviour in C.                  */
      Yn2 = (q31_t) ((uint32_t) acc << shift);
      /* Read the second input; Xn2 is reused to hold it */
      Xn2 = *(pIn + 1u);
      /* Store the output in the destination buffer. */
      *pOut = Yn2;
      /* ---- output 2: x[n] = Xn2, x[n-1] = Xn, x[n-2] = Xn1, y[n-1] = Yn2, y[n-2] = Yn1 ---- */
      /* acc =  b0 * x[n] */
      mult_32x32_keep32_R(acc, b0, Xn2);
      /* acc +=  b1 * x[n-1] */
      multAcc_32x32_keep32_R(acc, b1, Xn);
      /* acc +=  b2 * x[n-2] */
      multAcc_32x32_keep32_R(acc, b2, Xn1);
      /* acc +=  a1 * y[n-1] */
      multAcc_32x32_keep32_R(acc, a1, Yn2);
      /* acc +=  a2 * y[n-2] */
      multAcc_32x32_keep32_R(acc, a2, Yn1);
      /* Convert to 1.31; Yn1 is reused to hold this newest output */
      Yn1 = (q31_t) ((uint32_t) acc << shift);
      /* Read the third input; Xn1 is reused to hold it */
      Xn1 = *(pIn + 2u);
      /* Store the output in the destination buffer. */
      *(pOut + 1u) = Yn1;
      /* ---- output 3: x[n] = Xn1, x[n-1] = Xn2, x[n-2] = Xn, y[n-1] = Yn1, y[n-2] = Yn2 ---- */
      /* acc =  b0 * x[n] */
      mult_32x32_keep32_R(acc, b0, Xn1);
      /* acc +=  b1 * x[n-1] */
      multAcc_32x32_keep32_R(acc, b1, Xn2);
      /* acc +=  b2 * x[n-2] */
      multAcc_32x32_keep32_R(acc, b2, Xn);
      /* acc +=  a1 * y[n-1] */
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
      multAcc_32x32_keep32_R(acc, a2, Yn2);
      /* Convert to 1.31; Yn2 is reused to hold this newest output */
      Yn2 = (q31_t) ((uint32_t) acc << shift);
      /* Read the fourth input; Xn is reused to hold it */
      Xn = *(pIn + 3u);
      /* Store the output in the destination buffer. */
      *(pOut + 2u) = Yn2;
      pIn += 4u;
      /* ---- output 4: x[n] = Xn, x[n-1] = Xn1, x[n-2] = Xn2, y[n-1] = Yn2, y[n-2] = Yn1 ---- */
      /* acc =  b0 * x[n] */
      mult_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b1 * x[n-1] */
      multAcc_32x32_keep32_R(acc, b1, Xn1);
      /* acc +=  b2 * x[n-2] */
      multAcc_32x32_keep32_R(acc, b2, Xn2);
      /* acc +=  a1 * y[n-1] */
      multAcc_32x32_keep32_R(acc, a1, Yn2);
      /* acc +=  a2 * y[n-2] */
      multAcc_32x32_keep32_R(acc, a2, Yn1);
      /* Restore the canonical roles for the next iteration:                  */
      /* Xn1/Xn2 = the two newest inputs, Yn1/Yn2 = the two newest outputs.   */
      /* Xn2 = Xn1    */
      Xn2 = Xn1;
      /* Convert to 1.31; Yn1 becomes the newest output (Yn2 already holds output 3) */
      Yn1 = (q31_t) ((uint32_t) acc << shift);
      /* Xn1 = Xn     */
      Xn1 = Xn;
      /* Store the output in the destination buffer. */
      *(pOut + 3u) = Yn1;
      pOut += 4u;
      /* decrement the loop counter */
      sample--;
    }
    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
     ** No loop unrolling is used. */
    sample = (blockSize & 0x3u);
    while(sample > 0u)
    {
      /* Read the input */
      Xn = *pIn++;
      /* acc =  b0 * x[n] */
      mult_32x32_keep32_R(acc, b0, Xn);
      /* acc +=  b1 * x[n-1] */
      multAcc_32x32_keep32_R(acc, b1, Xn1);
      /* acc +=  b2 * x[n-2] */
      multAcc_32x32_keep32_R(acc, b2, Xn2);
      /* acc +=  a1 * y[n-1] */
      multAcc_32x32_keep32_R(acc, a1, Yn1);
      /* acc +=  a2 * y[n-2] */
      multAcc_32x32_keep32_R(acc, a2, Yn2);
      /* Convert to 1.31 (unsigned shift, see the note at output 1) */
      acc = (q31_t) ((uint32_t) acc << shift);
      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      /* Yn2 = Yn1    */
      /* Yn1 = acc    */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
      Yn1 = acc;
      /* Store the output in the destination buffer. */
      *pOut++ = acc;
      /* decrement the loop counter */
      sample--;
    }
    /*  The first stage goes from the input buffer to the output buffer. */
    /*  Subsequent stages occur in-place in the output buffer */
    pIn = pDst;
    /* Reset to destination pointer */
    pOut = pDst;
    /*  Store the updated state variables back into the pState array; */
    /*  pState then points at the next stage's four state values.     */
    *pState++ = Xn1;
    *pState++ = Xn2;
    *pState++ = Yn1;
    *pState++ = Yn2;
  } while(--stage);
 }
 /**    
  * @} end of BiquadCascadeDF1 group    
  */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
@@ -0,0 +1,109 @@
 /*-----------------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:        arm_biquad_cascade_df1_init_f32.c    
 *    
 * Description:  floating-point Biquad cascade DirectFormI(DF1) filter initialization function.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * ---------------------------------------------------------------------------*/
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup BiquadCascadeDF1    
 * @{    
 */
 /**    
 * @details    
 * @brief  Initialization function for the floating-point Biquad cascade filter.    
 * @param[in,out] *S           points to an instance of the floating-point Biquad cascade structure.    
 * @param[in]     numStages    number of 2nd order stages in the filter.    
 * @param[in]     *pCoeffs     points to the filter coefficients array.    
 * @param[in]     *pState      points to the state array.    
 * @return        none    
 *    
 *    
 * <b>Coefficient and State Ordering:</b>    
 *    
 * \par    
 * The coefficients are stored in the array <code>pCoeffs</code> in the following order:    
 * <pre>    
 *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}    
 * </pre>    
 *    
 * \par    
 * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,    
 * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,    
 * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.    
 *    
 * \par    
 * The <code>pState</code> is a pointer to state array.    
 * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.    
 * The state variables are arranged in the <code>pState</code> array as:    
 * <pre>    
 *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
 * </pre>    
 * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.    
 * The state array has a total length of <code>4*numStages</code> values.    
 * The state variables are updated after each block of data is processed; the coefficients are untouched.    
 *    
 */
 void arm_biquad_cascade_df1_init_f32(
  arm_biquad_casd_df1_inst_f32 * S,
  uint8_t numStages,
  float32_t * pCoeffs,
  float32_t * pState)
 {
  /* Each stage keeps four state values: x[n-1], x[n-2], y[n-1], y[n-2] */
  const uint32_t stateCount = 4u * (uint32_t) numStages;

  /* Record where the caller's coefficients live and how many stages they describe */
  S->pCoeffs = pCoeffs;
  S->numStages = numStages;

  /* Start from an all-zero filter history, then attach the buffer to the instance */
  memset(pState, 0, stateCount * sizeof(*pState));
  S->pState = pState;
 }
 /**    
 * @} end of BiquadCascadeDF1 group    
 */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
@@ -0,0 +1,111 @@
 /*-----------------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:        arm_biquad_cascade_df1_init_q15.c    
 *    
 * Description:  Q15 Biquad cascade DirectFormI(DF1) filter initialization function.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * ---------------------------------------------------------------------------*/
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup BiquadCascadeDF1    
 * @{    
 */
 /**    
 * @details    
 *    
 * @param[in,out] *S           points to an instance of the Q15 Biquad cascade structure.    
 * @param[in]     numStages    number of 2nd order stages in the filter.    
 * @param[in]     *pCoeffs     points to the filter coefficients.    
 * @param[in]     *pState      points to the state buffer.    
 * @param[in]     postShift    Shift to be applied to the accumulator result. Varies according to the coefficients format    
 * @return        none    
 *    
 * <b>Coefficient and State Ordering:</b>    
 *    
 * \par    
 * The coefficients are stored in the array <code>pCoeffs</code> in the following order:    
 * <pre>    
 *     {b10, 0, b11, b12, a11, a12, b20, 0, b21, b22, a21, a22, ...}    
 * </pre>    
 * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,    
 * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,    
 * and so on.  The <code>pCoeffs</code> array contains a total of <code>6*numStages</code> values.    
 * The zero coefficient stored after <code>b10</code> (that is, between <code>b0</code> and <code>b1</code> of each stage) facilitates use of 16-bit SIMD instructions on the Cortex-M4.    
 *    
 * \par    
 * The state variables are stored in the array <code>pState</code>.    
 * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.    
 * The state variables are arranged in the <code>pState</code> array as:    
 * <pre>    
 *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
 * </pre>    
 * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.    
 * The state array has a total length of <code>4*numStages</code> values.    
 * The state variables are updated after each block of data is processed; the coefficients are untouched.    
 */
 void arm_biquad_cascade_df1_init_q15(
  arm_biquad_casd_df1_inst_q15 * S,
  uint8_t numStages,
  q15_t * pCoeffs,
  q15_t * pState,
  int8_t postShift)
 {
  /* Each stage keeps four state values: x[n-1], x[n-2], y[n-1], y[n-2] */
  const uint32_t stateCount = 4u * (uint32_t) numStages;

  /* Record the filter description: coefficient array, output shift, stage count */
  S->pCoeffs = pCoeffs;
  S->postShift = postShift;
  S->numStages = numStages;

  /* Start from an all-zero filter history, then attach the buffer to the instance */
  memset(pState, 0, stateCount * sizeof(*pState));
  S->pState = pState;
 }
 /**    
 * @} end of BiquadCascadeDF1 group    
 */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
@@ -0,0 +1,111 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_biquad_cascade_df1_init_q31.c    
 *    
 * Description:	Q31 Biquad cascade DirectFormI(DF1) filter initialization function.    
 *    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup BiquadCascadeDF1    
 * @{    
 */
 /**    
 * @details    
 *    
 * @param[in,out] *S           points to an instance of the Q31 Biquad cascade structure.    
 * @param[in]     numStages    number of 2nd order stages in the filter.    
 * @param[in]     *pCoeffs     points to the filter coefficients buffer.    
 * @param[in]     *pState      points to the state buffer.    
 * @param[in]     postShift    Shift to be applied after the accumulator.  Varies according to the coefficients format    
 * @return        none    
 *    
 * <b>Coefficient and State Ordering:</b>    
 *    
 * \par    
 * The coefficients are stored in the array <code>pCoeffs</code> in the following order:    
 * <pre>    
 *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}    
 * </pre>    
 * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,    
 * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,    
 * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.    
 *    
 * \par    
 * <code>pState</code> points to the state variables array.
 * Each Biquad stage has 4 state variables <code>x[n-1], x[n-2], y[n-1],</code> and <code>y[n-2]</code>.    
 * The state variables are arranged in the <code>pState</code> array as:    
 * <pre>    
 *     {x[n-1], x[n-2], y[n-1], y[n-2]}    
 * </pre>    
 * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.    
 * The state array has a total length of <code>4*numStages</code> values.    
 * The state variables are updated after each block of data is processed; the coefficients are untouched.    
 */
 void arm_biquad_cascade_df1_init_q31(
  arm_biquad_casd_df1_inst_q31 * S,
  uint8_t numStages,
  q31_t * pCoeffs,
  q31_t * pState,
  int8_t postShift)
 {
  /* Every stage keeps four history values: x[n-1], x[n-2], y[n-1], y[n-2] */
  const uint32_t stateCount = 4u * (uint32_t) numStages;
  /* Begin with an all-zero filter history */
  memset(pState, 0, stateCount * sizeof(q31_t));
  /* Hook the caller-supplied state and coefficient buffers into the instance */
  S->pState = pState;
  S->pCoeffs = pCoeffs;
  /* Remember the cascade length and the shift applied to the output */
  S->numStages = numStages;
  S->postShift = postShift;
 }
 /**    
 * @} end of BiquadCascadeDF1 group    
 */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
@@ -0,0 +1,411 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_biquad_cascade_df1_q15.c    
 *    
 * Description:	Processing function for the    
 *				Q15 Biquad cascade DirectFormI(DF1) filter.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup BiquadCascadeDF1    
 * @{    
 */
 /**    
 * @brief Processing function for the Q15 Biquad cascade filter.    
 * @param[in]  *S points to an instance of the Q15 Biquad cascade structure.    
 * @param[in]  *pSrc points to the block of input data.    
 * @param[out] *pDst points to the location where the output result is written.    
 * @param[in]  blockSize number of samples to process per call.    
 * @return none.    
 *    
 *    
 * <b>Scaling and Overflow Behavior:</b>    
 * \par    
 * The function is implemented using a 64-bit internal accumulator.    
 * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.    
 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.    
 * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.    
 * The accumulator is then shifted by <code>postShift</code> bits to truncate the result to 1.15 format by discarding the low 16 bits.    
 * Finally, the result is saturated to 1.15 format.    
 *    
 * \par    
 * Refer to the function <code>arm_biquad_cascade_df1_fast_q15()</code> for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.    
 */
 void arm_biquad_cascade_df1_q15(
  const arm_biquad_casd_df1_inst_q15 * S,
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
 {
  /* Runs blockSize samples through every biquad stage of S in turn.  The first
   * stage reads pSrc and writes pDst; later stages filter pDst in place.  Each
   * stage's four history values are loaded from S->pState before the block and
   * written back afterwards.  One of two implementations is compiled in,
   * selected by ARM_MATH_CM0_FAMILY.
   *
   * NOTE(review): both implementations use do/while loops over the stage count,
   * so the stage body executes at least once and the counter is decremented
   * from 0 if S->numStages is 0 - callers presumably must pass numStages >= 1. */
 #ifndef ARM_MATH_CM0_FAMILY
  /* Run the below code for Cortex-M4 and Cortex-M3 */
  q15_t *pIn = pSrc;                             /*  Source pointer                               */
  q15_t *pOut = pDst;                            /*  Destination pointer                          */
  q31_t in;                                      /*  Input word: two packed samples (one in the tail case) */
  q31_t out;                                     /*  Temporary variable to hold output value      */
  q31_t b0;                                      /*  32-bit word read from the {b0, 0} coefficient pair */
  q31_t b1, a1;                                  /*  32-bit words read from the {b1, b2} and {a1, a2} pairs */
  q31_t state_in, state_out;                     /*  Packed state: {x[n-1], x[n-2]} and {y[n-1], y[n-2]} */
  q31_t acc_l, acc_h;                            /*  Low and high 32-bit halves of acc            */
  q63_t acc;                                     /*  Accumulator                                  */
  int32_t lShift = (15 - (int32_t) S->postShift);       /*  Right shift applied to the accumulator       */
  q15_t *pState = S->pState;                     /*  State pointer                                */
  q15_t *pCoeffs = S->pCoeffs;                   /*  Coefficient pointer                          */
  uint32_t sample, stage = (uint32_t) S->numStages;     /*  Sample and stage loop counters               */
  /* Complementary left shift used to bring the high accumulator word down.
   * NOTE(review): if postShift were 15, lShift would be 0 and uShift 32; a
   * 32-bit shift by 32 is undefined in C - confirm the permitted postShift range. */
  int32_t uShift = (32 - lShift);
  do
  {
    /* NOTE(review): __SIMD32 is defined outside this file; it presumably views
     * the q15_t pointer as a pointer to 32-bit words, so each ++ / -- below
     * steps over two q15_t values - confirm against arm_math.h. */
    /* Read the b0 and 0 coefficients using SIMD  */
    b0 = *__SIMD32(pCoeffs)++;
    /* Read the b1 and b2 coefficients using SIMD */
    b1 = *__SIMD32(pCoeffs)++;
    /* Read the a1 and a2 coefficients using SIMD */
    a1 = *__SIMD32(pCoeffs)++;
    /* Read the input state values from the state buffer:  x[n-1], x[n-2] */
    state_in = *__SIMD32(pState)++;
    /* Read the output state values from the state buffer:  y[n-1], y[n-2].
     * The post-decrement undoes the increment above, so pState is back at the
     * start of this stage's state for the write-back at the end of the stage. */
    state_out = *__SIMD32(pState)--;
    /* Apply loop unrolling and compute 2 output values per iteration. */
    /*      The variable acc holds the output value being computed:
     *
     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */
    sample = blockSize >> 1u;
    /* First part of the processing with loop unrolling.  Compute 2 outputs at a time.
     ** The code after this loop computes the remaining 1 sample, if any. */
    while(sample > 0u)
    {
      /* Read two input samples as one 32-bit word */
      in = *__SIMD32(pIn)++;
      /* ---- first sample of the pair ---- */
      /* out =  b0 * x[n] + 0 * 0 */
      out = __SMUAD(b0, in);
      /* acc =  b1 * x[n-1] +  b2 * x[n-2] + out */
      acc = __SMLALD(b1, state_in, out);
      /* acc +=  a1 * y[n-1] +  a2 * y[n-2] */
      acc = __SMLALD(a1, state_out, acc);
      /* Form the low 32 bits of (acc >> lShift) from the two accumulator halves,
       * then saturate the result to 16 bits */
      /* Calc lower part of acc */
      acc_l = acc & 0xffffffff;
      /* Calc upper part of acc */
      acc_h = (acc >> 32) & 0xffffffff;
      /* Apply shift for lower part of acc and upper part of acc */
      out = (uint32_t) acc_l >> lShift | acc_h << uShift;
      out = __SSAT(out, 16);
      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      /* Yn2 = Yn1    */
      /* Yn1 = acc   */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
 #ifndef  ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
 #else
      state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
      state_out = __PKHBT(state_out >> 16, (out), 16);
 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* ---- second sample of the pair ---- */
      /* out =  b0 * x[n] + 0 * 0  (the exchanging multiply selects the other half of in) */
      out = __SMUADX(b0, in);
      /* acc =  b1 * x[n-1] +  b2 * x[n-2] + out */
      acc = __SMLALD(b1, state_in, out);
      /* acc +=  a1 * y[n-1] + a2 * y[n-2] */
      acc = __SMLALD(a1, state_out, acc);
      /* Form the low 32 bits of (acc >> lShift) from the two accumulator halves,
       * then saturate the result to 16 bits */
      /* Calc lower part of acc */
      acc_l = acc & 0xffffffff;
      /* Calc upper part of acc */
      acc_h = (acc >> 32) & 0xffffffff;
      /* Apply shift for lower part of acc and upper part of acc */
      out = (uint32_t) acc_l >> lShift | acc_h << uShift;
      out = __SSAT(out, 16);
      /* Store both outputs of this iteration as one 32-bit word: the first
       * output is taken from state_out (packed there above), the second from out. */
 #ifndef  ARM_MATH_BIG_ENDIAN
      *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
 #else
      *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      /* Yn2 = Yn1    */
      /* Yn1 = acc   */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
 #ifndef  ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in >> 16, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
 #else
      state_in = __PKHBT(state_in >> 16, in, 16);
      state_out = __PKHBT(state_out >> 16, out, 16);
 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* Decrement the loop counter */
      sample--;
    }
    /* If the blockSize is not a multiple of 2, compute the one remaining output sample here.
     ** No loop unrolling is used. */
    if((blockSize & 0x1u) != 0u)
    {
      /* Read a single q15 input sample (widened to 32 bits) */
      in = *pIn++;
      /* out =  b0 * x[n] + 0 * 0 */
 #ifndef  ARM_MATH_BIG_ENDIAN
      out = __SMUAD(b0, in);
 #else
      out = __SMUADX(b0, in);
 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
      /* acc =  b1 * x[n-1] + b2 * x[n-2] + out */
      acc = __SMLALD(b1, state_in, out);
      /* acc +=  a1 * y[n-1] + a2 * y[n-2] */
      acc = __SMLALD(a1, state_out, acc);
      /* Form the low 32 bits of (acc >> lShift) from the two accumulator halves,
       * then saturate the result to 16 bits */
      /* Calc lower part of acc */
      acc_l = acc & 0xffffffff;
      /* Calc upper part of acc */
      acc_h = (acc >> 32) & 0xffffffff;
      /* Apply shift for lower part of acc and upper part of acc */
      out = (uint32_t) acc_l >> lShift | acc_h << uShift;
      out = __SSAT(out, 16);
      /* Store the output in the destination buffer. */
      *pOut++ = (q15_t) out;
      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      /* Yn2 = Yn1    */
      /* Yn1 = acc   */
      /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
      /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
 #ifndef  ARM_MATH_BIG_ENDIAN
      state_in = __PKHBT(in, state_in, 16);
      state_out = __PKHBT(out, state_out, 16);
 #else
      state_in = __PKHBT(state_in >> 16, in, 16);
      state_out = __PKHBT(state_out >> 16, out, 16);
 #endif /*   #ifndef  ARM_MATH_BIG_ENDIAN    */
    }
    /*  The first stage goes from the input buffer to the output buffer.  */
    /*  Subsequent stages operate in place on the output buffer  */
    pIn = pDst;
    /* Reset the output pointer */
    pOut = pDst;
    /*  Store the updated state variables back into the state array; this also
     *  advances pState to the next stage's state */
    *__SIMD32(pState)++ = state_in;
    *__SIMD32(pState)++ = state_out;
    /* Decrement the loop counter */
    stage--;
  } while(stage > 0u);
 #else
  /* Run the below code for Cortex-M0 */
  q15_t *pIn = pSrc;                             /*  Source pointer                               */
  q15_t *pOut = pDst;                            /*  Destination pointer                          */
  q15_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
  q15_t Xn1, Xn2, Yn1, Yn2;                      /*  Filter state variables        */
  q15_t Xn;                                      /*  temporary input               */
  q63_t acc;                                     /*  Accumulator                                  */
  int32_t shift = (15 - (int32_t) S->postShift); /*  Right shift applied to the accumulator       */
  q15_t *pState = S->pState;                     /*  State pointer                                */
  q15_t *pCoeffs = S->pCoeffs;                   /*  Coefficient pointer                          */
  uint32_t sample, stage = (uint32_t) S->numStages;     /*  Sample and stage loop counters               */
  do
  {
    /* Reading the coefficients: six q15 values per stage, the second being a zero pad */
    b0 = *pCoeffs++;
    pCoeffs++;  // skip the 0 coefficient
    b1 = *pCoeffs++;
    b2 = *pCoeffs++;
    a1 = *pCoeffs++;
    a2 = *pCoeffs++;
    /* Reading the state values (pState itself is advanced only at write-back) */
    Xn1 = pState[0];
    Xn2 = pState[1];
    Yn1 = pState[2];
    Yn2 = pState[3];
    /*      The variable acc holds the output value that is computed:
     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */
    sample = blockSize;
    while(sample > 0u)
    {
      /* Read the input */
      Xn = *pIn++;
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* Each product is formed in 32 bits and then added to the 64-bit accumulator */
      /* acc =  b0 * x[n] */
      acc = (q31_t) b0 *Xn;
      /* acc +=  b1 * x[n-1] */
      acc += (q31_t) b1 *Xn1;
      /* acc +=  b[2] * x[n-2] */
      acc += (q31_t) b2 *Xn2;
      /* acc +=  a1 * y[n-1] */
      acc += (q31_t) a1 *Yn1;
      /* acc +=  a2 * y[n-2] */
      acc += (q31_t) a2 *Yn2;
      /* Apply the post shift and saturate the result to 16 bits */
      acc = __SSAT((acc >> shift), 16);
      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      /* Yn2 = Yn1    */
      /* Yn1 = acc    */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
      Yn1 = (q15_t) acc;
      /* Store the output in the destination buffer. */
      *pOut++ = (q15_t) acc;
      /* decrement the loop counter */
      sample--;
    }
    /*  The first stage goes from the input buffer to the output buffer. */
    /*  Subsequent stages occur in-place in the output buffer */
    pIn = pDst;
    /* Reset to destination pointer */
    pOut = pDst;
    /*  Store the updated state variables back into the pState array; this also
     *  advances pState to the next stage's state */
    *pState++ = Xn1;
    *pState++ = Xn2;
    *pState++ = Yn1;
    *pState++ = Yn2;
  } while(--stage);
 #endif /*     #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
 * @} end of BiquadCascadeDF1 group    
 */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
@@ -0,0 +1,405 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_biquad_cascade_df1_q31.c    
 *    
 * Description:	Processing function for the    
 *				Q31 Biquad cascade filter    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.     
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup BiquadCascadeDF1    
 * @{    
 */
 /**    
 * @brief Processing function for the Q31 Biquad cascade filter.    
 * @param[in]  *S         points to an instance of the Q31 Biquad cascade structure.    
 * @param[in]  *pSrc      points to the block of input data.    
 * @param[out] *pDst      points to the block of output data.    
 * @param[in]  blockSize  number of samples to process per call.    
 * @return none.    
 *    
 * <b>Scaling and Overflow Behavior:</b>    
 * \par    
 * The function is implemented using an internal 64-bit accumulator.    
 * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.    
 * Thus, if the accumulator result overflows, it wraps around rather than clipping.
 * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).    
 * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by <code>postShift</code> bits and the result truncated to    
 * 1.31 format by discarding the low 32 bits.    
 *    
 * \par    
 * Refer to the function <code>arm_biquad_cascade_df1_fast_q31()</code> for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.    
 */
 void arm_biquad_cascade_df1_q31(
  const arm_biquad_casd_df1_inst_q31 * S,
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
 {
  /* Runs blockSize samples through every biquad stage of S in turn.  The first
   * stage reads pSrc and writes pDst; later stages filter pDst in place.  Each
   * stage's four history values are loaded from S->pState before the block and
   * written back afterwards.  Per sample, each stage computes
   *
   *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
   *
   * in a 64-bit accumulator and outputs the low 32 bits of (acc >> lShift).
   *
   * NOTE(review): the stage loops are do/while loops, so the stage body runs at
   * least once and the counter is decremented from 0 if S->numStages is 0 -
   * callers presumably must pass numStages >= 1. */
  q63_t acc;                                     /*  accumulator                   */
  uint32_t uShift = ((uint32_t) S->postShift + 1u);     /*  Left shift for the high accumulator word */
  uint32_t lShift = 32u - uShift;                /*  Right shift applied to the accumulator */
  q31_t *pIn = pSrc;                             /*  input pointer initialization  */
  q31_t *pOut = pDst;                            /*  output pointer initialization */
  q31_t *pState = S->pState;                     /*  pState pointer initialization */
  q31_t *pCoeffs = S->pCoeffs;                   /*  coeff pointer initialization  */
  q31_t Xn1, Xn2, Yn1, Yn2;                      /*  Filter state variables        */
  q31_t b0, b1, b2, a1, a2;                      /*  Filter coefficients           */
  q31_t Xn;                                      /*  temporary input               */
  uint32_t sample, stage = S->numStages;         /*  loop counters                     */
  /* The guard must be ARM_MATH_CM0_FAMILY (the macro the rest of the library
   * tests); the previous spelling ARM_MATH_CM0_FAMILY_FAMILY never matched, so
   * the Cortex-M0 branch below could not be selected. */
 #ifndef ARM_MATH_CM0_FAMILY
  q31_t acc_l, acc_h;                            /*  low and high 32-bit halves of acc */
  /* Run the below code for Cortex-M4 and Cortex-M3 */
  do
  {
    /* Reading the coefficients: five q31 values per stage */
    b0 = *pCoeffs++;
    b1 = *pCoeffs++;
    b2 = *pCoeffs++;
    a1 = *pCoeffs++;
    a2 = *pCoeffs++;
    /* Reading the state values (pState itself is advanced only at write-back) */
    Xn1 = pState[0];
    Xn2 = pState[1];
    Yn1 = pState[2];
    Yn2 = pState[3];
    /* Apply loop unrolling and compute 4 output values per iteration.  Inside
     * the unrolled body the variables Xn, Xn1, Xn2 and Yn1, Yn2 rotate roles so
     * that no state copies are needed between consecutive samples. */
    sample = blockSize >> 2u;
    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
     ** a second loop below computes the remaining 1 to 3 samples. */
    while(sample > 0u)
    {
      /* Read the first input */
      Xn = *pIn++;
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
      acc = (q63_t) b0 *Xn;
      /* acc +=  b1 * x[n-1] */
      acc += (q63_t) b1 *Xn1;
      /* acc +=  b[2] * x[n-2] */
      acc += (q63_t) b2 *Xn2;
      /* acc +=  a1 * y[n-1] */
      acc += (q63_t) a1 *Yn1;
      /* acc +=  a2 * y[n-2] */
      acc += (q63_t) a2 *Yn2;
      /* Form the 32-bit output from the two accumulator halves; Yn2 is reused
       * to hold this newest output */
      /* Calc lower part of acc */
      acc_l = acc & 0xffffffff;
      /* Calc upper part of acc */
      acc_h = (acc >> 32) & 0xffffffff;
      /* Apply shift for lower part of acc and upper part of acc */
      Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
      /* Store the output in the destination buffer. */
      *pOut++ = Yn2;
      /* Read the second input; Xn2 is reused to hold it */
      Xn2 = *pIn++;
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
      acc = (q63_t) b0 *Xn2;
      /* acc +=  b1 * x[n-1] */
      acc += (q63_t) b1 *Xn;
      /* acc +=  b[2] * x[n-2] */
      acc += (q63_t) b2 *Xn1;
      /* acc +=  a1 * y[n-1] */
      acc += (q63_t) a1 *Yn2;
      /* acc +=  a2 * y[n-2] */
      acc += (q63_t) a2 *Yn1;
      /* Form the 32-bit output; Yn1 is reused to hold this newest output */
      /* Calc lower part of acc */
      acc_l = acc & 0xffffffff;
      /* Calc upper part of acc */
      acc_h = (acc >> 32) & 0xffffffff;
      /* Apply shift for lower part of acc and upper part of acc */
      Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
      /* Store the output in the destination buffer. */
      *pOut++ = Yn1;
      /* Read the third input; Xn1 is reused to hold it */
      Xn1 = *pIn++;
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
      acc = (q63_t) b0 *Xn1;
      /* acc +=  b1 * x[n-1] */
      acc += (q63_t) b1 *Xn2;
      /* acc +=  b[2] * x[n-2] */
      acc += (q63_t) b2 *Xn;
      /* acc +=  a1 * y[n-1] */
      acc += (q63_t) a1 *Yn1;
      /* acc +=  a2 * y[n-2] */
      acc += (q63_t) a2 *Yn2;
      /* Form the 32-bit output; Yn2 is reused to hold this newest output */
      /* Calc lower part of acc */
      acc_l = acc & 0xffffffff;
      /* Calc upper part of acc */
      acc_h = (acc >> 32) & 0xffffffff;
      /* Apply shift for lower part of acc and upper part of acc */
      Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
      /* Store the output in the destination buffer. */
      *pOut++ = Yn2;
      /* Read the fourth input */
      Xn = *pIn++;
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
      acc = (q63_t) b0 *Xn;
      /* acc +=  b1 * x[n-1] */
      acc += (q63_t) b1 *Xn1;
      /* acc +=  b[2] * x[n-2] */
      acc += (q63_t) b2 *Xn2;
      /* acc +=  a1 * y[n-1] */
      acc += (q63_t) a1 *Yn2;
      /* acc +=  a2 * y[n-2] */
      acc += (q63_t) a2 *Yn1;
      /* Form the 32-bit output; Yn1 is reused to hold this newest output */
      /* Calc lower part of acc */
      acc_l = acc & 0xffffffff;
      /* Calc upper part of acc */
      acc_h = (acc >> 32) & 0xffffffff;
      /* Apply shift for lower part of acc and upper part of acc */
      Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
      /* Restore the canonical naming for the next iteration.  At this point Xn
       * is the fourth input and Xn1 the third, while Yn1 / Yn2 already hold
       * the fourth / third outputs, so only the input history needs moving: */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      Xn2 = Xn1;
      Xn1 = Xn;
      /* Store the output in the destination buffer. */
      *pOut++ = Yn1;
      /* decrement the loop counter */
      sample--;
    }
    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
     ** No loop unrolling is used. */
    sample = (blockSize & 0x3u);
    while(sample > 0u)
    {
      /* Read the input */
      Xn = *pIn++;
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
      acc = (q63_t) b0 *Xn;
      /* acc +=  b1 * x[n-1] */
      acc += (q63_t) b1 *Xn1;
      /* acc +=  b[2] * x[n-2] */
      acc += (q63_t) b2 *Xn2;
      /* acc +=  a1 * y[n-1] */
      acc += (q63_t) a1 *Yn1;
      /* acc +=  a2 * y[n-2] */
      acc += (q63_t) a2 *Yn2;
      /* Apply the output shift; the value is narrowed to 32 bits below */
      acc = acc >> lShift;
      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      /* Yn2 = Yn1    */
      /* Yn1 = acc    */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
      Yn1 = (q31_t) acc;
      /* Store the output in the destination buffer. */
      *pOut++ = (q31_t) acc;
      /* decrement the loop counter */
      sample--;
    }
    /*  The first stage goes from the input buffer to the output buffer. */
    /*  Subsequent stages occur in-place in the output buffer */
    pIn = pDst;
    /* Reset to destination pointer */
    pOut = pDst;
    /*  Store the updated state variables back into the pState array; this also
     *  advances pState to the next stage's state */
    *pState++ = Xn1;
    *pState++ = Xn2;
    *pState++ = Yn1;
    *pState++ = Yn2;
  } while(--stage);
 #else
  /* Run the below code for Cortex-M0 */
  do
  {
    /* Reading the coefficients: five q31 values per stage */
    b0 = *pCoeffs++;
    b1 = *pCoeffs++;
    b2 = *pCoeffs++;
    a1 = *pCoeffs++;
    a2 = *pCoeffs++;
    /* Reading the state values (pState itself is advanced only at write-back) */
    Xn1 = pState[0];
    Xn2 = pState[1];
    Yn1 = pState[2];
    Yn2 = pState[3];
    /*      The variable acc holds the output value that is computed:
     *    acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
     */
    sample = blockSize;
    while(sample > 0u)
    {
      /* Read the input */
      Xn = *pIn++;
      /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      /* acc =  b0 * x[n] */
      acc = (q63_t) b0 *Xn;
      /* acc +=  b1 * x[n-1] */
      acc += (q63_t) b1 *Xn1;
      /* acc +=  b[2] * x[n-2] */
      acc += (q63_t) b2 *Xn2;
      /* acc +=  a1 * y[n-1] */
      acc += (q63_t) a1 *Yn1;
      /* acc +=  a2 * y[n-2] */
      acc += (q63_t) a2 *Yn2;
      /* Apply the output shift; the value is narrowed to 32 bits below */
      acc = acc >> lShift;
      /* Every time after the output is computed state should be updated. */
      /* The states should be updated as:  */
      /* Xn2 = Xn1    */
      /* Xn1 = Xn     */
      /* Yn2 = Yn1    */
      /* Yn1 = acc    */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
      Yn1 = (q31_t) acc;
      /* Store the output in the destination buffer. */
      *pOut++ = (q31_t) acc;
      /* decrement the loop counter */
      sample--;
    }
    /*  The first stage goes from the input buffer to the output buffer. */
    /*  Subsequent stages occur in-place in the output buffer */
    pIn = pDst;
    /* Reset to destination pointer */
    pOut = pDst;
    /*  Store the updated state variables back into the pState array; this also
     *  advances pState to the next stage's state */
    *pState++ = Xn1;
    *pState++ = Xn2;
    *pState++ = Yn1;
    *pState++ = Yn2;
  } while(--stage);
 #endif /*  #ifndef ARM_MATH_CM0_FAMILY */
 }
 /**    
  * @} end of BiquadCascadeDF1 group    
  */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
@@ -0,0 +1,603 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        31. July 2014 
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_biquad_cascade_df2T_f32.c    
 *    
 * Description:  Processing function for the floating-point transposed    
 *               direct form II Biquad cascade filter.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**       
 * @ingroup groupFilters       
 */
 /**       
 * @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure       
 *       
 * This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.       
 * The filters are implemented as a cascade of second order Biquad sections.       
 * These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.      
 * Only floating-point data is supported.       
 *       
 * These functions operate on blocks of input and output data and each call to the function
 * processes <code>blockSize</code> samples through the filter.       
 * <code>pSrc</code> points to the array of input data and       
 * <code>pDst</code> points to the array of output data.       
 * Both arrays contain <code>blockSize</code> values.       
 *       
 * \par Algorithm       
 * Each Biquad stage implements a second order filter using the difference equation:       
 * <pre>       
 *    y[n] = b0 * x[n] + d1       
 *    d1 = b1 * x[n] + a1 * y[n] + d2       
 *    d2 = b2 * x[n] + a2 * y[n]       
 * </pre>       
 * where d1 and d2 represent the two state values.       
 *       
 * \par       
 * A Biquad filter using a transposed Direct Form II structure is shown below.       
 * \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"       
 * Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.       
 * Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.       
 * Pay careful attention to the sign of the feedback coefficients.       
 * Some design tools flip the sign of the feedback coefficients:       
 * <pre>       
 *    y[n] = b0 * x[n] + d1;       
 *    d1 = b1 * x[n] - a1 * y[n] + d2;       
 *    d2 = b2 * x[n] - a2 * y[n];       
 * </pre>       
 * In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.       
 *       
 * \par       
 * Higher order filters are realized as a cascade of second order sections.       
 * <code>numStages</code> refers to the number of second order stages used.       
 * For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.       
 * A 9th order filter would be realized with <code>numStages=5</code> second order stages with the       
 * coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).       
 *       
 * \par       
 * <code>pState</code> points to the state variable array.       
 * Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.       
 * The state variables are arranged in the <code>pState</code> array as:       
 * <pre>       
 *     {d11, d12, d21, d22, ...}       
 * </pre>       
 * where <code>d1x</code> refers to the state variables for the first Biquad and       
 * <code>d2x</code> refers to the state variables for the second Biquad.       
 * The state array has a total length of <code>2*numStages</code> values.       
 * The state variables are updated after each block of data is processed; the coefficients are untouched.       
 *       
 * \par       
 * The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.    
 * The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.    
 * That is why the Direct Form I structure supports Q15 and Q31 data types.    
 * The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.    
 * Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.    
 * The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.    
 *       
 * \par Instance Structure       
 * The coefficients and state variables for a filter are stored together in an instance data structure.       
 * A separate instance structure must be defined for each filter.       
 * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.       
 *       
 * \par Init Functions       
 * There is also an associated initialization function.      
 * The initialization function performs the following operations:
 * - Sets the values of the internal structure fields.
 * - Zeros out the values in the state buffer.
 * To do this manually without calling the init function, assign the following subfields of the instance structure:
 * numStages, pCoeffs, pState. Also set all of the values in pState to zero. 
 *       
 * \par       
 * Use of the initialization function is optional.       
 * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.       
 * To place an instance structure into a const data section, the instance structure must be manually initialized.       
 * Set the values in the state buffer to zeros before static initialization.       
 * For example, to statically initialize the instance structure use       
 * <pre>       
 *     arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};       
 * </pre>       
 * where <code>numStages</code> is the number of Biquad stages in the filter, <code>pState</code> is the address of the state buffer,
 * and <code>pCoeffs</code> is the address of the coefficient buffer.
 *       
 */
 /**       
 * @addtogroup BiquadCascadeDF2T       
 * @{       
 */
 /**
 * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
 * @param[in]  *S        points to an instance of the filter data structure.
 * @param[in]  *pSrc     points to the block of input data.
 * @param[out] *pDst     points to the block of output data
 * @param[in]  blockSize number of samples to process.
 * @return none.
 *
 * \par
 * Every stage consumes five consecutive coefficients from <code>S->pCoeffs</code> in the order
 * <code>{b0, b1, b2, a1, a2}</code> and two consecutive state values <code>{d1, d2}</code> from
 * <code>S->pState</code>. The first stage reads <code>pSrc</code> and writes <code>pDst</code>;
 * every later stage filters <code>pDst</code> in place. The state values of a stage are written
 * back to <code>S->pState</code> once the whole block has passed through that stage.
 *
 * \par
 * If <code>S->numStages</code> is zero the function returns without reading the coefficients
 * and without writing <code>pDst</code> or the state buffer.
 *
 * \par
 * The three target variants below evaluate the same difference equations but group the
 * additions of the <code>d1</code> update differently, so results may differ in the
 * least significant bits between targets.
 */
 LOW_OPTIMIZATION_ENTER
 void arm_biquad_cascade_df2T_f32(
 const arm_biquad_cascade_df2T_instance_f32 * S,
 float32_t * pSrc,
 float32_t * pDst,
 uint32_t blockSize)
 {
   float32_t *pIn = pSrc;                         /*  source pointer            */
   float32_t *pOut = pDst;                        /*  destination pointer       */
   float32_t *pState = S->pState;                 /*  State pointer             */
   float32_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
   float32_t acc1;                                /*  accumulator               */
   float32_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
   float32_t Xn1;                                 /*  temporary input           */
   float32_t d1, d2;                              /*  state variables           */
   uint32_t sample, stage = S->numStages;         /*  loop counters             */
 #if defined(ARM_MATH_CM7)
   float32_t Xn2, Xn3, Xn4, Xn5, Xn6, Xn7, Xn8;   /*  Input State variables     */
   float32_t Xn9, Xn10, Xn11, Xn12, Xn13, Xn14, Xn15, Xn16;
   float32_t acc2, acc3, acc4, acc5, acc6, acc7;  /*  Simulates the accumulator */
   float32_t acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15, acc16;
   /* Run the below code for Cortex-M7 */
   /* The stage counter is tested before the body so that numStages == 0 does
    ** nothing instead of wrapping the unsigned counter after a bogus first pass. */
   while(stage > 0u)
   {
      /* Reading the coefficients */
      b0 = pCoeffs[0];
      b1 = pCoeffs[1];
      b2 = pCoeffs[2];
      a1 = pCoeffs[3];
      /* Apply loop unrolling and compute 16 output values simultaneously. */
      sample = blockSize >> 4u;
      a2 = pCoeffs[4];
      /*Reading the state values */
      d1 = pState[0];
      d2 = pState[1];
      pCoeffs += 5u;
      /* First part of the processing with loop unrolling.  Compute 16 outputs at a time.
       ** a second loop below computes the remaining 0 to 15 samples.
       ** Input loads and output stores are interleaved with the arithmetic; all 16
       ** inputs are loaded before the first output is stored, which keeps the
       ** in-place case (pIn == pOut) correct. */
      while(sample > 0u) {
         /* y[n] = b0 * x[n] + d1 */
         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
         /* d2 = b2 * x[n] + a2 * y[n] */
         /* Read the first 2 inputs. 2 cycles */
         Xn1  = pIn[0 ];
         Xn2  = pIn[1 ];
         /* Sample 1. 5 cycles */
         Xn3  = pIn[2 ];
         acc1 = b0 * Xn1 + d1;
         Xn4  = pIn[3 ];
         d1 = b1 * Xn1 + d2;
         Xn5  = pIn[4 ];
         d2 = b2 * Xn1;
         Xn6  = pIn[5 ];
         d1 += a1 * acc1;
         Xn7  = pIn[6 ];
         d2 += a2 * acc1;
         /* Sample 2. 5 cycles */
         Xn8  = pIn[7 ];
         acc2 = b0 * Xn2 + d1;
         Xn9  = pIn[8 ];
         d1 = b1 * Xn2 + d2;
         Xn10 = pIn[9 ];
         d2 = b2 * Xn2;
         Xn11 = pIn[10];
         d1 += a1 * acc2;
         Xn12 = pIn[11];
         d2 += a2 * acc2;
         /* Sample 3. 5 cycles */
         Xn13 = pIn[12];
         acc3 = b0 * Xn3 + d1;
         Xn14 = pIn[13];
         d1 = b1 * Xn3 + d2;
         Xn15 = pIn[14];
         d2 = b2 * Xn3;
         Xn16 = pIn[15];
         d1 += a1 * acc3;
         pIn += 16;
         d2 += a2 * acc3;
         /* Sample 4. 5 cycles */
         acc4 = b0 * Xn4 + d1;
         d1 = b1 * Xn4 + d2;
         d2 = b2 * Xn4;
         d1 += a1 * acc4;
         d2 += a2 * acc4;
         /* Sample 5. 5 cycles */
         acc5 = b0 * Xn5 + d1;
         d1 = b1 * Xn5 + d2;
         d2 = b2 * Xn5;
         d1 += a1 * acc5;
         d2 += a2 * acc5;
         /* Sample 6. 5 cycles */
         acc6 = b0 * Xn6 + d1;
         d1 = b1 * Xn6 + d2;
         d2 = b2 * Xn6;
         d1 += a1 * acc6;
         d2 += a2 * acc6;
         /* Sample 7. 5 cycles */
         acc7 = b0 * Xn7 + d1;
         d1 = b1 * Xn7 + d2;
         d2 = b2 * Xn7;
         d1 += a1 * acc7;
         d2 += a2 * acc7;
         /* Sample 8. 5 cycles */
         acc8 = b0 * Xn8 + d1;
         d1 = b1 * Xn8 + d2;
         d2 = b2 * Xn8;
         d1 += a1 * acc8;
         d2 += a2 * acc8;
         /* Sample 9. 5 cycles */
         acc9 = b0 * Xn9 + d1;
         d1 = b1 * Xn9 + d2;
         d2 = b2 * Xn9;
         d1 += a1 * acc9;
         d2 += a2 * acc9;
         /* Sample 10. 5 cycles */
         acc10 = b0 * Xn10 + d1;
         d1 = b1 * Xn10 + d2;
         d2 = b2 * Xn10;
         d1 += a1 * acc10;
         d2 += a2 * acc10;
         /* Sample 11. 5 cycles */
         acc11 = b0 * Xn11 + d1;
         d1 = b1 * Xn11 + d2;
         d2 = b2 * Xn11;
         d1 += a1 * acc11;
         d2 += a2 * acc11;
         /* Sample 12. 5 cycles */
         acc12 = b0 * Xn12 + d1;
         d1 = b1 * Xn12 + d2;
         d2 = b2 * Xn12;
         d1 += a1 * acc12;
         d2 += a2 * acc12;
         /* Sample 13. 5 cycles (output stores start here, after the last input load) */
         acc13 = b0 * Xn13 + d1;
         d1 = b1 * Xn13 + d2;
         d2 = b2 * Xn13;
         pOut[0 ] = acc1 ;
         d1 += a1 * acc13;
         pOut[1 ] = acc2 ;
         d2 += a2 * acc13;
         /* Sample 14. 5 cycles */
         pOut[2 ] = acc3 ;
         acc14 = b0 * Xn14 + d1;
         pOut[3 ] = acc4 ;
         d1 = b1 * Xn14 + d2;
         pOut[4 ] = acc5 ;
         d2 = b2 * Xn14;
         pOut[5 ] = acc6 ;
         d1 += a1 * acc14;
         pOut[6 ] = acc7 ;
         d2 += a2 * acc14;
         /* Sample 15. 5 cycles */
         pOut[7 ] = acc8 ;
         pOut[8 ] = acc9 ;
         acc15 = b0 * Xn15 + d1;
         pOut[9 ] = acc10;
         d1 = b1 * Xn15 + d2;
         pOut[10] = acc11;
         d2 = b2 * Xn15;
         pOut[11] = acc12;
         d1 += a1 * acc15;
         pOut[12] = acc13;
         d2 += a2 * acc15;
         /* Sample 16. 5 cycles */
         pOut[13] = acc14;
         acc16 = b0 * Xn16 + d1;
         pOut[14] = acc15;
         d1 = b1 * Xn16 + d2;
         pOut[15] = acc16;
         d2 = b2 * Xn16;
         sample--;
         d1 += a1 * acc16;
         pOut += 16;
         d2 += a2 * acc16;
      }
      /* Process the blockSize % 16 samples left over by the unrolled loop, one at a time. */
      sample = blockSize & 0xFu;
      while(sample > 0u) {
         Xn1 = *pIn;
         acc1 = b0 * Xn1 + d1;
         pIn++;
         d1 = b1 * Xn1 + d2;
         *pOut = acc1;
         d2 = b2 * Xn1;
         pOut++;
         d1 += a1 * acc1;
         sample--;
         d2 += a2 * acc1;
      }
      /* Store the updated state variables back into the state array */
      pState[0] = d1;
      /* The current stage input is given as the output to the next stage */
      pIn = pDst;
      pState[1] = d2;
      /* decrement the loop counter */
      stage--;
      pState += 2u;
      /*Reset the output working pointer */
      pOut = pDst;
   }
 #elif defined(ARM_MATH_CM0_FAMILY)
   /* Run the below code for Cortex-M0 */
   /* The stage counter is tested before the body so that numStages == 0 does
    ** nothing instead of wrapping the unsigned counter after a bogus first pass. */
   while(stage > 0u)
   {
      /* Reading the coefficients */
      b0 = *pCoeffs++;
      b1 = *pCoeffs++;
      b2 = *pCoeffs++;
      a1 = *pCoeffs++;
      a2 = *pCoeffs++;
      /*Reading the state values */
      d1 = pState[0];
      d2 = pState[1];
      sample = blockSize;
      while(sample > 0u)
      {
         /* Read the input */
         Xn1 = *pIn++;
         /* y[n] = b0 * x[n] + d1 */
         acc1 = (b0 * Xn1) + d1;
         /* Store the result in the accumulator in the destination buffer. */
         *pOut++ = acc1;
         /* Every time after the output is computed state should be updated. */
         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
         d1 = ((b1 * Xn1) + (a1 * acc1)) + d2;
         /* d2 = b2 * x[n] + a2 * y[n] */
         d2 = (b2 * Xn1) + (a2 * acc1);
         /* decrement the loop counter */
         sample--;
      }
      /* Store the updated state variables back into the state array */
      *pState++ = d1;
      *pState++ = d2;
      /* The current stage input is given as the output to the next stage */
      pIn = pDst;
      /*Reset the output working pointer */
      pOut = pDst;
      /* decrement the loop counter */
      stage--;
   }
 #else
   float32_t Xn2, Xn3, Xn4;                       /*  Input State variables     */
   float32_t acc2, acc3, acc4;                    /*  accumulator               */
   float32_t p0, p1, p2, p3, p4, A1;              /*  partial products and sum  */
   /* Run the below code for Cortex-M4 and Cortex-M3 */
   /* The stage counter is tested before the body so that numStages == 0 does
    ** nothing instead of wrapping the unsigned counter after a bogus first pass. */
   while(stage > 0u)
   {
      /* Reading the coefficients */
      b0 = *pCoeffs++;
      b1 = *pCoeffs++;
      b2 = *pCoeffs++;
      a1 = *pCoeffs++;
      a2 = *pCoeffs++;
      /*Reading the state values */
      d1 = pState[0];
      d2 = pState[1];
      /* Apply loop unrolling and compute 4 output values simultaneously. */
      sample = blockSize >> 2u;
      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
       ** a second loop below computes the remaining 0 to 3 samples.
       ** All four inputs are loaded before any output is stored, which keeps the
       ** in-place case (pIn == pOut) correct. */
      while(sample > 0u) {
         /* y[n] = b0 * x[n] + d1 */
         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
         /* d2 = b2 * x[n] + a2 * y[n] */
         /* Read the four inputs */
         Xn1 = pIn[0];
         Xn2 = pIn[1];
         Xn3 = pIn[2];
         Xn4 = pIn[3];
         pIn += 4;
         /* Sample 1 (the b0 product of the next sample is issued early each time) */
         p0 = b0 * Xn1;
         p1 = b1 * Xn1;
         acc1 = p0 + d1;
         p0 = b0 * Xn2;
         p3 = a1 * acc1;
         p2 = b2 * Xn1;
         A1 = p1 + p3;
         p4 = a2 * acc1;
         d1 = A1 + d2;
         d2 = p2 + p4;
         /* Sample 2 */
         p1 = b1 * Xn2;
         acc2 = p0 + d1;
         p0 = b0 * Xn3;
         p3 = a1 * acc2;
         p2 = b2 * Xn2;
         A1 = p1 + p3;
         p4 = a2 * acc2;
         d1 = A1 + d2;
         d2 = p2 + p4;
         /* Sample 3 */
         p1 = b1 * Xn3;
         acc3 = p0 + d1;
         p0 = b0 * Xn4;
         p3 = a1 * acc3;
         p2 = b2 * Xn3;
         A1 = p1 + p3;
         p4 = a2 * acc3;
         d1 = A1 + d2;
         d2 = p2 + p4;
         /* Sample 4 */
         acc4 = p0 + d1;
         p1 = b1 * Xn4;
         p3 = a1 * acc4;
         p2 = b2 * Xn4;
         A1 = p1 + p3;
         p4 = a2 * acc4;
         d1 = A1 + d2;
         d2 = p2 + p4;
         /* Store the four outputs */
         pOut[0] = acc1;
         pOut[1] = acc2;
         pOut[2] = acc3;
         pOut[3] = acc4;
         pOut += 4;
         sample--;
      }
      /* Process the blockSize % 4 samples left over by the unrolled loop, one at a time. */
      sample = blockSize & 0x3u;
      while(sample > 0u) {
         Xn1 = *pIn++;
         p0 = b0 * Xn1;
         p1 = b1 * Xn1;
         acc1 = p0 + d1;
         p3 = a1 * acc1;
         p2 = b2 * Xn1;
         A1 = p1 + p3;
         p4 = a2 * acc1;
         d1 = A1 + d2;
         d2 = p2 + p4;
         *pOut++ = acc1;
         sample--;
      }
      /* Store the updated state variables back into the state array */
      *pState++ = d1;
      *pState++ = d2;
      /* The current stage input is given as the output to the next stage */
      pIn = pDst;
      /*Reset the output working pointer */
      pOut = pDst;
      /* decrement the loop counter */
      stage--;
   }
 #endif 
 }
 LOW_OPTIMIZATION_EXIT
 /**       
   * @} end of BiquadCascadeDF2T group       
   */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_f64.c
@@ -0,0 +1,603 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        31. July 2014 
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_biquad_cascade_df2T_f64.c    
 *    
 * Description:  Processing function for the floating-point transposed    
 *               direct form II Biquad cascade filter.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**       
 * @ingroup groupFilters       
 */
 /**       
 * @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure       
 *       
 * This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.       
 * The filters are implemented as a cascade of second order Biquad sections.       
 * These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.      
 * Only floating-point data is supported.       
 *       
 * These functions operate on blocks of input and output data and each call to the function
 * processes <code>blockSize</code> samples through the filter.       
 * <code>pSrc</code> points to the array of input data and       
 * <code>pDst</code> points to the array of output data.       
 * Both arrays contain <code>blockSize</code> values.       
 *       
 * \par Algorithm       
 * Each Biquad stage implements a second order filter using the difference equation:       
 * <pre>       
 *    y[n] = b0 * x[n] + d1       
 *    d1 = b1 * x[n] + a1 * y[n] + d2       
 *    d2 = b2 * x[n] + a2 * y[n]       
 * </pre>       
 * where d1 and d2 represent the two state values.       
 *       
 * \par       
 * A Biquad filter using a transposed Direct Form II structure is shown below.       
 * \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"       
 * Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.       
 * Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.       
 * Pay careful attention to the sign of the feedback coefficients.       
 * Some design tools flip the sign of the feedback coefficients:       
 * <pre>       
 *    y[n] = b0 * x[n] + d1;       
 *    d1 = b1 * x[n] - a1 * y[n] + d2;       
 *    d2 = b2 * x[n] - a2 * y[n];       
 * </pre>       
 * In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.       
 *       
 * \par       
 * Higher order filters are realized as a cascade of second order sections.       
 * <code>numStages</code> refers to the number of second order stages used.       
 * For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.       
 * A 9th order filter would be realized with <code>numStages=5</code> second order stages with the       
 * coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).       
 *       
 * \par       
 * <code>pState</code> points to the state variable array.       
 * Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.       
 * The state variables are arranged in the <code>pState</code> array as:       
 * <pre>       
 *     {d11, d12, d21, d22, ...}       
 * </pre>       
 * where <code>d1x</code> refers to the state variables for the first Biquad and       
 * <code>d2x</code> refers to the state variables for the second Biquad.       
 * The state array has a total length of <code>2*numStages</code> values.       
 * The state variables are updated after each block of data is processed; the coefficients are untouched.       
 *       
 * \par       
 * The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.    
 * The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.    
 * That is why the Direct Form I structure supports Q15 and Q31 data types.    
 * The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.    
 * Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.    
 * The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.    
 *       
 * \par Instance Structure       
 * The coefficients and state variables for a filter are stored together in an instance data structure.       
 * A separate instance structure must be defined for each filter.       
 * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.       
 *       
 * \par Init Functions       
 * There is also an associated initialization function.      
 * The initialization function performs the following operations:
 * - Sets the values of the internal structure fields.
 * - Zeros out the values in the state buffer.
 * To do this manually without calling the init function, assign the following subfields of the instance structure:
 * numStages, pCoeffs, pState. Also set all of the values in pState to zero. 
 *       
 * \par       
 * Use of the initialization function is optional.       
 * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.       
 * To place an instance structure into a const data section, the instance structure must be manually initialized.       
 * Set the values in the state buffer to zeros before static initialization.       
 * For example, to statically initialize the instance structure use       
 * <pre>       
 *     arm_biquad_cascade_df2T_instance_f64 S1 = {numStages, pState, pCoeffs};       
 * </pre>       
 * where <code>numStages</code> is the number of Biquad stages in the filter, <code>pState</code> is the address of the state buffer,
 * and <code>pCoeffs</code> is the address of the coefficient buffer.
 *       
 */
 /**       
 * @addtogroup BiquadCascadeDF2T       
 * @{       
 */
 /**      
 * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.      
 * @param[in]  *S        points to an instance of the filter data structure.      
 * @param[in]  *pSrc     points to the block of input data.      
 * @param[out] *pDst     points to the block of output data      
 * @param[in]  blockSize number of samples to process.      
 * @return none.      
 */
 LOW_OPTIMIZATION_ENTER
 void arm_biquad_cascade_df2T_f64(
 const arm_biquad_cascade_df2T_instance_f64 * S,
 float64_t * pSrc,
 float64_t * pDst,
 uint32_t blockSize)
 {
   float64_t *pIn = pSrc;                         /*  source pointer            */
   float64_t *pOut = pDst;                        /*  destination pointer       */
   float64_t *pState = S->pState;                 /*  State pointer             */
   float64_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
   float64_t acc1;                                /*  accumulator               */
   float64_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
   float64_t Xn1;                                 /*  temporary input           */
   float64_t d1, d2;                              /*  state variables           */
   uint32_t sample, stage = S->numStages;         /*  loop counters             */
 #if defined(ARM_MATH_CM7)
   float64_t Xn2, Xn3, Xn4, Xn5, Xn6, Xn7, Xn8;   /*  Input State variables     */
   float64_t Xn9, Xn10, Xn11, Xn12, Xn13, Xn14, Xn15, Xn16;
   float64_t acc2, acc3, acc4, acc5, acc6, acc7;  /*  Simulates the accumulator */
   float64_t acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15, acc16;
   do
   {
      /* Reading the coefficients */ 
      b0 = pCoeffs[0]; 
      b1 = pCoeffs[1]; 
      b2 = pCoeffs[2]; 
      a1 = pCoeffs[3]; 
      /* Apply loop unrolling and compute 16 output values simultaneously. */ 
      sample = blockSize >> 4u; 
      a2 = pCoeffs[4]; 
      /*Reading the state values */ 
      d1 = pState[0]; 
      d2 = pState[1]; 
      pCoeffs += 5u;
      /* First part of the processing with loop unrolling.  Compute 16 outputs at a time.       
       ** a second loop below computes the remaining 1 to 15 samples. */
      while(sample > 0u) {
         /* y[n] = b0 * x[n] + d1 */
         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
         /* d2 = b2 * x[n] + a2 * y[n] */
         /* Read the first 2 inputs. 2 cycles */
         Xn1  = pIn[0 ];
         Xn2  = pIn[1 ];
         /* Sample 1. 5 cycles */
         Xn3  = pIn[2 ];
         acc1 = b0 * Xn1 + d1;
         Xn4  = pIn[3 ];
         d1 = b1 * Xn1 + d2;
         Xn5  = pIn[4 ];
         d2 = b2 * Xn1;
         Xn6  = pIn[5 ];
         d1 += a1 * acc1;
         Xn7  = pIn[6 ];
         d2 += a2 * acc1;
         /* Sample 2. 5 cycles */
         Xn8  = pIn[7 ];
         acc2 = b0 * Xn2 + d1;
         Xn9  = pIn[8 ];
         d1 = b1 * Xn2 + d2;
         Xn10 = pIn[9 ];
         d2 = b2 * Xn2;
         Xn11 = pIn[10];
         d1 += a1 * acc2;
         Xn12 = pIn[11];
         d2 += a2 * acc2;
         /* Sample 3. 5 cycles */
         Xn13 = pIn[12];
         acc3 = b0 * Xn3 + d1;
         Xn14 = pIn[13];
         d1 = b1 * Xn3 + d2;
         Xn15 = pIn[14];
         d2 = b2 * Xn3;
         Xn16 = pIn[15];
         d1 += a1 * acc3;
         pIn += 16;
         d2 += a2 * acc3;
         /* Sample 4. 5 cycles */
         acc4 = b0 * Xn4 + d1;
         d1 = b1 * Xn4 + d2;
         d2 = b2 * Xn4;
         d1 += a1 * acc4;
         d2 += a2 * acc4;
         /* Sample 5. 5 cycles */
         acc5 = b0 * Xn5 + d1;
         d1 = b1 * Xn5 + d2;
         d2 = b2 * Xn5;
         d1 += a1 * acc5;
         d2 += a2 * acc5;
         /* Sample 6. 5 cycles */
         acc6 = b0 * Xn6 + d1;
         d1 = b1 * Xn6 + d2;
         d2 = b2 * Xn6;
         d1 += a1 * acc6;
         d2 += a2 * acc6;
         /* Sample 7. 5 cycles */
         acc7 = b0 * Xn7 + d1;
         d1 = b1 * Xn7 + d2;
         d2 = b2 * Xn7;
         d1 += a1 * acc7;
         d2 += a2 * acc7;
         /* Sample 8. 5 cycles */
         acc8 = b0 * Xn8 + d1;
         d1 = b1 * Xn8 + d2;
         d2 = b2 * Xn8;
         d1 += a1 * acc8;
         d2 += a2 * acc8;
         /* Sample 9. 5 cycles */
         acc9 = b0 * Xn9 + d1;
         d1 = b1 * Xn9 + d2;
         d2 = b2 * Xn9;
         d1 += a1 * acc9;
         d2 += a2 * acc9;
         /* Sample 10. 5 cycles */
         acc10 = b0 * Xn10 + d1;
         d1 = b1 * Xn10 + d2;
         d2 = b2 * Xn10;
         d1 += a1 * acc10;
         d2 += a2 * acc10;
         /* Sample 11. 5 cycles */
         acc11 = b0 * Xn11 + d1;
         d1 = b1 * Xn11 + d2;
         d2 = b2 * Xn11;
         d1 += a1 * acc11;
         d2 += a2 * acc11;
         /* Sample 12. 5 cycles */
         acc12 = b0 * Xn12 + d1;
         d1 = b1 * Xn12 + d2;
         d2 = b2 * Xn12;
         d1 += a1 * acc12;
         d2 += a2 * acc12;
         /* Sample 13. 5 cycles */
         acc13 = b0 * Xn13 + d1;         
         d1 = b1 * Xn13 + d2;         
         d2 = b2 * Xn13;
         pOut[0 ] = acc1 ;
         d1 += a1 * acc13;
         pOut[1 ] = acc2 ;	
         d2 += a2 * acc13;
         /* Sample 14. 5 cycles */
         pOut[2 ] = acc3 ;	
         acc14 = b0 * Xn14 + d1;
         pOut[3 ] = acc4 ;
         d1 = b1 * Xn14 + d2;
         pOut[4 ] = acc5 ; 
         d2 = b2 * Xn14;
         pOut[5 ] = acc6 ;	  
         d1 += a1 * acc14;
         pOut[6 ] = acc7 ;	
         d2 += a2 * acc14;
         /* Sample 15. 5 cycles */
         pOut[7 ] = acc8 ;
         pOut[8 ] = acc9 ;  
         acc15 = b0 * Xn15 + d1;
         pOut[9 ] = acc10;	
         d1 = b1 * Xn15 + d2;
         pOut[10] = acc11;	
         d2 = b2 * Xn15;
         pOut[11] = acc12;
         d1 += a1 * acc15;
         pOut[12] = acc13;
         d2 += a2 * acc15;
         /* Sample 16. 5 cycles */
         pOut[13] = acc14;	
         acc16 = b0 * Xn16 + d1;
         pOut[14] = acc15;	
         d1 = b1 * Xn16 + d2;
         pOut[15] = acc16;
         d2 = b2 * Xn16;
         sample--;	 
         d1 += a1 * acc16;
         pOut += 16;
         d2 += a2 * acc16;
      }
      sample = blockSize & 0xFu;
      while(sample > 0u) {
         Xn1 = *pIn;         
         acc1 = b0 * Xn1 + d1;
         pIn++;
         d1 = b1 * Xn1 + d2;
         *pOut = acc1; 
         d2 = b2 * Xn1;
         pOut++;
         d1 += a1 * acc1;
         sample--;	
         d2 += a2 * acc1; 
      }
      /* Store the updated state variables back into the state array */ 
      pState[0] = d1; 
      /* The current stage input is given as the output to the next stage */ 
      pIn = pDst; 
      pState[1] = d2; 
      /* decrement the loop counter */ 
      stage--; 
      pState += 2u;
      /*Reset the output working pointer */ 
      pOut = pDst; 
   } while(stage > 0u);
 #elif defined(ARM_MATH_CM0_FAMILY)
   /* Run the below code for Cortex-M0 */
   do
   {
      /* Reading the coefficients */
      b0 = *pCoeffs++;
      b1 = *pCoeffs++;
      b2 = *pCoeffs++;
      a1 = *pCoeffs++;
      a2 = *pCoeffs++;
      /*Reading the state values */
      d1 = pState[0];
      d2 = pState[1];
      sample = blockSize;
      while(sample > 0u)
      {
         /* Read the input */
         Xn1 = *pIn++;
         /* y[n] = b0 * x[n] + d1 */
         acc1 = (b0 * Xn1) + d1;
         /* Store the result in the accumulator in the destination buffer. */
         *pOut++ = acc1;
         /* Every time after the output is computed state should be updated. */
         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
         d1 = ((b1 * Xn1) + (a1 * acc1)) + d2;
         /* d2 = b2 * x[n] + a2 * y[n] */
         d2 = (b2 * Xn1) + (a2 * acc1);
         /* decrement the loop counter */
         sample--;
      }
      /* Store the updated state variables back into the state array */
      *pState++ = d1;
      *pState++ = d2;
      /* The current stage input is given as the output to the next stage */
      pIn = pDst;
      /*Reset the output working pointer */
      pOut = pDst;
      /* decrement the loop counter */
      stage--;
   } while(stage > 0u);
 #else
   float64_t Xn2, Xn3, Xn4;                  	  /*  Input State variables     */
   float64_t acc2, acc3, acc4;              		  /*  accumulator               */
   float64_t p0, p1, p2, p3, p4, A1;
   /* Run the below code for Cortex-M4 and Cortex-M3 */
   do
   {
      /* Reading the coefficients */     
      b0 = *pCoeffs++;
      b1 = *pCoeffs++;
      b2 = *pCoeffs++;
      a1 = *pCoeffs++;
      a2 = *pCoeffs++;
      /*Reading the state values */
      d1 = pState[0];
      d2 = pState[1];
      /* Apply loop unrolling and compute 4 output values simultaneously. */
      sample = blockSize >> 2u;
      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.       
   ** a second loop below computes the remaining 1 to 3 samples. */
      while(sample > 0u) {
         /* y[n] = b0 * x[n] + d1 */
         /* d1 = b1 * x[n] + a1 * y[n] + d2 */
         /* d2 = b2 * x[n] + a2 * y[n] */
         /* Read the four inputs */
         Xn1 = pIn[0];
         Xn2 = pIn[1];
         Xn3 = pIn[2];
         Xn4 = pIn[3];
         pIn += 4;     
         p0 = b0 * Xn1; 
         p1 = b1 * Xn1;
         acc1 = p0 + d1;
         p0 = b0 * Xn2; 
         p3 = a1 * acc1;
         p2 = b2 * Xn1;
         A1 = p1 + p3;
         p4 = a2 * acc1;
         d1 = A1 + d2;
         d2 = p2 + p4;
         p1 = b1 * Xn2;
         acc2 = p0 + d1;
         p0 = b0 * Xn3;	 
         p3 = a1 * acc2; 
         p2 = b2 * Xn2;                                 
         A1 = p1 + p3;
         p4 = a2 * acc2;
         d1 = A1 + d2;
         d2 = p2 + p4;
         p1 = b1 * Xn3;
         acc3 = p0 + d1;
         p0 = b0 * Xn4;	
         p3 = a1 * acc3;
         p2 = b2 * Xn3;
         A1 = p1 + p3;
         p4 = a2 * acc3;
         d1 = A1 + d2;
         d2 = p2 + p4;
         acc4 = p0 + d1;
         p1 = b1 * Xn4;
         p3 = a1 * acc4;
         p2 = b2 * Xn4;
         A1 = p1 + p3;
         p4 = a2 * acc4;
         d1 = A1 + d2;
         d2 = p2 + p4;
         pOut[0] = acc1;	
         pOut[1] = acc2;	
         pOut[2] = acc3;	
         pOut[3] = acc4;
 				 pOut += 4;
         sample--;	       
      }
      sample = blockSize & 0x3u;
      while(sample > 0u) {
         Xn1 = *pIn++;
         p0 = b0 * Xn1; 
         p1 = b1 * Xn1;
         acc1 = p0 + d1;
         p3 = a1 * acc1;
         p2 = b2 * Xn1;
         A1 = p1 + p3;
         p4 = a2 * acc1;
         d1 = A1 + d2;
         d2 = p2 + p4;
         *pOut++ = acc1;
         sample--;	       
      }
      /* Store the updated state variables back into the state array */
      *pState++ = d1;
      *pState++ = d2;
      /* The current stage input is given as the output to the next stage */
      pIn = pDst;
      /*Reset the output working pointer */
      pOut = pDst;
      /* decrement the loop counter */
      stage--;
   } while(stage > 0u);
 #endif 
 }
 LOW_OPTIMIZATION_EXIT
 /**       
   * @} end of BiquadCascadeDF2T group       
   */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f32.c
@@ -0,0 +1,102 @@
 /*-----------------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:        arm_biquad_cascade_df2T_init_f32.c    
 *    
 * Description:  Initialization function for the floating-point transposed   
 *               direct form II Biquad cascade filter.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * ---------------------------------------------------------------------------*/
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup BiquadCascadeDF2T    
 * @{    
 */
 /**   
 * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.   
 * @param[in,out] *S           points to an instance of the filter data structure.   
 * @param[in]     numStages    number of 2nd order stages in the filter.   
 * @param[in]     *pCoeffs     points to the filter coefficients.   
 * @param[in]     *pState      points to the state buffer.   
 * @return        none   
 *    
 * <b>Coefficient and State Ordering:</b>    
 * \par    
 * The coefficients are stored in the array <code>pCoeffs</code> in the following order:    
 * <pre>    
 *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}    
 * </pre>    
 *    
 * \par    
 * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,    
 * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,    
 * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.    
 *    
 * \par    
 * The <code>pState</code> is a pointer to state array.    
 * Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.    
 * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.    
 * The state array has a total length of <code>2*numStages</code> values.    
 * The state variables are updated after each block of data is processed; the coefficients are untouched.    
 */
 void arm_biquad_cascade_df2T_init_f32(
  arm_biquad_cascade_df2T_instance_f32 * S,
  uint8_t numStages,
  float32_t * pCoeffs,
  float32_t * pState)
 {
  /* Each Biquad stage keeps two state values, so the buffer holds 2 * numStages entries */
  const uint32_t stateLen = 2u * (uint32_t) numStages;
  /* Record the stage count and the coefficient array in the instance */
  S->numStages = numStages;
  S->pCoeffs = pCoeffs;
  /* Zero the whole state buffer, then attach it to the instance */
  memset(pState, 0, stateLen * sizeof(*pState));
  S->pState = pState;
 }
 /**    
 * @} end of BiquadCascadeDF2T group    
 */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_init_f64.c
@@ -0,0 +1,102 @@
 /*-----------------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:        arm_biquad_cascade_df2T_init_f64.c    
 *    
 * Description:  Initialization function for the floating-point transposed   
 *               direct form II Biquad cascade filter.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * ---------------------------------------------------------------------------*/
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup BiquadCascadeDF2T    
 * @{    
 */
 /**   
 * @brief  Initialization function for the 64-bit floating-point transposed direct form II Biquad cascade filter.   
 * @param[in,out] *S           points to an instance of the filter data structure.   
 * @param[in]     numStages    number of 2nd order stages in the filter.   
 * @param[in]     *pCoeffs     points to the filter coefficients.   
 * @param[in]     *pState      points to the state buffer.   
 * @return        none   
 *    
 * <b>Coefficient and State Ordering:</b>    
 * \par    
 * The coefficients are stored in the array <code>pCoeffs</code> in the following order:    
 * <pre>    
 *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}    
 * </pre>    
 *    
 * \par    
 * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,    
 * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,    
 * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.    
 *    
 * \par    
 * The <code>pState</code> is a pointer to state array.    
 * Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.    
 * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.    
 * The state array has a total length of <code>2*numStages</code> values.    
 * The state variables are updated after each block of data is processed; the coefficients are untouched.    
 */
 void arm_biquad_cascade_df2T_init_f64(
  arm_biquad_cascade_df2T_instance_f64 * S,
  uint8_t numStages,
  float64_t * pCoeffs,
  float64_t * pState)
 {
  /* Two state values per Biquad stage give 2 * numStages buffer entries */
  const uint32_t stateLen = 2u * (uint32_t) numStages;
  /* Store the number of stages and the coefficient pointer in the instance */
  S->numStages = numStages;
  S->pCoeffs = pCoeffs;
  /* Clear every state entry before handing the buffer to the instance */
  memset(pState, 0, stateLen * sizeof(*pState));
  S->pState = pState;
 }
 /**    
 * @} end of BiquadCascadeDF2T group    
 */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
@@ -0,0 +1,683 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        31. July 2014 
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:	    arm_biquad_cascade_stereo_df2T_f32.c    
 *    
 * Description:  Processing function for the floating-point transposed    
 *               direct form II Biquad cascade filter. 2 channels  
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**       
 * @ingroup groupFilters       
 */
 /**       
 * @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure       
 *       
 * This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.       
 * The filters are implemented as a cascade of second order Biquad sections.       
 * These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.      
 * Only floating-point data is supported.       
 *       
 * These functions operate on blocks of input and output data, and each call to a function       
 * processes <code>blockSize</code> samples through the filter.       
 * <code>pSrc</code> points to the array of input data and       
 * <code>pDst</code> points to the array of output data.       
 * Both arrays contain <code>blockSize</code> values.       
 *       
 * \par Algorithm       
 * Each Biquad stage implements a second order filter using the difference equation:       
 * <pre>       
 *    y[n] = b0 * x[n] + d1       
 *    d1 = b1 * x[n] + a1 * y[n] + d2       
 *    d2 = b2 * x[n] + a2 * y[n]       
 * </pre>       
 * where d1 and d2 represent the two state values.       
 *       
 * \par       
 * A Biquad filter using a transposed Direct Form II structure is shown below.       
 * \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"       
 * Coefficients <code>b0, b1, and b2 </code> multiply the input signal <code>x[n]</code> and are referred to as the feedforward coefficients.       
 * Coefficients <code>a1</code> and <code>a2</code> multiply the output signal <code>y[n]</code> and are referred to as the feedback coefficients.       
 * Pay careful attention to the sign of the feedback coefficients.       
 * Some design tools flip the sign of the feedback coefficients:       
 * <pre>       
 *    y[n] = b0 * x[n] + d1;       
 *    d1 = b1 * x[n] - a1 * y[n] + d2;       
 *    d2 = b2 * x[n] - a2 * y[n];       
 * </pre>       
 * In this case the feedback coefficients <code>a1</code> and <code>a2</code> must be negated when used with the CMSIS DSP Library.       
 *       
 * \par       
 * Higher order filters are realized as a cascade of second order sections.       
 * <code>numStages</code> refers to the number of second order stages used.       
 * For example, an 8th order filter would be realized with <code>numStages=4</code> second order stages.       
 * A 9th order filter would be realized with <code>numStages=5</code> second order stages with the       
 * coefficients for one of the stages configured as a first order filter (<code>b2=0</code> and <code>a2=0</code>).       
 *       
 * \par       
 * <code>pState</code> points to the state variable array.       
 * Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code>.       
 * The state variables are arranged in the <code>pState</code> array as:       
 * <pre>       
 *     {d11, d12, d21, d22, ...}       
 * </pre>       
 * where <code>d1x</code> refers to the state variables for the first Biquad and       
 * <code>d2x</code> refers to the state variables for the second Biquad.       
 * The state array has a total length of <code>2*numStages</code> values.       
 * The state variables are updated after each block of data is processed; the coefficients are untouched.       
 *       
 * \par       
 * The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.    
 * The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.    
 * That is why the Direct Form I structure supports Q15 and Q31 data types.    
 * The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables <code>d1</code> and <code>d2</code>.    
 * Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.    
 * The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.    
 *       
 * \par Instance Structure       
 * The coefficients and state variables for a filter are stored together in an instance data structure.       
 * A separate instance structure must be defined for each filter.       
 * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.       
 *       
 * \par Init Functions       
 * There is also an associated initialization function.      
 * The initialization function performs the following operations:       
 * - Sets the values of the internal structure fields.       
 * - Zeros out the values in the state buffer.       
 * To do this manually without calling the init function, assign the following subfields of the instance structure:
 * numStages, pCoeffs, pState. Also set all of the values in pState to zero. 
 *       
 * \par       
 * Use of the initialization function is optional.       
 * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.       
 * To place an instance structure into a const data section, the instance structure must be manually initialized.       
 * Set the values in the state buffer to zeros before static initialization.       
 * For example, to statically initialize the instance structure use       
 * <pre>       
 *     arm_biquad_cascade_df2T_instance_f32 S1 = {numStages, pState, pCoeffs};       
 * </pre>       
 * where <code>numStages</code> is the number of Biquad stages in the filter, <code>pState</code> is the address of the state buffer,       
 * and <code>pCoeffs</code> is the address of the coefficient buffer.        
 *       
 */
 /**       
 * @addtogroup BiquadCascadeDF2T       
 * @{       
 */
 /**      
 * @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.      
 * @param[in]  *S        points to an instance of the filter data structure.      
 * @param[in]  *pSrc     points to the block of input data.      
 * @param[out] *pDst     points to the block of output data      
 * @param[in]  blockSize number of samples to process.      
 * @return none.      
 */
 LOW_OPTIMIZATION_ENTER
 void arm_biquad_cascade_stereo_df2T_f32(
 const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
 float32_t * pSrc,
 float32_t * pDst,
 uint32_t blockSize)
 {
    float32_t *pIn = pSrc;                         /*  source pointer            */
    float32_t *pOut = pDst;                        /*  destination pointer       */
    float32_t *pState = S->pState;                 /*  State pointer             */
    float32_t *pCoeffs = S->pCoeffs;               /*  coefficient pointer       */
    float32_t acc1a, acc1b;                        /*  accumulator               */
    float32_t b0, b1, b2, a1, a2;                  /*  Filter coefficients       */
    float32_t Xn1a, Xn1b;                          /*  temporary input           */
    float32_t d1a, d2a, d1b, d2b;                  /*  state variables           */
    uint32_t sample, stage = S->numStages;         /*  loop counters             */
 #if defined(ARM_MATH_CM7)
    float32_t Xn2a, Xn3a, Xn4a, Xn5a, Xn6a, Xn7a, Xn8a;         /*  Input State variables     */
    float32_t Xn2b, Xn3b, Xn4b, Xn5b, Xn6b, Xn7b, Xn8b;         /*  Input State variables     */
    float32_t acc2a, acc3a, acc4a, acc5a, acc6a, acc7a, acc8a;  /*  Simulates the accumulator */
    float32_t acc2b, acc3b, acc4b, acc5b, acc6b, acc7b, acc8b;  /*  Simulates the accumulator */
    do
    {
        /* Reading the coefficients */ 
        b0 = pCoeffs[0]; 
        b1 = pCoeffs[1]; 
        b2 = pCoeffs[2]; 
        a1 = pCoeffs[3]; 
        /* Apply loop unrolling and compute 8 output values simultaneously. */ 
        sample = blockSize >> 3u; 
        a2 = pCoeffs[4]; 
        /*Reading the state values */ 
        d1a = pState[0]; 
        d2a = pState[1]; 
        d1b = pState[2]; 
        d2b = pState[3]; 
        pCoeffs += 5u;
        /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.       
        ** a second loop below computes the remaining 1 to 7 samples. */
        while(sample > 0u) {
            /* y[n] = b0 * x[n] + d1 */
            /* d1 = b1 * x[n] + a1 * y[n] + d2 */
            /* d2 = b2 * x[n] + a2 * y[n] */
            /* Read the first 2 inputs. 2 cycles */
            Xn1a  = pIn[0 ];
            Xn1b  = pIn[1 ];
            /* Sample 1. 5 cycles */
            Xn2a  = pIn[2 ];
            acc1a = b0 * Xn1a + d1a;
            Xn2b  = pIn[3 ];
            d1a = b1 * Xn1a + d2a;
            Xn3a  = pIn[4 ];
            d2a = b2 * Xn1a;
            Xn3b  = pIn[5 ];
            d1a += a1 * acc1a;
            Xn4a  = pIn[6 ];
            d2a += a2 * acc1a;
            /* Sample 2. 5 cycles */
            Xn4b  = pIn[7 ];
            acc1b = b0 * Xn1b + d1b;
            Xn5a  = pIn[8 ];
            d1b = b1 * Xn1b + d2b;
            Xn5b = pIn[9 ];
            d2b = b2 * Xn1b;
            Xn6a = pIn[10];
            d1b += a1 * acc1b;
            Xn6b = pIn[11];
            d2b += a2 * acc1b;
            /* Sample 3. 5 cycles */
            Xn7a = pIn[12];
            acc2a = b0 * Xn2a + d1a;
            Xn7b = pIn[13];
            d1a = b1 * Xn2a + d2a;
            Xn8a = pIn[14];
            d2a = b2 * Xn2a;
            Xn8b = pIn[15];
            d1a += a1 * acc2a;
            pIn += 16;
            d2a += a2 * acc2a;
            /* Sample 4. 5 cycles */
            acc2b = b0 * Xn2b + d1b;
            d1b = b1 * Xn2b + d2b;
            d2b = b2 * Xn2b;
            d1b += a1 * acc2b;
            d2b += a2 * acc2b;
            /* Sample 5. 5 cycles */
            acc3a = b0 * Xn3a + d1a;
            d1a = b1 * Xn3a + d2a;
            d2a = b2 * Xn3a;
            d1a += a1 * acc3a;
            d2a += a2 * acc3a;
            /* Sample 6. 5 cycles */
            acc3b = b0 * Xn3b + d1b;
            d1b = b1 * Xn3b + d2b;
            d2b = b2 * Xn3b;
            d1b += a1 * acc3b;
            d2b += a2 * acc3b;
            /* Sample 7. 5 cycles */
            acc4a = b0 * Xn4a + d1a;
            d1a = b1 * Xn4a + d2a;
            d2a = b2 * Xn4a;
            d1a += a1 * acc4a;
            d2a += a2 * acc4a;
            /* Sample 8. 5 cycles */
            acc4b = b0 * Xn4b + d1b;
            d1b = b1 * Xn4b + d2b;
            d2b = b2 * Xn4b;
            d1b += a1 * acc4b;
            d2b += a2 * acc4b;
            /* Sample 9. 5 cycles */
            acc5a = b0 * Xn5a + d1a;
            d1a = b1 * Xn5a + d2a;
            d2a = b2 * Xn5a;
            d1a += a1 * acc5a;
            d2a += a2 * acc5a;
            /* Sample 10. 5 cycles */
            acc5b = b0 * Xn5b + d1b;
            d1b = b1 * Xn5b + d2b;
            d2b = b2 * Xn5b;
            d1b += a1 * acc5b;
            d2b += a2 * acc5b;
            /* Sample 11. 5 cycles */
            acc6a = b0 * Xn6a + d1a;
            d1a = b1 * Xn6a + d2a;
            d2a = b2 * Xn6a;
            d1a += a1 * acc6a;
            d2a += a2 * acc6a;
            /* Sample 12. 5 cycles */
            acc6b = b0 * Xn6b + d1b;
            d1b = b1 * Xn6b + d2b;
            d2b = b2 * Xn6b;
            d1b += a1 * acc6b;
            d2b += a2 * acc6b;
            /* Sample 13. 5 cycles */
            acc7a = b0 * Xn7a + d1a;         
            d1a = b1 * Xn7a + d2a;   
            pOut[0 ] = acc1a ;      
            d2a = b2 * Xn7a;
            pOut[1 ] = acc1b ;	
            d1a += a1 * acc7a;
            pOut[2 ] = acc2a ;	
            d2a += a2 * acc7a;
            /* Sample 14. 5 cycles */
            pOut[3 ] = acc2b ;
            acc7b = b0 * Xn7b + d1b;
            pOut[4 ] = acc3a ; 
            d1b = b1 * Xn7b + d2b;
            pOut[5 ] = acc3b ;	
            d2b = b2 * Xn7b;
            pOut[6 ] = acc4a ;	  
            d1b += a1 * acc7b;
            pOut[7 ] = acc4b ;
            d2b += a2 * acc7b;
            /* Sample 15. 5 cycles */
            pOut[8 ] = acc5a ;  
            acc8a = b0 * Xn8a + d1a;
            pOut[9 ] = acc5b;	
            d1a = b1 * Xn8a + d2a;
            pOut[10] = acc6a;	
            d2a = b2 * Xn8a;
            pOut[11] = acc6b;
            d1a += a1 * acc8a;
            pOut[12] = acc7a;
            d2a += a2 * acc8a;
            /* Sample 16. 5 cycles */
            pOut[13] = acc7b;	
            acc8b = b0 * Xn8b + d1b;
            pOut[14] = acc8a;	
            d1b = b1 * Xn8b + d2b;
            pOut[15] = acc8b;
            d2b = b2 * Xn8b;
            sample--;	 
            d1b += a1 * acc8b;
            pOut += 16;
            d2b += a2 * acc8b;
        }
        sample = blockSize & 0x7u;
        while(sample > 0u) {
            /* Read the input */
            Xn1a = *pIn++; //Channel a
            Xn1b = *pIn++; //Channel b
            /* y[n] = b0 * x[n] + d1 */
            acc1a = (b0 * Xn1a) + d1a;
            acc1b = (b0 * Xn1b) + d1b;
            /* Store the result in the accumulator in the destination buffer. */
            *pOut++ = acc1a;
            *pOut++ = acc1b;
            /* Every time after the output is computed state should be updated. */
            /* d1 = b1 * x[n] + a1 * y[n] + d2 */
            d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
            d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
            /* d2 = b2 * x[n] + a2 * y[n] */
            d2a = (b2 * Xn1a) + (a2 * acc1a);
            d2b = (b2 * Xn1b) + (a2 * acc1b);
            sample--;	
        }
        /* Store the updated state variables back into the state array */ 
        pState[0] = d1a; 
        pState[1] = d2a;         
        pState[2] = d1b; 
        pState[3] = d2b; 
        /* The current stage input is given as the output to the next stage */ 
        pIn = pDst; 
        /* decrement the loop counter */ 
        stage--; 
        pState += 4u;
        /*Reset the output working pointer */ 
        pOut = pDst; 
    } while(stage > 0u);
 #elif defined(ARM_MATH_CM0_FAMILY)
    /* Run the below code for Cortex-M0 */
    do
    {
        /* Reading the coefficients */
        b0 = *pCoeffs++;
        b1 = *pCoeffs++;
        b2 = *pCoeffs++;
        a1 = *pCoeffs++;
        a2 = *pCoeffs++;
        /*Reading the state values */
        d1a = pState[0];
        d2a = pState[1];
        d1b = pState[2];
        d2b = pState[3];
        sample = blockSize;
        while(sample > 0u)
        {
            /* Read the input */
            Xn1a = *pIn++; //Channel a
            Xn1b = *pIn++; //Channel b
            /* y[n] = b0 * x[n] + d1 */
            acc1a = (b0 * Xn1a) + d1a;
            acc1b = (b0 * Xn1b) + d1b;
            /* Store the result in the accumulator in the destination buffer. */
            *pOut++ = acc1a;
            *pOut++ = acc1b;
            /* Every time after the output is computed state should be updated. */
            /* d1 = b1 * x[n] + a1 * y[n] + d2 */
            d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
            d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
            /* d2 = b2 * x[n] + a2 * y[n] */
            d2a = (b2 * Xn1a) + (a2 * acc1a);
            d2b = (b2 * Xn1b) + (a2 * acc1b);
            /* decrement the loop counter */
            sample--;
        }
        /* Store the updated state variables back into the state array */
        *pState++ = d1a;
        *pState++ = d2a;
        *pState++ = d1b;
        *pState++ = d2b;
        /* The current stage input is given as the output to the next stage */
        pIn = pDst;
        /*Reset the output working pointer */
        pOut = pDst;
        /* decrement the loop counter */
        stage--;
    } while(stage > 0u);
 #else
    float32_t Xn2a, Xn3a, Xn4a;                          /*  Input State variables     */
    float32_t Xn2b, Xn3b, Xn4b;                          /*  Input State variables     */
    float32_t acc2a, acc3a, acc4a;                       /*  accumulator               */
    float32_t acc2b, acc3b, acc4b;                       /*  accumulator               */
    float32_t p0a, p1a, p2a, p3a, p4a, A1a;
    float32_t p0b, p1b, p2b, p3b, p4b, A1b;
    /* Run the below code for Cortex-M4 and Cortex-M3 */
    do
    {
        /* Reading the coefficients */     
        b0 = *pCoeffs++;
        b1 = *pCoeffs++;
        b2 = *pCoeffs++;
        a1 = *pCoeffs++;
        a2 = *pCoeffs++;      
        /*Reading the state values */
        d1a = pState[0];
        d2a = pState[1];
        d1b = pState[2];
        d2b = pState[3];
        /* Apply loop unrolling and compute 4 output values simultaneously. */
        sample = blockSize >> 2u;
        /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.       
        ** a second loop below computes the remaining 1 to 3 samples. */
        while(sample > 0u) {
            /* y[n] = b0 * x[n] + d1 */
            /* d1 = b1 * x[n] + a1 * y[n] + d2 */
            /* d2 = b2 * x[n] + a2 * y[n] */
            /* Read the four inputs */
            Xn1a = pIn[0];
            Xn1b = pIn[1];
            Xn2a = pIn[2];
            Xn2b = pIn[3];
            Xn3a = pIn[4];
            Xn3b = pIn[5];
            Xn4a = pIn[6];
            Xn4b = pIn[7];
            pIn += 8;     
            p0a = b0 * Xn1a; 
            p0b = b0 * Xn1b; 
            p1a = b1 * Xn1a;
            p1b = b1 * Xn1b;
            acc1a = p0a + d1a;
            acc1b = p0b + d1b;
            p0a = b0 * Xn2a; 
            p0b = b0 * Xn2b; 
            p3a = a1 * acc1a;
            p3b = a1 * acc1b;
            p2a = b2 * Xn1a;
            p2b = b2 * Xn1b;
            A1a = p1a + p3a;
            A1b = p1b + p3b;
            p4a = a2 * acc1a;
            p4b = a2 * acc1b;
            d1a = A1a + d2a;
            d1b = A1b + d2b;
            d2a = p2a + p4a;
            d2b = p2b + p4b;
            p1a = b1 * Xn2a;
            p1b = b1 * Xn2b;
            acc2a = p0a + d1a;
            acc2b = p0b + d1b;
            p0a = b0 * Xn3a; 
            p0b = b0 * Xn3b; 
            p3a = a1 * acc2a;
            p3b = a1 * acc2b;
            p2a = b2 * Xn2a;
            p2b = b2 * Xn2b;
            A1a = p1a + p3a;
            A1b = p1b + p3b;
            p4a = a2 * acc2a;
            p4b = a2 * acc2b;
            d1a = A1a + d2a;
            d1b = A1b + d2b;
            d2a = p2a + p4a;
            d2b = p2b + p4b;
            p1a = b1 * Xn3a;
            p1b = b1 * Xn3b;
            acc3a = p0a + d1a;
            acc3b = p0b + d1b;
            p0a = b0 * Xn4a; 
            p0b = b0 * Xn4b; 
            p3a = a1 * acc3a;
            p3b = a1 * acc3b;
            p2a = b2 * Xn3a;
            p2b = b2 * Xn3b;
            A1a = p1a + p3a;
            A1b = p1b + p3b;
            p4a = a2 * acc3a;
            p4b = a2 * acc3b;
            d1a = A1a + d2a;
            d1b = A1b + d2b;
            d2a = p2a + p4a;
            d2b = p2b + p4b;
            acc4a = p0a + d1a;
            acc4b = p0b + d1b;
            p1a = b1 * Xn4a;
            p1b = b1 * Xn4b;
            p3a = a1 * acc4a;
            p3b = a1 * acc4b;
            p2a = b2 * Xn4a;
            p2b = b2 * Xn4b;
            A1a = p1a + p3a;
            A1b = p1b + p3b;
            p4a = a2 * acc4a;
            p4b = a2 * acc4b;
            d1a = A1a + d2a;
            d1b = A1b + d2b;
            d2a = p2a + p4a;
            d2b = p2b + p4b;
            pOut[0] = acc1a;	
            pOut[1] = acc1b;	
            pOut[2] = acc2a;	
            pOut[3] = acc2b;
            pOut[4] = acc3a;	
            pOut[5] = acc3b;	
            pOut[6] = acc4a;	
            pOut[7] = acc4b;
            pOut += 8;
            sample--;	       
        }
        sample = blockSize & 0x3u;
        while(sample > 0u) {
            Xn1a = *pIn++;
            Xn1b = *pIn++;
            p0a = b0 * Xn1a; 
            p0b = b0 * Xn1b; 
            p1a = b1 * Xn1a;
            p1b = b1 * Xn1b;
            acc1a = p0a + d1a;
            acc1b = p0b + d1b;
            p3a = a1 * acc1a;
            p3b = a1 * acc1b;
            p2a = b2 * Xn1a;
            p2b = b2 * Xn1b;
            A1a = p1a + p3a;
            A1b = p1b + p3b;
            p4a = a2 * acc1a;
            p4b = a2 * acc1b;
            d1a = A1a + d2a;
            d1b = A1b + d2b;
            d2a = p2a + p4a;
            d2b = p2b + p4b;
            *pOut++ = acc1a;
            *pOut++ = acc1b;
            sample--;	       
        }
        /* Store the updated state variables back into the state array */
        *pState++ = d1a;
        *pState++ = d2a;
        *pState++ = d1b;
        *pState++ = d2b;
        /* The current stage input is given as the output to the next stage */
        pIn = pDst;
        /*Reset the output working pointer */
        pOut = pDst;
        /* decrement the loop counter */
        stage--;
    } while(stage > 0u);
 #endif 
 }
 LOW_OPTIMIZATION_EXIT
 /**       
   * @} end of BiquadCascadeDF2T group       
   */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c
@@ -0,0 +1,102 @@
 /*-----------------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:        arm_biquad_cascade_stereo_df2T_init_f32.c    
 *    
 * Description:  Initialization function for the floating-point transposed   
 *               direct form II Biquad cascade filter.   
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.   
 * ---------------------------------------------------------------------------*/
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup BiquadCascadeDF2T    
 * @{    
 */
 /**   
 * @brief  Initialization function for the floating-point transposed direct form II Biquad cascade filter.   
 * @param[in,out] *S           points to an instance of the filter data structure.   
 * @param[in]     numStages    number of 2nd order stages in the filter.   
 * @param[in]     *pCoeffs     points to the filter coefficients.   
 * @param[in]     *pState      points to the state buffer.   
 * @return        none   
 *    
 * <b>Coefficient and State Ordering:</b>    
 * \par    
 * The coefficients are stored in the array <code>pCoeffs</code> in the following order:    
 * <pre>    
 *     {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}    
 * </pre>    
 *    
 * \par    
 * where <code>b1x</code> and <code>a1x</code> are the coefficients for the first stage,    
 * <code>b2x</code> and <code>a2x</code> are the coefficients for the second stage,    
 * and so on.  The <code>pCoeffs</code> array contains a total of <code>5*numStages</code> values.    
 *    
 * \par    
 * The <code>pState</code> is a pointer to state array.    
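 * Each Biquad stage has 2 state variables <code>d1</code> and <code>d2</code> for each of the two channels, i.e. 4 state values per stage,    
 * stored in the order <code>d1</code>, <code>d2</code> of channel a followed by <code>d1</code>, <code>d2</code> of channel b.    
 * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.    
 * The state array has a total length of <code>4*numStages</code> values.    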
 * The state variables are updated after each block of data is processed; the coefficients are untouched.    
 */
 void arm_biquad_cascade_stereo_df2T_init_f32(
  arm_biquad_cascade_stereo_df2T_instance_f32 * S,
  uint8_t numStages,
  float32_t * pCoeffs,
  float32_t * pState)
 {
  /* Number of float32_t words to clear: four per stage. */
  const uint32_t stateWords = 4u * (uint32_t) numStages;

  /* Record the stage count and the coefficient table in the instance. */
  S->numStages = numStages;
  S->pCoeffs = pCoeffs;

  /* Zero the caller-supplied state buffer, then attach it to the instance. */
  memset(pState, 0, stateWords * sizeof(float32_t));
  S->pState = pState;
 }
 /**    
 * @} end of BiquadCascadeDF2T group    
 */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_f32.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_f32.c
@@ -0,0 +1,647 @@
 /* ----------------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_conv_f32.c    
 *    
 * Description:	Convolution of floating-point sequences.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.  
 * -------------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @defgroup Conv Convolution    
 *    
 * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.    
 * Convolution is similar to correlation and is frequently used in filtering and data analysis.    
 * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.    
 * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.    
 *    
 * \par Algorithm    
 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.    
 * Then the convolution    
 *    
 * <pre>    
 *                   c[n] = a[n] * b[n]    
 * </pre>    
 *    
 * \par    
 * is defined as    
 * \image html ConvolutionEquation.gif    
 * \par    
 * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.    
 * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and    
 * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.    
 * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.    
 *    
 * \par    
 * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,    
 * the signal <code>b[n]</code> slides over <code>a[n]</code>.    
 * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.    
 *    
 * \par    
 * Note that convolution is a commutative operation:    
 *    
 * <pre>    
 *                   a[n] * b[n] = b[n] * a[n].    
 * </pre>    
 *    
 * \par    
 * This means that switching the A and B arguments to the convolution functions has no effect.    
 *    
 * <b>Fixed-Point Behavior</b>    
 *    
 * \par    
 * Convolution requires summing up a large number of intermediate products.    
 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.    
 * Refer to the function specific documentation below for further details of the particular algorithm used.    
 *
 *
 * <b>Fast Versions</b>
 *
 * \par 
 * Fast versions are supported for Q31 and Q15.  The fast versions take fewer cycles than the standard Q31 and Q15 convolution functions, but the design requires
 * the input signals to be scaled down to avoid intermediate overflows.   
 *
 *
 * <b>Opt Versions</b>
 *
 * \par 
 * Opt versions are supported for Q15 and Q7.  The design uses an internal scratch buffer to achieve better optimisation.
 * These versions are optimised for cycle count and consume more memory (scratch memory) than the standard Q15 and Q7 versions.
 */
 /**    
 * @addtogroup Conv    
 * @{    
 */
 /**    
 * @brief Convolution of floating-point sequences.    
 * @param[in] *pSrcA points to the first input sequence.    
 * @param[in] srcALen length of the first input sequence.    
 * @param[in] *pSrcB points to the second input sequence.    
 * @param[in] srcBLen length of the second input sequence.    
 * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.    
 * @return none.    
 *    
 * \par    
 * If either <code>srcALen</code> or <code>srcBLen</code> is zero the function returns immediately    
 * without reading the inputs or writing to <code>pDst</code>.    
 */
 void arm_conv_f32(
  float32_t * pSrcA,
  uint32_t srcALen,
  float32_t * pSrcB,
  uint32_t srcBLen,
  float32_t * pDst)
 {
 #ifndef ARM_MATH_CM0_FAMILY
  /* Run the below code for Cortex-M4 and Cortex-M3 */
  float32_t *pIn1;                               /* inputA pointer (the longer sequence) */
  float32_t *pIn2;                               /* inputB pointer (the shorter sequence) */
  float32_t *pOut = pDst;                        /* output pointer */
  float32_t *px;                                 /* Intermediate inputA pointer */
  float32_t *py;                                 /* Intermediate inputB pointer */
  float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
  float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulators */
  float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */
  uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counters */

  /* An empty input would make (srcBLen - 1u) wrap around below and drive the    
   * stage loops far outside the input and output buffers, so bail out early. */
  if((srcALen == 0u) || (srcBLen == 0u))
  {
    return;
  }

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* The shorter sequence is always made to slide across the longer one, */
  /* so after this block srcBLen <= srcALen holds. */
  if(srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcA;
    /* Initialization of inputB pointer */
    pIn2 = pSrcB;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcB;
    /* Initialization of inputB pointer */
    pIn2 = pSrcA;
    /* Swap the lengths so that srcBLen is the shorter one */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;
  }
  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
  /* The function is internally    
   * divided into three stages according to the number of multiplications that have to    
   * take place between inputA samples and inputB samples. In the first stage of the    
   * algorithm, the multiplications increase by one for every iteration.    
   * In the second stage of the algorithm, srcBLen number of multiplications are done.    
   * In the third stage of the algorithm, the multiplications decrease by one    
   * for every iteration. */
  /* The loop counters of each stage are initialised here. */
  blockSize1 = srcBLen - 1u;
  blockSize2 = srcALen - (srcBLen - 1u);
  blockSize3 = blockSize1;
  /* --------------------------    
   * Initializations of stage1    
   * -------------------------*/
  /* sum = x[0] * y[0]    
   * sum = x[0] * y[1] + x[1] * y[0]    
   * ....    
   * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]    
   */
  /* In this stage the MAC operations are increased by 1 for every iteration.    
     The count variable holds the number of MAC operations performed */
  count = 1u;
  /* Working pointer of inputA */
  px = pIn1;
  /* Working pointer of inputB */
  py = pIn2;
  /* ------------------------    
   * Stage1 process    
   * ----------------------*/
  /* The first stage starts here */
  while(blockSize1 > 0u)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f;
    /* Apply loop unrolling and compute 4 MACs simultaneously. */
    k = count >> 2u;
    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
    while(k > 0u)
    {
      /* x[0] * y[count - 1] */
      sum += *px++ * *py--;
      /* x[1] * y[count - 2] */
      sum += *px++ * *py--;
      /* x[2] * y[count - 3] */
      sum += *px++ * *py--;
      /* x[3] * y[count - 4] */
      sum += *px++ * *py--;
      /* Decrement the loop counter */
      k--;
    }
    /* If the count is not a multiple of 4, compute any remaining MACs here.    
     ** No loop unrolling is used. */
    k = count % 0x4u;
    while(k > 0u)
    {
      /* Perform the multiply-accumulate */
      sum += *px++ * *py--;
      /* Decrement the loop counter */
      k--;
    }
    /* Store the result in the accumulator in the destination buffer. */
    *pOut++ = sum;
    /* Update the inputA and inputB pointers for next MAC calculation:    
     * the next output starts one sample further into inputB. */
    py = pIn2 + count;
    px = pIn1;
    /* Increment the MAC count */
    count++;
    /* Decrement the loop counter */
    blockSize1--;
  }
  /* --------------------------    
   * Initializations of stage2    
   * ------------------------*/
  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]    
   * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]    
   * ....    
   * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]    
   */
  /* Working pointer of inputA */
  px = pIn1;
  /* Working pointer of inputB: starts at the last sample and walks backwards */
  pSrc2 = pIn2 + (srcBLen - 1u);
  py = pSrc2;
  /* count is the index by which the pointer pIn1 is to be incremented */
  count = 0u;
  /* -------------------    
   * Stage2 process    
   * ------------------*/
  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.    
   * So, to loop unroll over blockSize2,    
   * srcBLen should be greater than or equal to 4 */
  if(srcBLen >= 4u)
  {
    /* Loop unroll over blockSize2, by 4 */
    blkCnt = blockSize2 >> 2u;
    while(blkCnt > 0u)
    {
      /* Set all accumulators to zero */
      acc0 = 0.0f;
      acc1 = 0.0f;
      acc2 = 0.0f;
      acc3 = 0.0f;
      /* read x[0], x[1], x[2] samples */
      x0 = *(px++);
      x1 = *(px++);
      x2 = *(px++);
      /* Apply loop unrolling and compute 4 MACs simultaneously. */
      k = srcBLen >> 2u;
      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
       ** a second loop below computes MACs for the remaining 1 to 3 samples.    
       ** k is at least 1 here because srcBLen >= 4, so the do-while is safe. */
      do
      {
        /* Read y[srcBLen - 1] sample */
        c0 = *(py--);
        /* Read x[3] sample */
        x3 = *(px);
        /* Perform the multiply-accumulate */
        /* acc0 +=  x[0] * y[srcBLen - 1] */
        acc0 += x0 * c0;
        /* acc1 +=  x[1] * y[srcBLen - 1] */
        acc1 += x1 * c0;
        /* acc2 +=  x[2] * y[srcBLen - 1] */
        acc2 += x2 * c0;
        /* acc3 +=  x[3] * y[srcBLen - 1] */
        acc3 += x3 * c0;
        /* Read y[srcBLen - 2] sample */
        c0 = *(py--);
        /* Read x[4] sample */
        x0 = *(px + 1u);
        /* Perform the multiply-accumulate */
        /* acc0 +=  x[1] * y[srcBLen - 2] */
        acc0 += x1 * c0;
        /* acc1 +=  x[2] * y[srcBLen - 2] */
        acc1 += x2 * c0;
        /* acc2 +=  x[3] * y[srcBLen - 2] */
        acc2 += x3 * c0;
        /* acc3 +=  x[4] * y[srcBLen - 2] */
        acc3 += x0 * c0;
        /* Read y[srcBLen - 3] sample */
        c0 = *(py--);
        /* Read x[5] sample */
        x1 = *(px + 2u);
        /* Perform the multiply-accumulates */
        /* acc0 +=  x[2] * y[srcBLen - 3] */
        acc0 += x2 * c0;
        /* acc1 +=  x[3] * y[srcBLen - 3] */
        acc1 += x3 * c0;
        /* acc2 +=  x[4] * y[srcBLen - 3] */
        acc2 += x0 * c0;
        /* acc3 +=  x[5] * y[srcBLen - 3] */
        acc3 += x1 * c0;
        /* Read y[srcBLen - 4] sample */
        c0 = *(py--);
        /* Read x[6] sample */
        x2 = *(px + 3u);
        px += 4u;
        /* Perform the multiply-accumulates */
        /* acc0 +=  x[3] * y[srcBLen - 4] */
        acc0 += x3 * c0;
        /* acc1 +=  x[4] * y[srcBLen - 4] */
        acc1 += x0 * c0;
        /* acc2 +=  x[5] * y[srcBLen - 4] */
        acc2 += x1 * c0;
        /* acc3 +=  x[6] * y[srcBLen - 4] */
        acc3 += x2 * c0;
      } while(--k);
      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
       ** No loop unrolling is used. */
      k = srcBLen % 0x4u;
      while(k > 0u)
      {
        /* Read y[srcBLen - 5] sample */
        c0 = *(py--);
        /* Read x[7] sample */
        x3 = *(px++);
        /* Perform the multiply-accumulates */
        /* acc0 +=  x[4] * y[srcBLen - 5] */
        acc0 += x0 * c0;
        /* acc1 +=  x[5] * y[srcBLen - 5] */
        acc1 += x1 * c0;
        /* acc2 +=  x[6] * y[srcBLen - 5] */
        acc2 += x2 * c0;
        /* acc3 +=  x[7] * y[srcBLen - 5] */
        acc3 += x3 * c0;
        /* Reuse the present samples for the next MAC */
        x0 = x1;
        x1 = x2;
        x2 = x3;
        /* Decrement the loop counter */
        k--;
      }
      /* Store the results in the accumulators in the destination buffer. */
      *pOut++ = acc0;
      *pOut++ = acc1;
      *pOut++ = acc2;
      *pOut++ = acc3;
      /* Increment the pointer pIn1 index, count by 4 */
      count += 4u;
      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;
      /* Decrement the loop counter */
      blkCnt--;
    }
    /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.    
     ** No loop unrolling over the outputs is used. */
    blkCnt = blockSize2 % 0x4u;
    while(blkCnt > 0u)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;
      /* Apply loop unrolling and compute 4 MACs simultaneously. */
      k = srcBLen >> 2u;
      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      while(k > 0u)
      {
        /* Perform the multiply-accumulates */
        sum += *px++ * *py--;
        sum += *px++ * *py--;
        sum += *px++ * *py--;
        sum += *px++ * *py--;
        /* Decrement the loop counter */
        k--;
      }
      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.    
       ** No loop unrolling is used. */
      k = srcBLen % 0x4u;
      while(k > 0u)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py--;
        /* Decrement the loop counter */
        k--;
      }
      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = sum;
      /* Increment the inputA start index */
      count++;
      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    /* If the srcBLen is less than 4,    
     * the blockSize2 loop cannot be unrolled by 4 */
    blkCnt = blockSize2;
    while(blkCnt > 0u)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;
      /* srcBLen number of MACS should be performed */
      k = srcBLen;
      while(k > 0u)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py--;
        /* Decrement the loop counter */
        k--;
      }
      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = sum;
      /* Increment the inputA start index */
      count++;
      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pSrc2;
      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  /* --------------------------    
   * Initializations of stage3    
   * -------------------------*/
  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]    
   * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]    
   * ....    
   * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]    
   * sum +=  x[srcALen-1] * y[srcBLen-1]    
   */
  /* In this stage the MAC operations are decreased by 1 for every iteration.    
     The blockSize3 variable holds the number of MAC operations performed */
  /* Working pointer of inputA */
  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
  px = pSrc1;
  /* Working pointer of inputB */
  pSrc2 = pIn2 + (srcBLen - 1u);
  py = pSrc2;
  /* -------------------    
   * Stage3 process    
   * ------------------*/
  while(blockSize3 > 0u)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f;
    /* Apply loop unrolling and compute 4 MACs simultaneously. */
    k = blockSize3 >> 2u;
    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.    
     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
    while(k > 0u)
    {
      /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
      sum += *px++ * *py--;
      /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
      sum += *px++ * *py--;
      /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
      sum += *px++ * *py--;
      /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
      sum += *px++ * *py--;
      /* Decrement the loop counter */
      k--;
    }
    /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.    
     ** No loop unrolling is used. */
    k = blockSize3 % 0x4u;
    while(k > 0u)
    {
      /* Perform the multiply-accumulate */
      sum += *px++ * *py--;
      /* Decrement the loop counter */
      k--;
    }
    /* Store the result in the accumulator in the destination buffer. */
    *pOut++ = sum;
    /* Update the inputA and inputB pointers for next MAC calculation:    
     * the next output starts one sample later in inputA. */
    px = ++pSrc1;
    py = pSrc2;
    /* Decrement the loop counter */
    blockSize3--;
  }
 #else
  /* Run the below code for Cortex-M0 */
  float32_t *pIn1 = pSrcA;                       /* inputA pointer */
  float32_t *pIn2 = pSrcB;                       /* inputB pointer */
  float32_t sum;                                 /* Accumulator */
  uint32_t i, j;                                 /* loop counters */

  /* Same contract as the Cortex-M4/M3 path: nothing to do for an empty input.    
   * This also keeps ((srcALen + srcBLen) - 1u) from wrapping when both are zero. */
  if((srcALen == 0u) || (srcBLen == 0u))
  {
    return;
  }

  /* Loop to calculate convolution for output length number of times */
  for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++)
  {
    /* Initialize sum with zero to carry out MAC operations */
    sum = 0.0f;
    /* Loop to perform MAC operations according to convolution equation */
    for (j = 0u; j <= i; j++)
    {
      /* Check the array limitations */
      if((((i - j) < srcBLen) && (j < srcALen)))
      {
        /* z[i] += x[j] * y[i-j] */
        sum += pIn1[j] * pIn2[i - j];
      }
    }
    /* Store the output in the destination buffer */
    pDst[i] = sum;
  }
 #endif /*   #ifndef ARM_MATH_CM0_FAMILY        */
 }
 /**    
 * @} end of Conv group    
 */
--- a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
@@ -0,0 +1,543 @@
 /* ----------------------------------------------------------------------    
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.    
 *    
 * $Date:        12. March 2014
 * $Revision: 	V1.4.4
 *    
 * Project: 	    CMSIS DSP Library    
 * Title:		arm_conv_fast_opt_q15.c    
 *    
 * Description:	Fast Q15 Convolution.    
 *    
 * Target Processor: Cortex-M4/Cortex-M3
 *  
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the 
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.    
 * -------------------------------------------------------------------- */
 #include "arm_math.h"
 /**    
 * @ingroup groupFilters    
 */
 /**    
 * @addtogroup Conv    
 * @{    
 */
 /**    
 * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.    
 * @param[in] *pSrcA points to the first input sequence.    
 * @param[in] srcALen length of the first input sequence.    
 * @param[in] *pSrcB points to the second input sequence.    
 * @param[in] srcBLen length of the second input sequence.    
 * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.    
 * @param[in]  *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.   
 * @param[in]  *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).   
 * @return none.    
 *    
 * \par Restrictions    
 *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.    
 *	In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.    
 *     
 * <b>Scaling and Overflow Behavior:</b>    
 *    
 * \par    
 * This fast version uses a 32-bit accumulator with 2.30 format.    
 * The accumulator maintains full precision of the intermediate multiplication results    
 * but provides only a single guard bit. There is no saturation on intermediate additions.    
 * Thus, if the accumulator overflows it wraps around and distorts the result.    
 * The input signals should be scaled down to avoid intermediate overflows.    
 * Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 is the logarithm to base 2) to avoid overflows,    
 * as a maximum of min(srcALen, srcBLen) additions are carried out internally.    
 * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.    
 *    
 * \par    
 * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.    
 */
 void arm_conv_fast_opt_q15(
  q15_t * pSrcA,
  uint32_t srcALen,
  q15_t * pSrcB,
  uint32_t srcBLen,
  q15_t * pDst,
  q15_t * pScratch1,
  q15_t * pScratch2)
 {
  /* Implementation overview:
   *   1. The longer input becomes pIn1, the shorter one pIn2 (lengths are
   *      swapped accordingly, so srcBLen <= srcALen from then on).
   *   2. The shorter input is copied in reverse order into scratch2.
   *   3. scratch1 is filled with (srcBLen - 1) zeros, the longer input and
   *      another (srcBLen - 1) zeros.
   *   4. Every output sample is the dot product of scratch2 with a srcBLen
   *      long window of scratch1; the window start advances by one sample per
   *      output.  Four outputs are produced per pass of the main loop, the
   *      remaining (outputLength % 4) outputs one at a time.
   *   5. Each 32-bit accumulator is shifted right by 15 and saturated to
   *      16 bits before it is stored.
   * If either length is zero the function returns without touching pDst or
   * the scratch buffers.
   */
  q31_t acc0, acc1, acc2, acc3;                  /* Accumulators for four consecutive outputs */
  q31_t x1, x2, x3;                              /* Packed pairs of scratch1 samples */
  q31_t y1, y2;                                  /* Packed pairs of scratch2 (reversed shorter input) samples */
  q15_t *pOut = pDst;                            /* output pointer */
  q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch1 */
  q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch2 */
  q15_t *pIn1;                                   /* inputA pointer (longer sequence) */
  q15_t *pIn2;                                   /* inputB pointer (shorter sequence, later scratch2) */
  q15_t *px;                                     /* Intermediate inputA pointer  */
  q15_t *py;                                     /* Intermediate inputB pointer  */
  uint32_t j, k, blkCnt;                         /* loop counters */
  uint32_t tapCnt;                               /* loop count over the shorter sequence */
 #ifdef UNALIGNED_SUPPORT_DISABLE
  q15_t a, b;                                    /* Single samples used to build packed pairs */
 #endif	/*	#ifdef UNALIGNED_SUPPORT_DISABLE	*/
  /* A zero-length input has no samples to convolve.  Return early: the
   * (srcBLen - 1u) expressions below are unsigned and would otherwise wrap
   * to 0xFFFFFFFF, producing an out-of-range pointer and fill count. */
  if((srcALen == 0u) || (srcBLen == 0u))
  {
    return;
  }
  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA. */
  /* So srcBLen is always considered as shorter or equal to srcALen */
  if(srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcA;
    /* Initialization of inputB pointer */
    pIn2 = pSrcB;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcB;
    /* Initialization of inputB pointer */
    pIn2 = pSrcA;
    /* Swap the lengths so that srcBLen is the shorter (or equal) one */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;
  }
  /* Point at the last element of the srcBLen samples used in scratch2 */
  pScr2 = pScratch2 + srcBLen - 1;
  /* points to smaller length sequence */
  px = pIn2;
  /* Apply loop unrolling and do 4 Copies simultaneously. */
  k = srcBLen >> 2u;
  /* First part of the processing with loop unrolling copies 4 data points at a time.
   ** a second loop below copies for the remaining 1 to 3 samples. */
  /* Copy smaller length input sequence in reverse order into second scratch buffer */
  while(k > 0u)
  {
    /* source is read forwards, scratch2 is written backwards */
    *pScr2-- = *px++;
    *pScr2-- = *px++;
    *pScr2-- = *px++;
    *pScr2-- = *px++;
    /* Decrement the loop counter */
    k--;
  }
  /* If the count is not a multiple of 4, copy remaining samples here.
   ** No loop unrolling is used. */
  k = srcBLen % 0x4u;
  while(k > 0u)
  {
    /* reversed copy of the remaining samples */
    *pScr2-- = *px++;
    /* Decrement the loop counter */
    k--;
  }
  /* Initialize temporary scratch pointer */
  pScr1 = pScratch1;
  /* Assuming scratch1 buffer is aligned by 32-bit */
  /* Fill (srcBLen - 1u) zeros at the start of scratch1 buffer */
  arm_fill_q15(0, pScr1, (srcBLen - 1u));
  /* Update temporary scratch pointer */
  pScr1 += (srcBLen - 1u);
  /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
 #ifndef UNALIGNED_SUPPORT_DISABLE
  /* Copy (srcALen) samples in scratch buffer */
  arm_copy_q15(pIn1, pScr1, srcALen);
  /* Update pointers */
  pScr1 += srcALen;
 #else
  /* Apply loop unrolling and do 4 Copies simultaneously. */
  k = srcALen >> 2u;
  /* First part of the processing with loop unrolling copies 4 data points at a time.
   ** a second loop below copies for the remaining 1 to 3 samples. */
  while(k > 0u)
  {
    /* forward copy of four samples of the longer sequence */
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;
    /* Decrement the loop counter */
    k--;
  }
  /* If the count is not a multiple of 4, copy remaining samples here.
   ** No loop unrolling is used. */
  k = srcALen % 0x4u;
  while(k > 0u)
  {
    /* forward copy of the remaining samples */
    *pScr1++ = *pIn1++;
    /* Decrement the loop counter */
    k--;
  }
 #endif	/*	#ifndef UNALIGNED_SUPPORT_DISABLE	*/
 #ifndef UNALIGNED_SUPPORT_DISABLE
  /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
  arm_fill_q15(0, pScr1, (srcBLen - 1u));
  /* Update pointer */
  pScr1 += (srcBLen - 1u);
 #else
  /* Apply loop unrolling and write 4 zeros per pass. */
  k = (srcBLen - 1u) >> 2u;
  /* First part of the processing with loop unrolling writes 4 zeros at a time.
   ** a second loop below writes the remaining 1 to 3 zeros. */
  while(k > 0u)
  {
    /* trailing zero padding */
    *pScr1++ = 0;
    *pScr1++ = 0;
    *pScr1++ = 0;
    *pScr1++ = 0;
    /* Decrement the loop counter */
    k--;
  }
  /* If the count is not a multiple of 4, write the remaining zeros here.
   ** No loop unrolling is used. */
  k = (srcBLen - 1u) % 0x4u;
  while(k > 0u)
  {
    /* trailing zero padding, remaining samples */
    *pScr1++ = 0;
    /* Decrement the loop counter */
    k--;
  }
 #endif	/*	#ifndef UNALIGNED_SUPPORT_DISABLE	*/
  /* Temporary pointer for scratch2 */
  py = pScratch2;
  /* From here on pIn2 walks the reversed shorter sequence in scratch2 */
  pIn2 = py;
  /* First part of the processing with loop unrolling process 4 data points at a time.
   ** a second loop below process for the remaining 1 to 3 samples. */
  /* Actual convolution process starts here */
  blkCnt = (srcALen + srcBLen - 1u) >> 2;
  /* NOTE(review): the last x2 load of the final pass reads pScr1[2] and
   * pScr1[3].  When (srcALen + srcBLen - 1) and srcBLen are both multiples of
   * 4 (e.g. srcALen = 5, srcBLen = 4) this appears to reach one element past
   * the srcALen + 2*srcBLen - 2 elements initialised above - TODO confirm the
   * scratch1 sizing with callers. */
  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer to the current window start */
    pScr1 = pScratch1;
    /* Clear Accumulators */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;
    /* Read two samples from scratch1 buffer */
    x1 = *__SIMD32(pScr1)++;
    /* Read next two samples from scratch1 buffer */
    x2 = *__SIMD32(pScr1)++;
    tapCnt = (srcBLen) >> 2u;
    /* Each pass consumes four taps (y1, y2) and updates all four
     * accumulators; x1/x2 are reloaded part-way through for the next pass,
     * so the statement order below must be preserved. */
    while(tapCnt > 0u)
    {
 #ifndef UNALIGNED_SUPPORT_DISABLE
      /* Read four samples from smaller buffer */
      y1 = _SIMD32_OFFSET(pIn2);
      y2 = _SIMD32_OFFSET(pIn2 + 2u);
      /* multiply and accumulate */
      acc0 = __SMLAD(x1, y1, acc0);
      acc2 = __SMLAD(x2, y1, acc2);
      /* pack input data (halves of x1 and x2) for the odd-offset output */
 #ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
 #else
      x3 = __PKHBT(x1, x2, 0);
 #endif
      /* multiply and accumulate */
      acc1 = __SMLADX(x3, y1, acc1);
      /* Read next two samples from scratch1 buffer */
      x1 = _SIMD32_OFFSET(pScr1);
      /* multiply and accumulate */
      acc0 = __SMLAD(x2, y2, acc0);
      acc2 = __SMLAD(x1, y2, acc2);
      /* pack input data */
 #ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
 #else
      x3 = __PKHBT(x2, x1, 0);
 #endif
      acc3 = __SMLADX(x3, y1, acc3);
      acc1 = __SMLADX(x3, y2, acc1);
      /* Read the following two samples from scratch1 buffer */
      x2 = _SIMD32_OFFSET(pScr1 + 2u);
 #ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
 #else
      x3 = __PKHBT(x1, x2, 0);
 #endif
      acc3 = __SMLADX(x3, y2, acc3);
 #else
      /* Read four samples from smaller buffer, one 16-bit sample at a time */
      a = *pIn2;
      b = *(pIn2 + 1);
 #ifndef ARM_MATH_BIG_ENDIAN
      y1 = __PKHBT(a, b, 16);
 #else
      y1 = __PKHBT(b, a, 16);
 #endif
      a = *(pIn2 + 2);
      b = *(pIn2 + 3);
 #ifndef ARM_MATH_BIG_ENDIAN
      y2 = __PKHBT(a, b, 16);
 #else
      y2 = __PKHBT(b, a, 16);
 #endif
      /* multiply and accumulate */
      acc0 = __SMLAD(x1, y1, acc0);
      acc2 = __SMLAD(x2, y1, acc2);
      /* pack input data (halves of x1 and x2) for the odd-offset output */
 #ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
 #else
      x3 = __PKHBT(x1, x2, 0);
 #endif
      acc1 = __SMLADX(x3, y1, acc1);
      /* Read next two samples from scratch1 buffer */
      a = *pScr1;
      b = *(pScr1 + 1);
 #ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(a, b, 16);
 #else
      x1 = __PKHBT(b, a, 16);
 #endif
      acc0 = __SMLAD(x2, y2, acc0);
      acc2 = __SMLAD(x1, y2, acc2);
      /* pack input data */
 #ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
 #else
      x3 = __PKHBT(x2, x1, 0);
 #endif
      acc3 = __SMLADX(x3, y1, acc3);
      acc1 = __SMLADX(x3, y2, acc1);
      /* Read the following two samples from scratch1 buffer */
      a = *(pScr1 + 2);
      b = *(pScr1 + 3);
 #ifndef ARM_MATH_BIG_ENDIAN
      x2 = __PKHBT(a, b, 16);
 #else
      x2 = __PKHBT(b, a, 16);
 #endif
 #ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
 #else
      x3 = __PKHBT(x1, x2, 0);
 #endif
      acc3 = __SMLADX(x3, y2, acc3);
 #endif	/*	#ifndef UNALIGNED_SUPPORT_DISABLE	*/
      /* update scratch pointers */
      pIn2 += 4u;
      pScr1 += 4u;
      /* Decrement the loop counter */
      tapCnt--;
    }
    /* pScr1 runs four samples ahead because of the pre-loaded x1/x2;
     * step back so it lines up with the first unprocessed tap */
    pScr1 -= 4u;
    /* process the remaining (srcBLen % 4) taps one at a time */
    tapCnt = (srcBLen) & 3u;
    while(tapCnt > 0u)
    {
      /* one tap applied to four consecutive scratch1 samples */
      acc0 += (*pScr1++ * *pIn2);
      acc1 += (*pScr1++ * *pIn2);
      acc2 += (*pScr1++ * *pIn2);
      acc3 += (*pScr1++ * *pIn2++);
      /* net advance of one sample per tap */
      pScr1 -= 3u;
      /* Decrement the loop counter */
      tapCnt--;
    }
    blkCnt--;
    /* Shift each accumulator right by 15, saturate to 16 bits and store the
     * four results as two packed 32-bit words in the destination buffer. */
 #ifndef ARM_MATH_BIG_ENDIAN
    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
 #else
    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN       */
    /* Rewind to the start of the reversed shorter sequence */
    pIn2 = py;
    /* Advance the window start by the four outputs just produced
     * (the pScratch1 parameter itself is used as the window base) */
    pScratch1 += 4u;
  }
  blkCnt = (srcALen + srcBLen - 1u) & 0x3;
  /* Calculate the remaining (outputLength % 4) output samples one at a time */
  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer to the current window start */
    pScr1 = pScratch1;
    /* Clear Accumulator */
    acc0 = 0;
    /* Two taps per pass */
    tapCnt = (srcBLen) >> 1u;
    while(tapCnt > 0u)
    {
      acc0 += (*pScr1++ * *pIn2++);
      acc0 += (*pScr1++ * *pIn2++);
      /* Decrement the loop counter */
      tapCnt--;
    }
    tapCnt = (srcBLen) & 1u;
    /* process the last tap when srcBLen is odd */
    while(tapCnt > 0u)
    {
      /* accumulate the results */
      acc0 += (*pScr1++ * *pIn2++);
      /* Decrement the loop counter */
      tapCnt--;
    }
    blkCnt--;
    /* The result is in 2.30 format.  Convert to 1.15 with saturation.
     ** Then store the output in the destination buffer. */
    *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
    /* Rewind to the start of the reversed shorter sequence */
    pIn2 = py;
    /* Advance the window start by one output sample */
    pScratch1 += 1u;
  }
 }
 /**    
 * @} end of Conv group    
 */