A bundled STM32F10x Std Periph and CMSIS library
/* ----------------------------------------------------------------------
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
 *
 * $Date: 12. March 2014
 * $Revision: V1.4.4
 *
 * Project:     CMSIS DSP Library
 * Title:       arm_conv_fast_opt_q15.c
 *
 * Description: Fast Q15 Convolution.
 *
 * Target Processor: Cortex-M4/Cortex-M3
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */

#include "arm_math.h"
/**
 * @ingroup groupFilters
 */

/**
 * @addtogroup Conv
 * @{
 */

/**
 * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
 * @param[in]  *pSrcA     points to the first input sequence.
 * @param[in]  srcALen    length of the first input sequence.
 * @param[in]  *pSrcB     points to the second input sequence.
 * @param[in]  srcBLen    length of the second input sequence.
 * @param[out] *pDst      points to the location where the output result is written. Length srcALen+srcBLen-1.
 * @param[in]  *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
 * @param[in]  *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
 * @return none.
 *
 * \par Restrictions
 * If the silicon does not support unaligned memory access, define the macro UNALIGNED_SUPPORT_DISABLE.
 * In that case the input, output, scratch1 and scratch2 buffers must be 32-bit aligned.
 *
 * <b>Scaling and Overflow Behavior:</b>
 *
 * \par
 * This fast version uses a 32-bit accumulator with 2.30 format.
 * The accumulator maintains full precision of the intermediate multiplication results
 * but provides only a single guard bit. There is no saturation on intermediate additions.
 * Thus, if the accumulator overflows, it wraps around and distorts the result.
 * The input signals should be scaled down to avoid intermediate overflows.
 * Scale down the inputs by a total of log2(min(srcALen, srcBLen)) bits (log2 is read as log to the base 2),
 * since at most min(srcALen, srcBLen) additions are carried out internally.
 * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
 * A usage sketch, including this pre-scaling, is given at the end of this file.
 *
 * \par
 * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap-around distortion.
 */
void arm_conv_fast_opt_q15(
  q15_t * pSrcA,
  uint32_t srcALen,
  q15_t * pSrcB,
  uint32_t srcBLen,
  q15_t * pDst,
  q15_t * pScratch1,
  q15_t * pScratch2)
{
  q31_t acc0, acc1, acc2, acc3;            /* Accumulators */
  q31_t x1, x2, x3;                        /* Temporary variables to hold state and coefficient values */
  q31_t y1, y2;                            /* State variables */
  q15_t *pOut = pDst;                      /* Output pointer */
  q15_t *pScr1 = pScratch1;                /* Temporary pointer for scratch1 */
  q15_t *pScr2 = pScratch2;                /* Temporary pointer for scratch2 */
  q15_t *pIn1;                             /* InputA pointer */
  q15_t *pIn2;                             /* InputB pointer */
  q15_t *px;                               /* Intermediate inputA pointer */
  q15_t *py;                               /* Intermediate inputB pointer */
  uint32_t j, k, blkCnt;                   /* Loop counters */
  uint32_t tapCnt;                         /* Loop count */

#ifdef UNALIGNED_SUPPORT_DISABLE
  q15_t a, b;
#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA, */
  /* so srcBLen is always treated as shorter than or equal to srcALen. */
  if(srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcA;

    /* Initialization of inputB pointer */
    pIn2 = pSrcB;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcB;

    /* Initialization of inputB pointer */
    pIn2 = pSrcA;

    /* Swap the lengths so that srcBLen is the shorter of the two */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;
  }
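
  /* Note (added for clarity; not in the original CMSIS source):
  ** the function builds scratch1 = [srcBLen-1 zeros | longer sequence | srcBLen-1 zeros]
  ** and scratch2 = the shorter sequence in reverse order. With the kernel reversed,
  ** each convolution output y[n] = sum_k a[k] * b[n-k] becomes a plain dot product
  ** of srcBLen consecutive scratch1 samples with scratch2, which the SIMD inner
  ** loops below can stream through linearly. */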
  /* Pointer to the end of scratch2 buffer */
  pScr2 = pScratch2 + srcBLen - 1;

  /* px points to the smaller-length sequence */
  px = pIn2;

  /* Apply loop unrolling and perform 4 copies simultaneously. */
  k = srcBLen >> 2u;

  /* First part of the processing with loop unrolling copies 4 data points at a time.
  ** A second loop below copies the remaining 1 to 3 samples. */

  /* Copy the smaller-length input sequence in reverse order into the second scratch buffer */
  while(k > 0u)
  {
    /* copy second buffer in reverse order */
    *pScr2-- = *px++;
    *pScr2-- = *px++;
    *pScr2-- = *px++;
    *pScr2-- = *px++;

    /* Decrement the loop counter */
    k--;
  }

  /* If the count is not a multiple of 4, copy the remaining samples here.
  ** No loop unrolling is used. */
  k = srcBLen % 0x4u;

  while(k > 0u)
  {
    /* copy second buffer in reverse order for the remaining samples */
    *pScr2-- = *px++;

    /* Decrement the loop counter */
    k--;
  }
  /* Initialize temporary scratch pointer */
  pScr1 = pScratch1;

  /* Assuming scratch1 buffer is 32-bit aligned */
  /* Fill (srcBLen - 1u) zeros in scratch1 buffer */
  arm_fill_q15(0, pScr1, (srcBLen - 1u));

  /* Update temporary scratch pointer */
  pScr1 += (srcBLen - 1u);

  /* Copy the larger-length sequence (srcALen samples) into scratch1 buffer */

#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Copy (srcALen) samples in scratch buffer */
  arm_copy_q15(pIn1, pScr1, srcALen);

  /* Update pointers */
  pScr1 += srcALen;

#else

  /* Apply loop unrolling and perform 4 copies simultaneously. */
  k = srcALen >> 2u;

  /* First part of the processing with loop unrolling copies 4 data points at a time.
  ** A second loop below copies the remaining 1 to 3 samples. */
  while(k > 0u)
  {
    /* copy the larger-length sequence */
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;
    *pScr1++ = *pIn1++;

    /* Decrement the loop counter */
    k--;
  }

  /* If the count is not a multiple of 4, copy the remaining samples here.
  ** No loop unrolling is used. */
  k = srcALen % 0x4u;

  while(k > 0u)
  {
    /* copy the larger-length sequence for the remaining samples */
    *pScr1++ = *pIn1++;

    /* Decrement the loop counter */
    k--;
  }

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Fill (srcBLen - 1u) zeros at the end of scratch buffer */
  arm_fill_q15(0, pScr1, (srcBLen - 1u));

  /* Update pointer */
  pScr1 += (srcBLen - 1u);

#else

  /* Apply loop unrolling and perform 4 fills simultaneously. */
  k = (srcBLen - 1u) >> 2u;

  /* First part of the processing with loop unrolling fills 4 data points at a time.
  ** A second loop below fills the remaining 1 to 3 samples. */
  while(k > 0u)
  {
    /* fill zeros at the end of scratch1 buffer */
    *pScr1++ = 0;
    *pScr1++ = 0;
    *pScr1++ = 0;
    *pScr1++ = 0;

    /* Decrement the loop counter */
    k--;
  }

  /* If the count is not a multiple of 4, fill the remaining samples here.
  ** No loop unrolling is used. */
  k = (srcBLen - 1u) % 0x4u;

  while(k > 0u)
  {
    /* fill zeros for the remaining samples */
    *pScr1++ = 0;

    /* Decrement the loop counter */
    k--;
  }

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
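
  /* At this point (note added for clarity; not in the original source):
  ** pScratch1 holds [srcBLen-1 zeros | srcALen input samples | srcBLen-1 zeros]
  ** and pScratch2 holds the shorter sequence reversed. Output sample n is the
  ** dot product of pScratch1[n .. n+srcBLen-1] with pScratch2[0 .. srcBLen-1]. */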
  /* Temporary pointer for scratch2 */
  py = pScratch2;

  /* Initialization of pIn2 pointer */
  pIn2 = py;

  /* First part of the processing with loop unrolling processes 4 data points at a time.
  ** A second loop below processes the remaining 1 to 3 samples. */

  /* Actual convolution process starts here */
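
  /* Intrinsics used in the core loop (note added for clarity; these are the
  ** standard ARM DSP-extension semantics):
  **   __SMLAD(x, y, a)  : a + x[15:0]*y[15:0] + x[31:16]*y[31:16]   (dual signed 16-bit MAC)
  **   __SMLADX(x, y, a) : a + x[15:0]*y[31:16] + x[31:16]*y[15:0]   (same, y halfwords exchanged)
  **   __PKHBT(a, b, n)  : packs a[15:0] into the low halfword and (b << n)[31:16] into the high halfword
  **   __SSAT(v, n)      : saturates v to the signed n-bit range
  ** Each pass of the outer loop computes four consecutive outputs (acc0..acc3);
  ** the __PKHBT repacking builds the one-sample-shifted operand pairs needed by
  ** the odd-indexed outputs. */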
  blkCnt = (srcALen + srcBLen - 1u) >> 2;

  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr1 = pScratch1;

    /* Clear accumulators */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

    /* Read two samples from scratch1 buffer */
    x1 = *__SIMD32(pScr1)++;

    /* Read next two samples from scratch1 buffer */
    x2 = *__SIMD32(pScr1)++;

    tapCnt = (srcBLen) >> 2u;

    while(tapCnt > 0u)
    {

#ifndef UNALIGNED_SUPPORT_DISABLE

      /* Read four samples from smaller buffer */
      y1 = _SIMD32_OFFSET(pIn2);
      y2 = _SIMD32_OFFSET(pIn2 + 2u);

      /* multiply and accumulate */
      acc0 = __SMLAD(x1, y1, acc0);
      acc2 = __SMLAD(x2, y1, acc2);

      /* pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      /* multiply and accumulate */
      acc1 = __SMLADX(x3, y1, acc1);

      /* Read next two samples from scratch1 buffer */
      x1 = _SIMD32_OFFSET(pScr1);

      /* multiply and accumulate */
      acc0 = __SMLAD(x2, y2, acc0);
      acc2 = __SMLAD(x1, y2, acc2);

      /* pack input data */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLADX(x3, y1, acc3);
      acc1 = __SMLADX(x3, y2, acc1);

      x2 = _SIMD32_OFFSET(pScr1 + 2u);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLADX(x3, y2, acc3);

#else

      /* Read four samples from smaller buffer */
      a = *pIn2;
      b = *(pIn2 + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      y1 = __PKHBT(a, b, 16);
#else
      y1 = __PKHBT(b, a, 16);
#endif

      a = *(pIn2 + 2);
      b = *(pIn2 + 3);

#ifndef ARM_MATH_BIG_ENDIAN
      y2 = __PKHBT(a, b, 16);
#else
      y2 = __PKHBT(b, a, 16);
#endif

      /* multiply and accumulate */
      acc0 = __SMLAD(x1, y1, acc0);
      acc2 = __SMLAD(x2, y1, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc1 = __SMLADX(x3, y1, acc1);

      a = *pScr1;
      b = *(pScr1 + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(a, b, 16);
#else
      x1 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLAD(x2, y2, acc0);
      acc2 = __SMLAD(x1, y2, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLADX(x3, y1, acc3);
      acc1 = __SMLADX(x3, y2, acc1);

      a = *(pScr1 + 2);
      b = *(pScr1 + 3);

#ifndef ARM_MATH_BIG_ENDIAN
      x2 = __PKHBT(a, b, 16);
#else
      x2 = __PKHBT(b, a, 16);
#endif

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLADX(x3, y2, acc3);

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */

      /* update scratch pointers */
      pIn2 += 4u;
      pScr1 += 4u;

      /* Decrement the loop counter */
      tapCnt--;
    }

    /* Rewind the scratch pointer; the unrolled loop pre-loaded four samples ahead */
    pScr1 -= 4u;

    /* apply the same scheme to the remaining samples of the smaller-length sequence */
    tapCnt = (srcBLen) & 3u;

    while(tapCnt > 0u)
    {
      /* accumulate the results */
      acc0 += (*pScr1++ * *pIn2);
      acc1 += (*pScr1++ * *pIn2);
      acc2 += (*pScr1++ * *pIn2);
      acc3 += (*pScr1++ * *pIn2++);

      pScr1 -= 3u;

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* The accumulators are in 2.30 format. Shift right by 15 bits, saturate
    ** to 1.15, then pack and store two results per 32-bit write. */
#ifndef ARM_MATH_BIG_ENDIAN
    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
#else
    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
    *__SIMD32(pOut)++ =
      __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* Reset inputB pointer to the start of the reversed sequence */
    pIn2 = py;

    /* Advance the scratch1 window by the four outputs just computed */
    pScratch1 += 4u;
  }
  blkCnt = (srcALen + srcBLen - 1u) & 0x3;

  /* Calculate convolution for the remaining output samples */
  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr1 = pScratch1;

    /* Clear accumulator */
    acc0 = 0;

    tapCnt = (srcBLen) >> 1u;

    while(tapCnt > 0u)
    {
      acc0 += (*pScr1++ * *pIn2++);
      acc0 += (*pScr1++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    tapCnt = (srcBLen) & 1u;

    /* apply the same scheme to the remaining sample of the smaller-length sequence */
    while(tapCnt > 0u)
    {
      /* accumulate the result */
      acc0 += (*pScr1++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* The result is in 2.30 format. Convert to 1.15 with saturation.
    ** Then store the output in the destination buffer. */
    *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));

    /* Reset inputB pointer to the start of the reversed sequence */
    pIn2 = py;

    /* Advance the scratch1 window by one output */
    pScratch1 += 1u;
  }
}
/**
 * @} end of Conv group
 */
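
/* ----------------------------------------------------------------------
 * Usage sketch (editor's addition; not part of the original CMSIS source).
 * It shows one way to size the scratch buffers per the requirements in the
 * function documentation and to pre-scale the inputs to avoid accumulator
 * wrap-around. The lengths, arrays and the conv_example() name are
 * illustrative placeholders; the block is disabled so the file still
 * compiles as a library source.
 * -------------------------------------------------------------------- */
#if 0
#include "arm_math.h"

#define SRCA_LEN 64u                          /* illustrative length of input A */
#define SRCB_LEN 16u                          /* illustrative length of input B */

static q15_t srcA[SRCA_LEN];                  /* first input sequence */
static q15_t srcB[SRCB_LEN];                  /* second input sequence */
static q15_t dst[SRCA_LEN + SRCB_LEN - 1u];   /* output: srcALen + srcBLen - 1 samples */

/* scratch1: max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2 samples */
static q15_t scratch1[SRCA_LEN + 2u * SRCB_LEN - 2u];

/* scratch2: min(srcALen, srcBLen) samples */
static q15_t scratch2[SRCB_LEN];

void conv_example(void)
{
  /* Pre-scale the inputs by a total of log2(min(srcALen, srcBLen)) = log2(16) = 4
  ** bits, split across the two operands (negative shiftBits shifts right), so
  ** that the 2.30 accumulator cannot wrap; see Scaling and Overflow Behavior. */
  arm_shift_q15(srcA, -2, srcA, SRCA_LEN);
  arm_shift_q15(srcB, -2, srcB, SRCB_LEN);

  /* If UNALIGNED_SUPPORT_DISABLE is defined, all five buffers above must be
  ** 32-bit aligned; see Restrictions. */
  arm_conv_fast_opt_q15(srcA, SRCA_LEN, srcB, SRCB_LEN, dst, scratch1, scratch2);
}
#endif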