summaryrefslogtreecommitdiffstats
path: root/lib/rbcodec/codecs/libopus/celt/arm/celt_pitch_xcorr_arm.s
blob: 6e873afc3712724fed83734b4b3ce8184e1919f7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
; Copyright (c) 2007-2008 CSIRO
; Copyright (c) 2007-2009 Xiph.Org Foundation
; Copyright (c) 2013      Parrot
; Written by Aurélien Zanelli
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  ; Everything in this file is assembled into the read-only |.text| code area.
  AREA  |.text|, CODE, READONLY

  ; Include celt/arm/armopts.s. Presumably that file defines the
  ; OPUS_ARM_MAY_HAVE_* symbols tested below - confirm against armopts.s.
  GET    celt/arm/armopts.s

; Each public entry point is exported only when its OPUS_ARM_MAY_HAVE_* symbol
; is set; the same symbols guard the corresponding definitions further down.
IF OPUS_ARM_MAY_HAVE_EDSP
  EXPORT celt_pitch_xcorr_edsp
ENDIF

IF OPUS_ARM_MAY_HAVE_NEON
  EXPORT celt_pitch_xcorr_neon
ENDIF

IF OPUS_ARM_MAY_HAVE_NEON

; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
xcorr_kernel_neon PROC
xcorr_kernel_neon_start
  ; Internal helper: accumulates four adjacent cross-correlation lags into q0,
  ;   sum[k] += x[0]*y[k] + ... + x[len-1]*y[len-1+k],  k = 0...3.
  ; It uses the private register convention below, never touches the stack,
  ; and returns with MOV pc, lr.
  ; input:
  ;   r3     = int         len
  ;   r4     = opus_val16 *x
  ;   r5     = opus_val16 *y
  ;   q0     = opus_val32  sum[4]
  ; output:
  ;   q0     = opus_val32  sum[4]
  ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
  ; modified:  r4 and r5 (post-incremented by the loads), r12, the condition
  ;            flags, d3, q2, q3, q8
  ; internal usage:
  ;   r12 = int j
  ;   d3  = y_3|y_2|y_1|y_0
  ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
  ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
  ;   q8  = scratch
  ;
  ; Load y[0...3]
  ; This requires len>0 to always be valid (which we assert in the C code).
  ; Between stages, d5 always holds the four most recently loaded y values,
  ; i.e. the y window that lines up with the next unprocessed x sample.
  VLD1.16      {d5}, [r5]!
  ; j = len-8. While in the 8-sample loop, r12 is 8 less than the number of
  ; samples still to process.
  SUBS         r12, r3, #8
  BLE xcorr_kernel_neon_process4
; Process 8 samples at a time.
; This loop loads one y value more than we actually need. Therefore we have to
; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
; reading past the end of the array.
xcorr_kernel_neon_process8
  ; This loop has 19 total instructions (10 cycles to issue, minimum), with
  ; - 2 cycles of ARM instructions,
  ; - 10 cycles of load/store/byte permute instructions, and
  ; - 9 cycles of data processing instructions.
  ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
  ; latter two categories, meaning the whole loop should run in 10 cycles per
  ; iteration, barring cache misses.
  ;
  ; Load x[0...7]
  VLD1.16      {d6, d7}, [r4]!
  ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
  ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  ; d3 = y[0...3] (copied out of d5 before d5 is overwritten below).
  VAND         d3, d5, d5
  ; The flags set here are consumed by the BGT at the bottom of the loop; only
  ; NEON instructions sit in between.
  SUBS         r12, r12, #8
  ; Load y[4...11]
  VLD1.16      {d4, d5}, [r5]!
  ; Each VMLAL multiplies a 4-wide y window by one x sample (a scalar lane of
  ; d6/d7) and adds the products to sum[0...3]. The VEXTs build the windows
  ; that start 1, 2 and 3 elements into d3:d4 (for x_1...x_3) and into d4:d5
  ; (for x_5...x_7).
  VMLAL.S16    q0, d3, d6[0]
  VEXT.16      d16, d3, d4, #1
  VMLAL.S16    q0, d4, d7[0]
  VEXT.16      d17, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d3, d4, #2
  VMLAL.S16    q0, d17, d7[1]
  VEXT.16      d17, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d3, d4, #3
  VMLAL.S16    q0, d17, d7[2]
  VEXT.16      d17, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
  VMLAL.S16    q0, d17, d7[3]
  BGT xcorr_kernel_neon_process8
; Process 4 samples here if we have > 4 left (still reading one extra y value).
xcorr_kernel_neon_process4
  ; r12 = (samples left)-8 on entry; after the ADDS it is (samples left)-4.
  ADDS         r12, r12, #4
  BLE xcorr_kernel_neon_process2
  ; Load x[0...3]
  VLD1.16      d6, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  ; Account for the 4 samples handled here, so that r12 is again
  ; (samples left)-4 when we fall through to process2.
  SUB          r12, r12, #4
  ; Load y[4...7]
  VLD1.16      d5, [r5]!
  VMLAL.S16    q0, d4, d6[0]
  VEXT.16      d16, d4, d5, #1
  VMLAL.S16    q0, d16, d6[1]
  VEXT.16      d16, d4, d5, #2
  VMLAL.S16    q0, d16, d6[2]
  VEXT.16      d16, d4, d5, #3
  VMLAL.S16    q0, d16, d6[3]
; Process 2 samples here if we have > 2 left (still reading one extra y value).
xcorr_kernel_neon_process2
  ; r12 = (samples left)-4 on entry; after the ADDS it is (samples left)-2.
  ADDS         r12, r12, #2
  BLE xcorr_kernel_neon_process1
  ; Load x[0...1]
  ; (x_0 is replicated across every lane of d6, and x_1 across d7.)
  VLD2.16      {d6[],d7[]}, [r4]!
  ; Use VAND since it's a data processing instruction again.
  VAND         d4, d5, d5
  SUB          r12, r12, #2
  ; Load y[4...5]
  ; (The {y5,y4} pair is replicated into both 32-bit halves of d5.)
  VLD1.32      {d5[]}, [r5]!
  VMLAL.S16    q0, d4, d6
  VEXT.16      d16, d4, d5, #1
  ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
  ; instead of VEXT, since it's a data-processing instruction.
  ; This leaves d5 = y_5|y_4|y_3|y_2, the window for the next x sample.
  VSRI.64      d5, d4, #32
  VMLAL.S16    q0, d16, d7
; Process 1 sample using the extra y value we loaded above.
xcorr_kernel_neon_process1
  ; Load next *x
  VLD1.16      {d6[]}, [r4]!
  ; r12 = (samples left)-2 on entry, so the result is <= 0 exactly when the
  ; sample handled here is the last one.
  ADDS         r12, r12, #1
  ; y[0...3] are left in d5 from prior iteration(s) (if any)
  VMLAL.S16    q0, d5, d6
  ; Return now if that was the last sample (flags from the ADDS above).
  MOVLE        pc, lr
; Now process 1 last sample, not reading ahead.
  ; Load last *y
  VLD1.16      {d4[]}, [r5]!
  ; Shift the previous window down by one element underneath the new y value,
  ; which stays in the top lane: d4 = y_new|y_3|y_2|y_1.
  VSRI.64      d4, d5, #16
  ; Load last *x
  VLD1.16      {d6[]}, [r4]!
  VMLAL.S16    q0, d4, d6
  MOV          pc, lr
  ENDP

; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
;  opus_val32 *xcorr, int len, int max_pitch, int arch)
celt_pitch_xcorr_neon PROC
  ; Computes xcorr[i] = sum(_x[j]*_y[i+j], j=0...len-1) for i=0...max_pitch-1
  ; and returns the largest sum seen. maxcorr is initialized to 1, so the
  ; return value is never less than 1.
  ; input:
  ;   r0  = opus_val16 *_x
  ;   r1  = opus_val16 *_y
  ;   r2  = opus_val32 *xcorr
  ;   r3  = int         len
  ;   [sp] = int        max_pitch (fifth argument, read from the stack)
  ; output:
  ;   r0  = int         maxcorr
  ; internal usage:
  ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
  ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
  ;   r6  = int         max_pitch
  ;   r12 = int         j
  ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
  ; ignored:
  ;         int         arch
  STMFD        sp!, {r4-r6, lr}
  ; max_pitch sits just above the 4 registers (16 bytes) pushed above.
  LDR          r6, [sp, #16]
  ; maxcorr[0...3] = 1
  VMOV.S32     q15, #1
  ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
  ; (While in the 4-at-a-time loop, r6 holds the number of sums left minus 4.)
  SUBS         r6, r6, #4
  BLT celt_pitch_xcorr_neon_process4_done
celt_pitch_xcorr_neon_process4
  ; xcorr_kernel_neon parameters:
  ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
  MOV          r4, r0
  MOV          r5, r1
  VEOR         q0, q0, q0
  ; xcorr_kernel_neon modifies r4, r5, r12, the flags, q0, d3, q2, q3, and q8
  ; (scratch). Nothing that is live across the call here (r0-r3, r6, q15) is
  ; among them, so we don't save/restore any other registers.
  BL xcorr_kernel_neon_start
  ; The flags set here are consumed by the BGE at the bottom of the loop.
  SUBS         r6, r6, #4
  ; xcorr[i...i+3] = sum[0...3]
  VST1.32      {q0}, [r2]!
  ; _y += 4
  ADD          r1, r1, #8
  ; maxcorr[k] = max(maxcorr[k], sum[k])
  VMAX.S32     q15, q15, q0
  ; if (at least 4 more sums remain) goto celt_pitch_xcorr_neon_process4
  BGE celt_pitch_xcorr_neon_process4
; We have less than 4 sums left to compute.
celt_pitch_xcorr_neon_process4_done
  ; r6 = number of sums left (0...3); the flags are consumed by the BLE below.
  ADDS         r6, r6, #4
  ; Reduce maxcorr to a single value
  ; (After these two instructions both lanes of d30 hold the overall maximum.)
  VMAX.S32     d30, d30, d31
  VPMAX.S32    d30, d30, d30
  ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
  BLE celt_pitch_xcorr_neon_done
; Now compute each remaining sum one at a time.
celt_pitch_xcorr_neon_process_remaining
  MOV          r4, r0
  MOV          r5, r1
  VMOV.I32     q0, #0
  ; j = len-8
  SUBS         r12, r3, #8
  BLT celt_pitch_xcorr_neon_process_remaining4
; Sum terms 8 at a time.
celt_pitch_xcorr_neon_process_remaining_loop8
  ; Load x[0...7]
  VLD1.16      {q1}, [r4]!
  ; Load y[0...7]
  VLD1.16      {q2}, [r5]!
  SUBS         r12, r12, #8
  ; Four partial sums are kept in q0, one per lane.
  VMLAL.S16    q0, d4, d2
  VMLAL.S16    q0, d5, d3
  BGE celt_pitch_xcorr_neon_process_remaining_loop8
; Sum terms 4 at a time.
celt_pitch_xcorr_neon_process_remaining4
  ; r12 = (terms left)-8 on entry; after the ADDS it is (terms left)-4.
  ADDS         r12, r12, #4
  BLT celt_pitch_xcorr_neon_process_remaining4_done
  ; Load x[0...3]
  VLD1.16      {d2}, [r4]!
  ; Load y[0...3]
  VLD1.16      {d3}, [r5]!
  SUB          r12, r12, #4
  VMLAL.S16    q0, d3, d2
celt_pitch_xcorr_neon_process_remaining4_done
  ; Reduce the sum to a single value.
  ; The pairwise add leaves the total in d0; only its low 32 bits (d0[0]) are
  ; used from here on.
  VADD.S32     d0, d0, d1
  VPADDL.S32   d0, d0
  ; r12 = number of terms left (0...3)
  ADDS         r12, r12, #4
  BLE celt_pitch_xcorr_neon_process_remaining_loop_done
; Sum terms 1 at a time.
celt_pitch_xcorr_neon_process_remaining_loop1
  ; *x and *y are each replicated across a D register, so every lane of q0
  ; receives the product; only lane d0[0] is stored and returned below.
  VLD1.16      {d2[]}, [r4]!
  VLD1.16      {d3[]}, [r5]!
  SUBS         r12, r12, #1
  VMLAL.S16    q0, d2, d3
  BGT celt_pitch_xcorr_neon_process_remaining_loop1
celt_pitch_xcorr_neon_process_remaining_loop_done
  ; xcorr[i] = sum
  VST1.32      {d0[0]}, [r2]!
  ; maxcorr = max(maxcorr, sum); only lane d30[0] is read at the end.
  VMAX.S32     d30, d30, d0
  SUBS         r6, r6, #1
  ; _y++
  ADD          r1, r1, #2
  ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
  BGT celt_pitch_xcorr_neon_process_remaining
celt_pitch_xcorr_neon_done
  ; return maxcorr
  VMOV.32      r0, d30[0]
  LDMFD        sp!, {r4-r6, pc}
  ENDP

ENDIF

IF OPUS_ARM_MAY_HAVE_EDSP

; This will get used on ARMv7 devices without NEON, so it has been optimized
; to take advantage of dual-issuing where possible.
xcorr_kernel_edsp PROC
xcorr_kernel_edsp_start
  ; Internal helper: accumulates four adjacent cross-correlation lags into
  ; r6...r9,
  ;   sum[k] += x[0]*y[k] + ... + x[len-1]*y[len-1+k],  k = 0...3,
  ; using the 16x16+32 multiply-accumulate instructions (SMLAxy).
  ; input:
  ;   r3      = int         len
  ;   r4      = opus_val16 *_x (must be 32-bit aligned)
  ;   r5      = opus_val16 *_y (must be 32-bit aligned)
  ;   r6...r9 = opus_val32  sum[4]
  ; output:
  ;   r6...r9 = opus_val32  sum[4]
  ; preserved: r0-r5
  ;   (r2, r4 and r5 are saved and restored on the stack; r0, r1 and r3 are
  ;   never written.)
  ; modified:  r10, r11, r12, the condition flags
  ; internal usage
  ;   r2      = int         j
  ;   r12,r14 = opus_val16  x[4]
  ;   r10,r11 = opus_val16  y[4]
  STMFD        sp!, {r2,r4,r5,lr}
  LDR          r10, [r5], #4      ; Load y[0...1]
  SUBS         r2, r3, #4         ; j = len-4
  LDR          r11, [r5], #4      ; Load y[2...3]
  BLE xcorr_kernel_edsp_process4_done
  LDR          r12, [r4], #4      ; Load x[0...1]
  ; Stall
xcorr_kernel_edsp_process4
  ; The multiplies must issue from pipeline 0, and can't dual-issue with each
  ; other. Every other instruction here dual-issues with a multiply, and is
  ; thus "free". There should be no stalls in the body of the loop.
  ; Each 32-bit register holds two 16-bit samples; the B/T suffixes of SMLAxy
  ; select the bottom or top halfword of each operand.
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
  LDR          r14, [r4], #4      ; Load x[2...3]
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
  SUBS         r2, r2, #4         ; j-=4
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
  SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
  LDR          r10, [r5], #4      ; Load y[4...5]
  SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
  SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
  SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
  LDRGT        r12, [r4], #4      ; Load x[0...1]
  SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
  SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
  SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
  SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
  SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
  LDR          r11, [r5], #4      ; Load y[6...7]
  SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
  SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
  SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
  BGT xcorr_kernel_edsp_process4
xcorr_kernel_edsp_process4_done
  ; Tail: the remaining samples (at most 4) are handled one at a time, loading
  ; x, and any further y values, a halfword at a time. On entry r10 and r11
  ; hold the four y values that line up with the next x sample.
  ADDS         r2, r2, #4
  BLE xcorr_kernel_edsp_done
  LDRH         r12, [r4], #2      ; r12 = *x++
  SUBS         r2, r2, #1         ; j--
  ; Stall
  SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
  LDRHGT       r14, [r4], #2      ; r14 = *x++
  SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
  SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
  SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
  SUBS         r2, r2, #1         ; j--
  SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
  LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
  SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
  LDRHGT       r12, [r4], #2      ; r12 = *x++
  SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
  BLE xcorr_kernel_edsp_done
  SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
  CMP          r2, #1             ; j>1? (no decrement: r2 is reused for y_5)
  SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
  LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
  SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
  LDRHGT       r14, [r4]          ; r14 = *x
  SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
  BLE xcorr_kernel_edsp_done
  SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
  LDRH         r11, [r5]          ; r11 = y_6 = *y
  SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
  SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
  SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
xcorr_kernel_edsp_done
  ; Restore the caller's r2, r4 and r5 and return.
  LDMFD        sp!, {r2,r4,r5,pc}
  ENDP

celt_pitch_xcorr_edsp PROC
  ; Computes xcorr[i] = sum(_x[j]*_y[i+j], j=0...len-1) for i=0...max_pitch-1
  ; and returns the largest sum seen. maxcorr is initialized to 1, so the
  ; return value is never less than 1.
  ; Structure: one sum up front if _y is not 32-bit aligned, then four sums at
  ; a time via xcorr_kernel_edsp, then at most one pair of sums, then at most
  ; one final sum.
  ; input:
  ;   r0  = opus_val16 *_x (must be 32-bit aligned)
  ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
  ;   r2  = opus_val32 *xcorr
  ;   r3  = int         len
  ;   [sp] = int        max_pitch (fifth argument, read from the stack)
  ; output:
  ;   r0  = maxcorr
  ; internal usage
  ;   r4  = opus_val16 *x
  ;   r5  = opus_val16 *y
  ;   r6  = opus_val32  sum0
  ;   r7  = opus_val32  sum1
  ;   r8  = opus_val32  sum2
  ;   r9  = opus_val32  sum3
  ;   r1  = int         max_pitch
  ;   r12 = int         j
  ; ignored:
  ;         int         arch
  STMFD        sp!, {r4-r11, lr}
  MOV          r5, r1
  ; max_pitch sits just above the 9 registers (36 bytes) pushed above.
  LDR          r1, [sp, #36]
  MOV          r4, r0
  ; Is y already 32-bit aligned? (The flags are consumed by the BEQ below;
  ; the MOV in between does not set them.)
  TST          r5, #3
  ; maxcorr = 1
  MOV          r0, #1
  BEQ          celt_pitch_xcorr_edsp_process1u_done
; Compute one sum at the start to make y 32-bit aligned.
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  ; r8 = y_0, loaded as a halfword; after this load y is 32-bit aligned.
  LDRH         r8, [r5], #2
  BLE celt_pitch_xcorr_edsp_process1u_loop4_done
  LDR          r6, [r4], #4
  ; Inside the loop the pending y value is kept in the top half of r8.
  MOV          r8, r8, LSL #16
celt_pitch_xcorr_edsp_process1u_loop4
  LDR          r9, [r5], #4
  SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  LDR          r7, [r4], #4
  SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDR          r8, [r5], #4
  SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  SUBS         r12, r12, #4         ; j-=4
  SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGT        r6, [r4], #4
  BGT celt_pitch_xcorr_edsp_process1u_loop4
  ; Move the pending y value back to the bottom half for the SMLABB below.
  MOV          r8, r8, LSR #16
celt_pitch_xcorr_edsp_process1u_loop4_done
  ADDS         r12, r12, #4
celt_pitch_xcorr_edsp_process1u_loop1
  ; One term per iteration. The next y is only loaded when another term
  ; follows, so y is never read further than this sum needs.
  LDRHGE       r6, [r4], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
  SUBSGE       r12, r12, #1
  LDRHGT       r8, [r5], #2
  BGT celt_pitch_xcorr_edsp_process1u_loop1
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  ; (y was advanced by len elements above; step back len and forward 1.)
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ADD          r5, r5, #2
  MOVLT        r0, r14
  ; max_pitch--
  SUBS         r1, r1, #1
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  ; Done if that was the only sum requested (flags from the SUBS above).
  BLE celt_pitch_xcorr_edsp_done
celt_pitch_xcorr_edsp_process1u_done
  ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
  ; (While in the 4-at-a-time loop, r1 holds the number of sums left minus 4.)
  SUBS         r1, r1, #4
  BLT celt_pitch_xcorr_edsp_process2
celt_pitch_xcorr_edsp_process4
  ; xcorr_kernel_edsp parameters:
  ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
  MOV          r6, #0
  MOV          r7, #0
  MOV          r8, #0
  MOV          r9, #0
  BL xcorr_kernel_edsp_start  ; sum[0...3] = xcorr_kernel_edsp(_x, _y+i, len)
  ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
  CMP          r0, r6
  ; _y+=4
  ADD          r5, r5, #8
  MOVLT        r0, r6
  CMP          r0, r7
  MOVLT        r0, r7
  CMP          r0, r8
  MOVLT        r0, r8
  CMP          r0, r9
  MOVLT        r0, r9
  ; xcorr[i...i+3] = sum[0...3]
  STMIA        r2!, {r6-r9}
  SUBS         r1, r1, #4
  BGE celt_pitch_xcorr_edsp_process4
celt_pitch_xcorr_edsp_process2
  ; r1 = (sums left)-4 on entry; after the ADDS it is (sums left)-2.
  ; if (fewer than 2 sums left) goto celt_pitch_xcorr_edsp_process1a
  ADDS         r1, r1, #2
  BLT celt_pitch_xcorr_edsp_process1a
  SUBS         r12, r3, #4
  ; {r10, r11} = {sum0, sum1} = {0, 0}
  MOV          r10, #0
  MOV          r11, #0
  LDR          r8, [r5], #4
  BLE celt_pitch_xcorr_edsp_process2_loop_done
  LDR          r6, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process2_loop4
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r7, [r4], #4
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  LDR          r8, [r5], #4
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
  LDRGT        r6, [r4], #4
  SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
  SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
  SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
  LDRGT        r9, [r5], #4
  SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
  BGT celt_pitch_xcorr_edsp_process2_loop4
celt_pitch_xcorr_edsp_process2_loop_done
  ; r12 = (terms left)-4 on entry; handle one pair of terms here if more
  ; than 2 are left. r8 holds the two y values that line up with the next x.
  ADDS         r12, r12, #2
  BLE  celt_pitch_xcorr_edsp_process2_1
  LDR          r6, [r4], #4
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDR          r9, [r5], #4
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  SUB          r12, r12, #2
  SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  ; Keep the invariant that r8 holds the next pair of y values.
  MOV          r8, r9
  SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_1
  ; One or two terms are left: r12 = (terms left)-2 on entry.
  LDRH         r6, [r4], #2
  ADDS         r12, r12, #1
  ; Stall
  SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
  LDRHGT       r7, [r4], #2
  SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
  BLE celt_pitch_xcorr_edsp_process2_done
  LDRH         r9, [r5], #2
  SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
  SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
celt_pitch_xcorr_edsp_process2_done
  ; Restore _x
  SUB          r4, r4, r3, LSL #1
  ; Restore and advance _y
  ; (y was advanced by len+1 elements above; stepping back len and forward 1
  ; leaves it 2 elements past where this pair started.)
  SUB          r5, r5, r3, LSL #1
  ; maxcorr = max(maxcorr, sum0)
  CMP          r0, r10
  ADD          r5, r5, #2
  MOVLT        r0, r10
  ; Account for the two sums just computed.
  SUB          r1, r1, #2
  ; maxcorr = max(maxcorr, sum1)
  CMP          r0, r11
  ; xcorr[i] = sum
  STR          r10, [r2], #4
  MOVLT        r0, r11
  STR          r11, [r2], #4
celt_pitch_xcorr_edsp_process1a
  ; r1 = (sums left)-2 on entry; at most one sum is left at this point.
  ; if (no sums left) goto celt_pitch_xcorr_edsp_done
  ADDS         r1, r1, #1
  BLT celt_pitch_xcorr_edsp_done
  SUBS         r12, r3, #4
  ; r14 = sum = 0
  MOV          r14, #0
  BLT celt_pitch_xcorr_edsp_process1a_loop_done
  LDR          r6, [r4], #4
  LDR          r8, [r5], #4
  LDR          r7, [r4], #4
  LDR          r9, [r5], #4
celt_pitch_xcorr_edsp_process1a_loop4
  SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBS         r12, r12, #4         ; j-=4
  SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  LDRGE        r6, [r4], #4
  SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
  LDRGE        r8, [r5], #4
  SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
  LDRGE        r7, [r4], #4
  LDRGE        r9, [r5], #4
  BGE celt_pitch_xcorr_edsp_process1a_loop4
celt_pitch_xcorr_edsp_process1a_loop_done
  ; r12 = (terms left)-4 on entry. The GE-conditional instructions below
  ; handle a pair of terms if at least 2 are left, then a single term if one
  ; is still left after that.
  ADDS         r12, r12, #2
  LDRGE        r6, [r4], #4
  LDRGE        r8, [r5], #4
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
  SUBGE        r12, r12, #2
  SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
  ADDS         r12, r12, #1
  LDRHGE       r6, [r4], #2
  LDRHGE       r8, [r5], #2
  ; Stall
  SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
  ; maxcorr = max(maxcorr, sum)
  CMP          r0, r14
  ; xcorr[i] = sum
  STR          r14, [r2], #4
  MOVLT        r0, r14
celt_pitch_xcorr_edsp_done
  ; Restore the callee-saved registers and return maxcorr in r0.
  LDMFD        sp!, {r4-r11, pc}
  ENDP

ENDIF

END