@ chroma_neon.S
@*****************************************************************************
@ 2015 olegk0 <olegvedi@gmail.com>
@ based on:
@
@ deinterleave_chroma.S : ARM NEONv1 conversion of interleaved to planar chroma
@*****************************************************************************
@ Copyright (C) 2009-2011 Rémi Denis-Courmont
@ Copyright (C) 2013 Martin Storsjö
@
@ This program is free software; you can redistribute it and/or modify
@ it under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/
.syntax unified
.fpu neon
.text
#define UV r0
#define COUNT r1
#define WIDTH r2
#define HEIGHT r3
#define DPITCH r4
#define DPAD r4
#define U r5
#define V r6
#define SPITCH lr
#define SPAD lr

@-----------------------------------------------------------------------------
@ interleave_chroma_neon
@
@ Merges two separate byte planes into one interleaved plane: every output
@ byte pair is (byte of the first plane, byte of the second plane).
@
@ In:    r0 -> two words:   destination pointer, destination pitch
@        r1 -> three words: first plane, second plane, source pitch
@        r2 =  width, r3 = height; both are halved before use
@ Work:  the halved width is rounded up to a multiple of 8, so every row is
@        handled in whole groups of 8 bytes per input plane (16 output bytes).
@        Nothing is done when the resulting width or height is not positive.
@ Saves: r4-r6, lr and d16/d17 are pushed on entry and restored on exit.
@-----------------------------------------------------------------------------
    .align  2
    .global interleave_chroma_neon
    .type   interleave_chroma_neon, %function
interleave_chroma_neon:
    push    {r4-r6, lr}
    vpush   {d16, d17}
    ldm     r0, {UV, DPITCH}
    ldm     r1, {U, V, SPITCH}

    lsr     WIDTH, WIDTH, #1            @ one chroma sample per two pixels in x
    lsr     HEIGHT, HEIGHT, #1          @ ... and per two pixels in y
    add     WIDTH, WIDTH, #7
    bic     WIDTH, WIDTH, #7            @ row length rounded up to a multiple of 8

    cmp     HEIGHT, #0
    ble     .Lic_done
    cmp     WIDTH, #0
    ble     .Lic_done

    @ Pitch minus bytes actually walked per row = step to the next row start.
    sub     DPAD, DPITCH, WIDTH, lsl #1 @ output row holds 2 * WIDTH bytes
    sub     SPAD, SPITCH, WIDTH

.Lic_row:
    mov     COUNT, WIDTH
.Lic_group:
    @ The loads and the store carry no alignment hint.
    pld     [U, #64]
    vld1.u8 {d16}, [U]!                 @ 8 bytes of the first plane
    pld     [V, #64]
    vld1.u8 {d17}, [V]!                 @ 8 bytes of the second plane
    vst2.u8 {d16, d17}, [UV]!           @ 16 bytes, elements of d16/d17 alternating
    subs    COUNT, COUNT, #8
    bgt     .Lic_group

    add     UV, UV, DPAD
    add     U, U, SPAD
    add     V, V, SPAD
    subs    HEIGHT, HEIGHT, #1
    bgt     .Lic_row

.Lic_done:
    vpop    {d16, d17}
    pop     {r4-r6, pc}
@*************************************************************************
#define Y r0
#define COUNT r1
#define WIDTH r2
#define HEIGHT r3
#define UV2 r4
#define DPITCH2 r5
#define DPAD2 r5
#define S r6
#define SPITCH lr
#define SPAD lr

@-----------------------------------------------------------------------------
@ yuyv_semiplanar_neon
@
@ Splits a packed 2-bytes-per-pixel source into two planes: the even source
@ bytes go to the first destination plane, the odd source bytes go to the
@ second one (for YUYV input: luma plane and interleaved chroma plane).
@
@ In:    r0 -> three words: first plane, second plane, destination pitch
@               (the same pitch is used for both destination planes)
@        r1 -> two words:   source pointer, source pitch
@        r2 =  width in pixels, r3 = height in rows
@ Work:  rows are processed in whole groups of 8 pixels (16 source bytes), so
@        the width is rounded up to a multiple of 8 first. The row paddings
@        are derived from that rounded width; otherwise every row after the
@        first would start too far into the buffers whenever the width is not
@        a multiple of 8.
@        Nothing is done when width or height is not positive.
@ Saves: r4-r6, lr and d16/d17 are pushed on entry and restored on exit.
@-----------------------------------------------------------------------------
    .global yuyv_semiplanar_neon
    .type   yuyv_semiplanar_neon, %function
yuyv_semiplanar_neon:
    push    {r4-r6, lr}
    vpush   {d16, d17}
    ldmia   r0, {Y, UV2, DPITCH2}
    ldmia   r1, {S, SPITCH}

    @ The inner loop always consumes 8 pixels per pass; make the bookkeeping
    @ below agree with what the loop really walks.
    add     WIDTH, WIDTH, #7
    bic     WIDTH, WIDTH, #7

    cmp     HEIGHT, #0
    cmpgt   WIDTH, #0
    ble     .Lys_done

    sub     DPAD2, DPITCH2, WIDTH       @ each destination row holds WIDTH bytes
    sub     SPAD, SPITCH, WIDTH, lsl #1 @ each source row holds 2 * WIDTH bytes

.Lys_row:
    mov     COUNT, WIDTH
.Lys_group:
    pld     [S, #64]
    subs    COUNT, COUNT, #8            @ flags survive the NEON load/stores below
    vld2.u8 {d16, d17}, [S]!            @ 16 bytes: even bytes -> d16, odd -> d17
    vst1.u8 {d16}, [Y]!                 @ 8 bytes to the first plane
    vst1.u8 {d17}, [UV2]!               @ 8 bytes to the second plane
    bgt     .Lys_group

    subs    HEIGHT, HEIGHT, #1
    add     S, S, SPAD
    add     Y, Y, DPAD2
    add     UV2, UV2, DPAD2             @ second plane rows use the same pitch
    bgt     .Lys_row

.Lys_done:
    vpop    {d16, d17}
    pop     {r4-r6, pc}
@*************************************************************************
#define dY r0
@ DPITCH/DPAD may already name another register earlier in this file.
#undef DPITCH
#undef DPAD
#define DPITCH r1
#define DPAD r1
#define WIDTH r2
#define HEIGHT r3
#define sY r4
#define CNT r5
#define BWIDTH r6
#define SPITCH lr
#define SPAD lr

@-----------------------------------------------------------------------------
@ copy_neon
@
@ Copies a rectangle of bytes row by row between two pitched buffers.
@
@ In:    r0 -> three words: destination pointer, source pointer, source pitch
@        r1 =  destination pitch
@        r2 =  width in bytes, r3 = height in rows
@ Work:  both pitches are rounded down to a multiple of 4.
@        Every row except the last is copied in 32-byte chunks, with the
@        width rounded up to a multiple of 32 (so up to 31 extra bytes per
@        row are copied).
@        The last row is copied in 4-byte words, with the width rounded down
@        to a multiple of 4; the word loop always moves at least one word.
@        A single-row image goes straight to the word loop. Earlier revisions
@        returned without copying anything for heights of 1 or 2.
@        Nothing is done when width or height is not positive.
@ Saves: r4-r6, lr and d16-d19 are pushed on entry and restored on exit.
@ NOTE(review): vldm/vstm and ldr/str are used on the raw pointers, so the
@        buffers are presumably expected to be word aligned - confirm with
@        the callers.
@-----------------------------------------------------------------------------
    .global copy_neon
    .type   copy_neon, %function
copy_neon:
    push    {r4-r6, lr}
    vpush   {d16-d19}
    ldmia   r0, {dY, sY, SPITCH}

    cmp     HEIGHT, #0
    cmpgt   WIDTH, #0
    ble     .Lcp_done

    mov     BWIDTH, WIDTH
    bic     BWIDTH, BWIDTH, #3          @ last-row length, whole words only
    add     WIDTH, WIDTH, #31
    bic     WIDTH, WIDTH, #31           @ chunked-row length, rounded up to 32
    bic     DPITCH, DPITCH, #3
    bic     SPITCH, SPITCH, #3
    sub     DPAD, DPITCH, WIDTH         @ step from end of chunked row to next row
    sub     SPAD, SPITCH, WIDTH

    subs    HEIGHT, HEIGHT, #1          @ rows for the chunked loop = height - 1
    beq     .Lcp_tail                   @ single row: only the word-accurate copy

.Lcp_row:
    mov     CNT, WIDTH
.Lcp_chunk:
    vldm    sY!, {d16-d19}              @ 32 bytes in
    vstm    dY!, {d16-d19}              @ 32 bytes out
    subs    CNT, CNT, #32
    bgt     .Lcp_chunk

    add     dY, dY, DPAD
    add     sY, sY, SPAD
    subs    HEIGHT, HEIGHT, #1
    bne     .Lcp_row

    @ Last row: word by word, so the copy ends within 4 bytes of the width.
.Lcp_tail:
    ldr     CNT, [sY], #4               @ CNT doubles as the data scratch here
    str     CNT, [dY], #4
    subs    BWIDTH, BWIDTH, #4
    bgt     .Lcp_tail

.Lcp_done:
    vpop    {d16-d19}
    pop     {r4-r6, pc}
@*************************************************************************
#define DST r0
#define FILL r1
#define SIZE r2

@-----------------------------------------------------------------------------
@ memset_neon
@
@ Fills memory with one byte value, 32 bytes per store.
@
@ In:    r0 = destination pointer
@        r1 = fill value (low byte is replicated)
@        r2 = size in bytes
@ Work:  the size is rounded DOWN to a multiple of 32; a tail of fewer than
@        32 bytes is left untouched, and nothing is written when the rounded
@        size is zero.
@ Out:   r0 is advanced past the last byte written (store with writeback).
@ Saves: lr and d16-d19 are pushed on entry and restored on exit.
@-----------------------------------------------------------------------------
    .global memset_neon
    .type   memset_neon, %function
memset_neon:
    push    {lr}
    vpush   {d16-d19}

    bics    SIZE, SIZE, #31             @ round down; Z set when nothing remains
    beq     .Lms_done

    @ Build 32 bytes of the fill pattern in d16-d19. The copies must come from
    @ d16: d0 is never loaded by this routine and holds whatever the caller
    @ left in it.
    vdup.8  d16, FILL
    vmov    d17, d16
    vmov    d18, d16
    vmov    d19, d16

.Lms_store:
    vstm    DST!, {d16-d19}
    subs    SIZE, SIZE, #32
    bgt     .Lms_store

.Lms_done:
    vpop    {d16-d19}
    pop     {pc}