-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathx86_function.s
379 lines (290 loc) · 12.1 KB
/
x86_function.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
; =============================================================================
; [PROJECT]
;
; Name: Triangle Shading
; Desc: Drawing a triangle based on Gouraud shading
; Arch: x86_64
; Lang: C++, ASM (Intel syntax)
; Tech: AVX, AVX2
; Libs: SFML 2.4
;
; [FILE]
; Name: x86_function.s
; Desc: Contains the function, written in assembly, that draws the triangle
;
; [FUNCTION]
;
; Name: x86_function
; Desc: Draws a triangle by modifying pixels array, Gouraud shading
; Arch: x86_64
; Lang: ASM (Intel syntax)
; Tech: AVX, AVX2
; Args: *vertices, *pixels, int width, int height, *mask
; Rets: Returns nothing
;
; Auth: Gerard Wypych
; =============================================================================
section .text
global x86_function
;------------------------------------------------------------------------------
; void x86_function(float *vertices, void *pixels,
;                   int width, int height, const void *mask)
;
; ABI:   System V AMD64
; In:    rdi = vertices  3 vertices of 8 floats each: [x][y][z][w][r][g][b][a]
;        rsi = pixels    32-bit pixels, row-major, `width` pixels per row
;        edx = width     image width in pixels  (int, sign-extended here)
;        ecx = height    image height in pixels (int, sign-extended here)
;        r8  = mask      16-byte shuffle control for vpshufb; applied to the
;                        four colour channels after float->int32 conversion
;                        (presumably picks the low byte of each channel --
;                        confirm against the caller)
; Out:   nothing; pixels covered by the triangle are overwritten
; Clobb: rax, rcx, rdx, rdi, r8-r11, ymm0-ymm15, flags
;        (rbx, r12-r15 are used but saved and restored)
;
; Method: the vertices are sorted by rounded y.  The edge v1->v3 is walked
; over the whole height while the opposite side follows v1->v2 (upper half)
; and then v2->v3 (lower half).  Every attribute (position and colour) is
; interpolated linearly along the edges and then along each row.
; Rows and columns that fall outside width x height are skipped, so nothing
; is written outside the image.
;
; NOTE(review): when v2.y == v3.y (flat bottom) the row y2 itself is not
; drawn: the upper loop stops at y < y2 and the lower half is skipped.
;------------------------------------------------------------------------------
x86_function:
        push    rbp                         ; frame pointer kept for debuggers
        mov     rbp, rsp
        push    rbx                         ; callee-saved registers used below
        push    r12
        push    r13
        push    r14
        push    r15

; --- layout of one vertex ----------------------------------------------------
%define VERTEX_SIZE     32                  ; 8 floats
%define VERTEX_Y        4                   ; byte offset of the y coordinate

; --- register roles ----------------------------------------------------------
%define img_pixels      rsi                 ; base address of the pixel buffer
%define img_width       r10                 ; width, sign-extended to 64 bits
%define img_height      rcx                 ; height, sign-extended to 64 bits
%define shuf_mask       xmm9                ; vpshufb control vector
%define vtx1_ptr        rdi                 ; vertex pointers, sorted by y
%define vtx2_ptr        r11
%define vtx3_ptr        r12
%define row1            r13                 ; rounded y of the sorted vertices
%define row2            r14
%define row3            r15
%define row             r8                  ; current scanline
%define col             r9                  ; current column inside a span
%define col_last        rbx                 ; last column of the span
%define pix_ptr         rdx                 ; address of the current pixel
%define vtx1            ymm1                ; full sorted vertices
%define vtx2            ymm2
%define vtx3            ymm3
%define vtx1_y          xmm10               ; float y of the sorted vertices
%define vtx2_y          xmm11
%define vtx3_y          xmm12
%define edge_a          ymm4                ; point walking along v1->v3
%define edge_a_x        xmm4                ;   lane 0 = its x
%define edge_b          ymm5                ; point walking v1->v2, then v2->v3
%define edge_b_x        xmm5                ;   lane 0 = its x
%define slope13         ymm6                ; per-row change along v1->v3
%define slope12         ymm7                ; per-row change along v1->v2
%define slope23         ymm8                ; per-row change along v2->v3
%define span_step       ymm13               ; per-column change inside a row
%define span_step_x     xmm13               ;   lane 0 = xE - xB before division
%define span_val        ymm15               ; interpolated vertex at `col`

        ; The upper halves of edx/ecx are not guaranteed for `int` arguments,
        ; so widen them explicitly before using them in 64-bit arithmetic.
        movsxd  img_width, edx
        movsxd  img_height, ecx
        test    img_width, img_width
        jle     .done                       ; empty image: nothing to draw
        test    img_height, img_height
        jle     .done

        vmovups shuf_mask, [r8]             ; r8 is reused as `row` afterwards

;------------------------------------------------------------------------------
; [1] sort the three vertices by rounded y (ascending)
        lea     vtx2_ptr, [rdi + VERTEX_SIZE]
        lea     vtx3_ptr, [rdi + 2*VERTEX_SIZE]

        ; scalar loads: read exactly the 4 bytes of y, never past the array
        vcvtss2si row1, dword [vtx1_ptr + VERTEX_Y]
        vcvtss2si row2, dword [vtx2_ptr + VERTEX_Y]
        vcvtss2si row3, dword [vtx3_ptr + VERTEX_Y]

        cmp     row1, row2                  ; if (y1 > y2) swap(v1, v2)
        jle     .order_13
        xchg    vtx1_ptr, vtx2_ptr
        xchg    row1, row2
.order_13:
        cmp     row1, row3                  ; if (y1 > y3) swap(v1, v3)
        jle     .order_23
        xchg    vtx1_ptr, vtx3_ptr
        xchg    row1, row3
.order_23:
        cmp     row2, row3                  ; if (y2 > y3) swap(v2, v3)
        jle     .sorted
        xchg    vtx2_ptr, vtx3_ptr
        xchg    row2, row3
.sorted:
        vmovups vtx1, [vtx1_ptr]
        vmovups vtx2, [vtx2_ptr]
        vmovups vtx3, [vtx3_ptr]
        vmovss  vtx1_y, dword [vtx1_ptr + VERTEX_Y]
        vmovss  vtx2_y, dword [vtx2_ptr + VERTEX_Y]
        vmovss  vtx3_y, dword [vtx3_ptr + VERTEX_Y]

;------------------------------------------------------------------------------
; [2] long edge v1->v3
        cmp     row1, row3
        je      .done                       ; zero height: nothing to draw

        ; slope13 = (v3 - v1) / (y3 - y1); rounded rows differ, so the float
        ; difference is non-zero here
        vsubps  slope13, vtx3, vtx1
        vsubss  xmm0, vtx3_y, vtx1_y
        vbroadcastss ymm0, xmm0
        vdivps  slope13, slope13, ymm0

        vmovups edge_a, vtx1                ; both walkers start at v1
        vmovups edge_b, vtx1

;------------------------------------------------------------------------------
; [3] upper half: rows y1 .. y2-1, edge_b follows v1->v2
        cmp     row1, row2
        je      .lower_half                 ; flat top: no upper half

        vsubps  slope12, vtx2, vtx1         ; slope12 = (v2 - v1) / (y2 - y1)
        vsubss  xmm0, vtx2_y, vtx1_y
        vbroadcastss ymm0, xmm0
        vdivps  slope12, slope12, ymm0

        mov     row, row1
.upper_row:
        call    .draw_span
        vaddps  edge_a, edge_a, slope13     ; step both walkers to the next row
        vaddps  edge_b, edge_b, slope12
        inc     row
        cmp     row, row2
        jl      .upper_row

;------------------------------------------------------------------------------
; [4] lower half: rows y2 .. y3, edge_b follows v2->v3
.lower_half:
        vmovups edge_b, vtx2                ; restart exactly at v2
        cmp     row2, row3
        je      .done                       ; flat bottom: no lower half

        vsubps  slope23, vtx3, vtx2         ; slope23 = (v3 - v2) / (y3 - y2)
        vsubss  xmm0, vtx3_y, vtx2_y
        vbroadcastss ymm0, xmm0
        vdivps  slope23, slope23, ymm0

        mov     row, row2
.lower_row:
        call    .draw_span
        vaddps  edge_a, edge_a, slope13
        vaddps  edge_b, edge_b, slope23
        inc     row
        cmp     row, row3
        jle     .lower_row

;------------------------------------------------------------------------------
.done:
        vzeroupper                          ; avoid AVX->SSE penalty in caller
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
        pop     rbp
        ret

;------------------------------------------------------------------------------
; .draw_span -- draw one horizontal line between edge_a and edge_b on `row`
;
; In:    edge_a, edge_b, row, img_pixels, img_width, img_height, shuf_mask
; Out:   pixels round(min x) .. round(max x) of `row`, clipped to the image
; Clobb: rax, col, col_last, pix_ptr, ymm0, span_step, span_val, flags
;------------------------------------------------------------------------------
.draw_span:
        test    row, row                    ; row above the image?
        js      .span_done
        cmp     row, img_height             ; row below the image?
        jge     .span_done

        vcvtss2si rax, edge_a_x             ; rax      = round(edge_a.x)
        vcvtss2si col_last, edge_b_x        ; col_last = round(edge_b.x)

        ; span_step = (edge_b - edge_a) / (edge_b.x - edge_a.x)
        ; The gradient per column is the same whichever end we start from.
        ; If both ends share the same x the division yields NaN, but then the
        ; span is a single pixel and the step is never consumed.
        vsubps  span_step, edge_b, edge_a
        vbroadcastss ymm0, span_step_x
        vdivps  span_step, span_step, ymm0

        ; start from whichever end has the smaller x
        vmovups span_val, edge_a
        cmp     rax, col_last
        jle     .span_ordered
        xchg    rax, col_last
        vmovups span_val, edge_b
.span_ordered:
        ; here: rax = leftmost column, col_last = rightmost column
        test    col_last, col_last          ; entirely left of the image?
        js      .span_done
        cmp     rax, img_width              ; entirely right of the image?
        jge     .span_done

        test    rax, rax
        jns     .left_clipped
        ; Left end is off-screen: advance the interpolant by the number of
        ; skipped columns and start at column 0.
        neg     rax
        vcvtsi2ss xmm0, xmm0, rax
        vbroadcastss ymm0, xmm0
        vmulps  ymm0, ymm0, span_step
        vaddps  span_val, span_val, ymm0
        xor     eax, eax
.left_clipped:
        cmp     col_last, img_width
        jl      .right_clipped
        lea     col_last, [img_width - 1]   ; stop at the last visible column
.right_clipped:
        mov     col, rax

        ; pix_ptr = img_pixels + 4 * (img_width * row + col)
        mov     pix_ptr, img_width
        imul    pix_ptr, row
        add     pix_ptr, col
        lea     pix_ptr, [img_pixels + pix_ptr*4]

.span_pixel:
        vextractf128 xmm0, span_val, 1      ; upper lane = r, g, b, a
        vcvtps2dq xmm0, xmm0                ; float -> int32 per channel
        vpshufb xmm0, xmm0, shuf_mask       ; gather channel bytes (VEX form)
        vmovd   dword [pix_ptr], xmm0       ; store one 32-bit pixel
        vaddps  span_val, span_val, span_step
        add     pix_ptr, 4
        inc     col
        cmp     col, col_last
        jle     .span_pixel
.span_done:
        ret