; The base of this code was generated by Gemini.
;
; AVX demo: adds two 8-element single-precision vectors with a single 256-bit
; VADDPS, stores the sum, then prints every lane as "a + b = sum" via printf.
;
; Syntax: NASM (Intel operand order)
; Target: x86-64 Linux, ELF64, linked against libc (entry point is C `main`)
; ABI:    System V AMD64
; Build:  nasm -f elf64 <file>.asm -o <file>.o && gcc <file>.o -o <file>

default rel                             ; RIP-relative addressing for all static data (PIE-safe)

extern  printf

LANE_COUNT      equ 8                   ; single-precision floats per 256-bit YMM register

;-----------------------------------------------------------------------
; Read-only inputs. vmovaps requires its memory operand to be 32-byte
; aligned, so the section and the first vector are aligned explicitly;
; vector2 follows exactly 32 bytes later and is therefore aligned too.
;-----------------------------------------------------------------------
section .rodata align=32
        align   32
vector1:        dd 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0
vector2:        dd 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8
floatFmt:       db `%f + %f = %f\n`, 0

;-----------------------------------------------------------------------
; Output buffer for the 8 sums (32-byte aligned for vmovaps).
; alignb pads with reserved bytes, which is what a nobits section needs.
;-----------------------------------------------------------------------
section .bss align=32
        alignb  32
result:         resd LANE_COUNT

section .text
global  main

;-----------------------------------------------------------------------
; int main(void)
; ABI:   SysV AMD64
; Out:   eax = 0
; Uses:  r12 = lane index (callee-saved, preserved via push/pop)
; Clobb: rax, rdx, rdi, ymm0-ymm2, flags, plus whatever printf clobbers
; Stack: rsp % 16 == 8 on entry; the single push makes it 16-byte
;        aligned for every call to printf.
;-----------------------------------------------------------------------
main:
        push    r12                     ; preserve callee-saved reg; also realigns rsp to 16

        ; --- result[0..7] = vector1[0..7] + vector2[0..7] ------------------
        vmovaps ymm0, [vector1]         ; load first 256-bit vector
        vaddps  ymm1, ymm0, [vector2]   ; packed single-precision add
        vmovaps [result], ymm1          ; store all 8 sums
        vzeroupper                      ; clear upper YMM halves before SSE code / libc

        ; --- print one line per lane ---------------------------------------
        xor     r12d, r12d              ; lane = 0
.print_lane:
        ; RIP-relative addressing cannot take an index register, so each
        ; table base is materialised with lea and then indexed by the lane.
        ; printf's %f expects double, hence the float -> double conversions.
        lea     rdx, [vector1]
        movss   xmm0, dword [rdx + r12*4]
        cvtss2sd xmm0, xmm0             ; arg 1: vector1[lane]

        lea     rdx, [vector2]
        movss   xmm1, dword [rdx + r12*4]
        cvtss2sd xmm1, xmm1             ; arg 2: vector2[lane]

        lea     rdx, [result]
        movss   xmm2, dword [rdx + r12*4]
        cvtss2sd xmm2, xmm2             ; arg 3: result[lane]

        lea     rdi, [floatFmt]         ; format string
        mov     eax, 3                  ; variadic call: al = number of vector registers used
        call    printf wrt ..plt

        inc     r12
        cmp     r12, LANE_COUNT
        jb      .print_lane             ; unsigned compare: lane is an index

        ; --- return 0 --------------------------------------------------------
        ; Returning from main (instead of issuing the raw exit syscall) lets
        ; libc run its normal exit path, which flushes stdout. With the raw
        ; syscall the buffered printf output is lost when stdout is a pipe
        ; or a file.
        xor     eax, eax
        pop     r12
        ret

; Tell the linker this object does not need an executable stack.
section .note.GNU-stack noalloc noexec nowrite progbits