; The base of this code was generated by Gemini.
;
; Adds two vectors of four 32-bit floats with a single SSE instruction
; (addps), stores the sum, then prints every lane as "a + b = c" via printf.
;
; Syntax: NASM (Intel)   Target: x86-64 Linux   ABI: System V AMD64
; Example build: nasm -f elf64 <file>.asm -o prog.o && gcc prog.o -o prog

default rel                             ; RIP-relative addressing, required for PIE links

VEC_LEN     equ 4                       ; number of float lanes per vector

section .data
; Two input vectors of VEC_LEN single-precision floats.
vector1:    dd 1.1, 2.2, 3.3, 4.4
vector2:    dd 5.5, 6.6, 7.7, 8.8
; Backquoted string so NASM expands the \n escape.
floatFmt:   db `%f + %f = %f\n`, 0

section .bss
; Space for the result (VEC_LEN floats * 4 bytes = 16 bytes).
result:     resd VEC_LEN

section .text
global main
extern printf

;-----------------------------------------------------------------------
; int main(void)
; ABI:   System V AMD64
; Out:   eax = 0 (process exit status)
; Saved: r12 (loop index, must survive the printf calls)
; Clobb: rax, rcx, rdi, xmm0-xmm2, flags, plus whatever printf clobbers
; Stack: on entry rsp % 16 == 8; the single push of r12 makes rsp
;        16-byte aligned at every call site below.
;-----------------------------------------------------------------------
main:
        push    r12                     ; save callee-saved reg; also aligns rsp to 16

        ; Load both vectors into 128-bit XMM registers. movups is used
        ; because nothing here guarantees the labels are 16-byte aligned.
        movups  xmm0, [vector1]
        movups  xmm1, [vector2]

        ; xmm0 = [1.1+5.5, 2.2+6.6, 3.3+7.7, 4.4+8.8]
        addps   xmm0, xmm1

        ; Store the summed vector back to memory.
        movups  [result], xmm0

        ; Print one "a + b = c" line per lane.
        xor     r12d, r12d              ; r12 = lane index
.top:
        cmp     r12, VEC_LEN
        je      .done

        lea     rdi, [floatFmt]         ; arg 1: format string

        ; RIP-relative addressing cannot take an index register, so each
        ; table base is loaded into rcx first and then indexed.
        ; printf's %f expects a double, hence the float -> double widening.
        lea     rcx, [vector1]
        movss   xmm0, dword [rcx + r12 * 4]
        cvtss2sd xmm0, xmm0
        lea     rcx, [vector2]
        movss   xmm1, dword [rcx + r12 * 4]
        cvtss2sd xmm1, xmm1
        lea     rcx, [result]
        movss   xmm2, dword [rcx + r12 * 4]
        cvtss2sd xmm2, xmm2

        mov     eax, 3                  ; variadic call: al = number of vector regs used
        call    printf wrt ..plt

        inc     r12
        jmp     .top

.done:
        ; Return to the C runtime rather than issuing a raw exit syscall:
        ; returning lets libc flush stdout, so the printed lines are not
        ; lost when output goes to a pipe or a file.
        xor     eax, eax                ; exit status 0
        pop     r12
        ret

; Declare that this object does not need an executable stack.
section .note.GNU-stack noalloc noexec nowrite progbits