; simd.asm
;
; URL: https://mirkwood.cs.edinboro.edu/~bennett/class/cmsc3100/spring2026/notes/float/code/simd.asm
 
; The base of this code was generated by Gemini.
; Constant inputs. No visible code writes to these, so they are placed in
; the read-only data section instead of writable .data.
section .rodata
    ; Two vectors of four 32-bit floats each (16 bytes per vector).
    ; align 16 places vector1 on a 16-byte boundary; vector2 starts exactly
    ; 16 bytes later, so both are suitably aligned for 128-bit SSE accesses.
    ; The padding is filled with zero bytes rather than NASM's default NOPs.
    align 16, db 0
    vector1: dd 1.1, 2.2, 3.3, 4.4
    vector2: dd 5.5, 6.6, 7.7, 8.8

    ; printf format for one line of output: "a + b = sum".
    ; Backquotes make NASM process the \n escape; the trailing 0 is the
    ; C string terminator.
    floatFmt: db `%f + %f = %f\n`,0

section .bss
    ; Output buffer for the packed sum: one 4-byte single-precision slot
    ; per lane, 4 lanes in total (16 bytes reserved, uninitialised storage).
    result: resb 4 * 4

section .text
    global main

extern printf

%define LANE_COUNT 4        ; 32-bit floats held in one 128-bit XMM register

;-----------------------------------------------------------------------
; int main(void)
; Syntax: NASM (Intel operand order)
; ABI:    System V AMD64, entered from the C runtime (link with libc)
; Does:   adds vector1 and vector2 lane by lane with one packed SSE add,
;         stores the four sums in result, then prints one
;         "a + b = sum" line per lane with printf.
; Out:    eax = 0 (process exit status)
; Regs:   rbx = lane index, live across the printf calls (saved/restored)
;         rdx, rdi, rax, xmm0-xmm2 are scratch / printf arguments
; Stack:  rsp is 16-byte aligned at every call (see prologue)
; Note:   all static data is addressed RIP-relative so the object also
;         links as a position-independent executable.
;-----------------------------------------------------------------------
main:
    ; On entry rsp % 16 == 8 (return address). Two pushes plus 8 bytes of
    ; padding bring it back to a multiple of 16 for the printf calls.
    push rbp
    mov rbp, rsp
    push rbx                ; callee-saved; used as the lane index
    sub rsp, 8              ; alignment padding only

    ; Load values into 128-bit XMM registers.
    ; movups (Move Unaligned Packed Single-Precision) makes no assumption
    ; about the alignment of the operands.
    movups xmm0, [rel vector1]
    movups xmm1, [rel vector2]

    ; Add the four floats in xmm1 to the four in xmm0
    ; Result: xmm0 = [1.1+5.5, 2.2+6.6, 3.3+7.7, 4.4+8.8]
    addps xmm0, xmm1

    ; Store the final vector back to memory
    movups [result], xmm0

    ; Print the results, one lane per iteration.
    xor ebx, ebx            ; rbx = 0 (first lane)

.print_lane:
    ; printf's %f expects a double, so each float is widened with cvtss2sd.
    ; RIP-relative addressing cannot take an index register, so each table
    ; base is loaded with lea first and then indexed by the lane number.
    lea rdx, [rel vector1]
    movss xmm0, dword [rdx + rbx * 4]
    cvtss2sd xmm0, xmm0     ; 1st %f: vector1[lane]

    lea rdx, [rel vector2]
    movss xmm1, dword [rdx + rbx * 4]
    cvtss2sd xmm1, xmm1     ; 2nd %f: vector2[lane]

    lea rdx, [rel result]
    movss xmm2, dword [rdx + rbx * 4]
    cvtss2sd xmm2, xmm2     ; 3rd %f: result[lane]

    lea rdi, [rel floatFmt] ; 1st integer argument: format string
    mov eax, 3              ; variadic call: al = number of XMM arguments
    call printf wrt ..plt

    inc rbx
    cmp rbx, LANE_COUNT
    jb .print_lane          ; LANE_COUNT > 0, so testing at the bottom is safe

    ; Return 0 to the C runtime instead of issuing the raw exit syscall:
    ; the runtime's normal exit path flushes stdio, so the printf output
    ; is not lost when stdout is a pipe or a file.
    xor eax, eax
    add rsp, 8
    pop rbx
    pop rbp
    ret