#include <inttypes.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Read the x86 time-stamp counter (RDTSC) and return it as a 64-bit value.
 *
 * RDTSC delivers the counter split across EDX:EAX on both 32-bit and
 * 64-bit x86, so the two halves are captured separately and recombined.
 * This avoids the "=A" constraint, which only means the EDX:EAX pair on
 * 32-bit targets; on x86-64 it binds to a single register and would
 * yield a wrong result whenever sizeof(long) != 8 (e.g. LLP64 ABIs).
 *
 * Note: RDTSC is not a serializing instruction, so neighbouring
 * instructions may be reordered around it by the CPU.
 */
static inline uint64_t read_time(void)
{
    uint32_t lo, hi;

    /* __asm__ rather than asm so the code also builds in strict ISO modes. */
    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    return ((uint64_t)hi << 32) | lo;
}
 
/*
 * Benchmark unaligned 16-byte SSE loads (movdqu) at every byte offset
 * from 0 through 0x2000 into a buffer obtained from valloc().
 *
 * For each offset one line is printed: "<offset in hex>: <ticks>", where
 * <ticks> is the elapsed time-stamp-counter delta divided by the number
 * of loads executed, i.e. the average TSC ticks per load.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void)
{
    enum {
        MAX_OFFSET    = 0x2000,   /* last byte offset that is measured   */
        LOAD_BYTES    = 16,       /* bytes read by one movdqu            */
        LOADS_PER_RUN = 1000000   /* loads issued per measured offset    */
    };

    /* Size the buffer so the load at MAX_OFFSET still stays in bounds. */
    char *buf = valloc(MAX_OFFSET + LOAD_BYTES);
    if (buf == NULL) {
        perror("valloc");
        return EXIT_FAILURE;
    }

    for (unsigned int offset = 0; offset <= MAX_OFFSET; offset++) {
        /* Counts down by 4 per iteration: four loads per loop pass. */
        int remaining = LOADS_PER_RUN;
        uint64_t elapsed = read_time();

        __asm__ __volatile__(
            "1: \n"
            "movdqu %1, %%xmm0 \n"
            "movdqu %1, %%xmm0 \n"
            "movdqu %1, %%xmm0 \n"
            "movdqu %1, %%xmm0 \n"
            "sub $4, %0 \n"
            "jg 1b \n"
            : "+r"(remaining)
            : "m"(buf[offset])
            /* The loop overwrites xmm0 and the flags; tell the compiler
             * so it never keeps live values there across the statement. */
            : "xmm0", "cc"
        );

        elapsed = read_time() - elapsed;
        printf("%x: %.2f\n", offset, (double)elapsed / LOADS_PER_RUN);
    }

    free(buf);
    return 0;
}
