#include <stdio.h>
/*
 * Four 32-bit integer lanes, 16-byte aligned.
 * The fourth member `_` is not touched by miv() below.
 */
struct IV {
    int x, y, z, _;
} __attribute__((aligned(16)));

/*
 * Element-wise integer multiply: d->{x,y,z} = s0->{x,y,z} * s1->{x,y,z}.
 *
 * The pointer parameters themselves (not the pointees) are
 * volatile-qualified -- presumably so the optimizer cannot collapse the
 * benchmark loop that calls this; confirm with the author before removing.
 */
static inline void miv(struct IV* volatile d, struct IV* volatile s0, struct IV* volatile s1) {
    const int prod_x = s0->x * s1->x;
    d->x = prod_x;

    const int prod_y = s0->y * s1->y;
    d->y = prod_y;

    const int prod_z = s0->z * s1->z;
    d->z = prod_z;

    /* Disabled SSE4.1 alternative, kept for reference: */
    //__asm__ volatile("movdqa (%1),%%xmm0;pmulld (%0),%%xmm0;movdqa %%xmm0,(%2);"::"r"((s0)), "r"((s1)), "r"((d)):"memory"); // SSE4.1
}
/*
 * Four single-precision lanes, 16-byte aligned so the struct can be used
 * directly as a memory operand of movaps (which requires 16-byte alignment).
 */
struct FV {
    float x, y, z, _;
} __attribute__((aligned(16)));

/*
 * Packed float multiply: *d = *s0 * *s1, lane by lane.
 *
 * All four lanes are multiplied, including `_`, because mulps operates on
 * the whole 128-bit vector.
 *
 * d, s0 and s1 must each point to a 16-byte-aligned struct FV; they may
 * alias each other.
 */
static inline void mfv(struct FV* volatile d, struct FV* volatile s0, struct FV* volatile s1) {
    __asm__ volatile(
        "movaps (%1),%%xmm0;"
        "mulps (%0),%%xmm0;"
        "movaps %%xmm0,(%2);"
        : /* no register outputs; the result is stored through %2 */
        : "r"(s0), "r"(s1), "r"(d)
        /* xmm0 is overwritten as scratch, so the compiler must be told;
         * "memory" covers the loads from *s0/*s1 and the store to *d. */
        : "xmm0", "memory");
}
typedef unsigned long long ull;

/*
 * Read the CPU time-stamp counter with the RDTSC instruction.
 *
 * RDTSC returns the low 32 bits in EAX and the high 32 bits in EDX.
 * The "=A" constraint only means the edx:eax pair on 32-bit x86; on
 * x86-64 a 64-bit operand is placed in a single register, so the upper
 * half of the counter would be lost. Capturing the two halves separately
 * is correct on both targets.
 *
 * Returns the full 64-bit counter value.
 */
static ull rdtsc(void)
{
    unsigned int lo, hi;

    __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
    return ((ull)hi << 32) | (ull)lo;
}
main(){
struct IV i={1,2,3,4}, ii;
struct FV f={5,6,7,8}, ff;
int T=100000000;
ull s=rdtsc();
int c=T; while(c-->0){miv(&ii,&i,&i);}
printf("tasizan [%llu]clock", rdtsc()-s);
printf("\t%d %d %d\n", ii.x, ii.y, ii.z);
s=rdtsc();
c=T; while(c-->0){mfv(&ff,&f,&f);}
printf("kakezan [%llu]clock", rdtsc()-s);
printf("\t%f %f %f\n", ff.x, ff.y, ff.z);
}