Hi,
Consider the following code and the generated assembly from ifort 14 (with -xCORE-AVX2 and -O2).
Assuming that the ..B1.9 block is the peel loop, why does the compiler still use unaligned move instructions (vmovups) in the vectorized loop body (..B1.12)?
!> Multiply the first n elements of acc by the matching elements of z, in place.
!!
!! acc : updated in place; acc(i) becomes acc(i) * z(i) for i = 1..n
!! z   : multipliers; read only
!! n   : number of leading elements to process; nothing is done when n < 1
!!
!! The dummies stay assumed-size so that existing callers relying on an
!! implicit interface keep working. Nothing in this routine tells the compiler
!! how acc or z are aligned in memory.
subroutine aligntest(acc, z, n)
  implicit none
  real, dimension(*), intent(inout) :: acc
  real, dimension(*), intent(in)    :: z
  integer, intent(in)               :: n
  integer :: i

  ! Kept as an explicit unit-stride loop: this is the loop whose
  ! vectorization is being examined.
  do i = 1, n
    acc(i) = acc(i) * z(i)
  end do
end subroutine aligntest
# Compiler-generated listing, reproduced with one instruction per line.
# The trailing "#L.C" tags are the compiler's own source line.column markers.
#
# ..B1.9 is a scalar loop: each iteration loads one single-precision value
# from (%rdi + 4*%rcx), multiplies it by the value at (%rsi + 4*%rcx), stores
# the product back to the %rdi location, and advances %rcx by one until it
# reaches %r8.
# NOTE(review): presumably %rdi is acc and %rsi is z, and this is the peel
# loop that runs up to an alignment boundary - the block that computes %r8 is
# not shown, so confirm.
..B1.9: # Preds ..B1.7 ..B1.9
        vmovss (%rdi,%rcx,4), %xmm0 #9.26
        vmulss (%rsi,%rcx,4), %xmm0, %xmm1 #9.17
        vmovss %xmm1, (%rdi,%rcx,4) #9.17
        incq %rcx #8.14
        cmpq %r8, %rcx #8.14
        jb ..B1.9 # Prob 82% #8.14
# LOE rax rdx rcx rbx rbp rsi rdi r8 r12 r13 r14 r15
# ..B1.12 is the vector loop: each iteration handles 16 single-precision
# elements as two 8-wide %ymm operations (offsets 0 and 32 bytes), with the
# index %r8 advancing by 16 until it reaches %rdx.
# Loads and stores on the %rdi array use vmovups, the move form that does not
# require an aligned address; the %rsi array is read directly as the memory
# operand of vmulps.
..B1.12: # Preds ..B1.7 ..B1.9 ..B1.12
        vmovups (%rdi,%r8,4), %ymm0 #9.26
        vmovups 32(%rdi,%r8,4), %ymm2 #9.26
        vmulps (%rsi,%r8,4), %ymm0, %ymm1 #9.17
        vmulps 32(%rsi,%r8,4), %ymm2, %ymm3 #9.17
        vmovups %ymm1, (%rdi,%r8,4) #9.17
        vmovups %ymm3, 32(%rdi,%r8,4) #9.17
        addq $16, %r8 #8.14
        cmpq %rdx, %r8 #8.14
        jb ..B1.12 # Prob 82% #8.14