Causes:
struct char4 { char c1; char c2; char c3; char c4; }; extern struct char4 *a; void vecmsg_testcore003 () { int i; const struct char4 n = {0, 0, 0, 0}; #pragma omp simd for(i = 0; i < 1024; i++) { a[i] = n; } }
Recommendations |
inline char4 operator=(const char4 &x) { char4 temp; temp.c1 = x.c1; temp.c2 = x.c2; temp.c3 = x.c3; temp.c4 = x.c4; return temp; }
Cause:
In nested loop structures, the compiler targets the innermost loop for vectorization. The outer loop, by default, is not a target for vectorization; however, it may be a target for parallelization.
C++ Example:
#include <iostream> #define N 25 int main() { int a[N][N], b[N], i; for(int j = 0; j < N; j++) { for(int i = 0; i < N; i++) a[j][i] = 0; b[j] = 1; } int sum = __sec_reduce_add(a[:][:]) + __sec_reduce_add(b[:]); return 0; }
Recommendation |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Outer loop | #pragma omp simd collapse(n), #pragma omp simd, or #pragma simd | !$OMP SIMD COLLAPSE(n), !$OMP SIMD, or !DIR$ SIMD |
Cause: The compiler vectorizer determined the remainder loop will not benefit from vectorization.
C++ Example:
#include <iostream> #define N 70 int main() { static short tab1[N], tab2[N]; int i, j; static short const data[] = {32768, -256, -255, -128, -127, -1, 0, 1, 127, 128, 255, 256, 32767}; for (j = i = 0; i < N; i++) { tab1[i] = i; tab2[i] = data[j++]; if (j > 12) j = 0; } int sum = __sec_reduce_add(tab1[:]) + __sec_reduce_add(tab2[:]); return 0; }
Recommendations |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source loop | #pragma vector vecremainder | !DIR$ SIMD VECREMAINDER |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source loop | #pragma vector novecremainder | !DIR$ SIMD NOVECREMAINDER |
Cause: The compiler vectorizer determined the loop will not benefit from vectorization. Common reasons include:
#include <iostream> #define N 100 struct s1 { int a, b, c; } int main() { s1 arr[N], sum; for(int i = 0; i < N; i++) { sum.a += arr[i].a; sum.b += arr[i].b; sum.c += arr[i].c; } std::cout << sum.a << "\t" << sum.b << "\t" << sum.c << "\n"; return 0; }
Recommendations |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source loop | #pragma vector or #pragma vector always | !DIR$ VECTOR or !DIR$ VECTOR ALWAYS |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source loop | #pragma simd or #pragma omp simd | !DIR$ SIMD or !$OMP SIMD |
Causes:
void foo(int *A, int *restrict B, int n, int* x) { int i; #pragma omp simd for (i = 0; i < n; i++) { if (A[i] > i) *x = i; else B[i] = *x; } B[i] = *x++; }
Recommendations |
#include <stdlib.h> #define N 70 int main(int argc, char *argv[]) { int k = atoi(argv[1]); int a[N], i; for(i = abs(k); i < N; i++) a[i] = a[i+k] + 1; return 0; }
Recommendations |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source Loop | #pragma simd or #pragma omp simd | !DIR$ SIMD or !$OMP SIMD |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source Loop | #pragma ivdep | !DIR$ IVDEP |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source Loop | #pragma simd vectorlength(k) | !DIR$ SIMD VECTORLENGTH(k) |
Causes:
void no_vec(float a[], float b[], float c[]) { int i = 0.; while (i < 100) { a[i] = b[i] * c[i]; // this is a data-dependent exit condition: if (a[i] < 0.0) break; ++i; } }
Exception: Loops searching for an array element, as in the example below, can be automatically vectorized when array a[i] is aligned.
for (i = 0; i < n; ++i) { if (a[i] == to_find) { index = i; break; } }
C++ Example 2: A SIMD loop uses C++ exception handling or an OpenMP critical construct.
#define N 1000 int foo() { #pragma omp simd for (int i = 0; i < N; i++) { try { printf ("throw exception 11\n"); throw 11; } catch (int t) { printf ("caught exception %d\n", t); if (t != 11) { #pragma omp critical { printf ("TEST FAILED\n"); exit (0); } } } } printf ("TEST PASSED\n"); exit (0); }
C++ Example 3: The compiler cannot determine which function is passed as a function parameter.
#include <iostream> int a[100]; int b[100]; int g(int i, int y) { return b[i]+y; } __declspec(noinline) void doit1(int x(int,int), int y) { int i; #pragma parallel for(i = 0; i < 100; i++) a[i] = x(i,y); }
Recommendations |
Causes:
subroutine d_15043(a,b,c,n) implicit none real, intent(in ), dimension(n) :: a, b real, intent(out), dimension(n) :: c integer, intent(in) :: n integer :: i do i=1,n if(a(i) < 0.) exit c(i) = sqrt(a(i)) * b(i) enddo end subroutine d_15043
Fortran Example 2: The iteration count is data dependent.
subroutine d_15043_2(a,b,c,n) implicit none real, intent(in ), dimension(n) :: a, b real, intent(out), dimension(n) :: c integer, intent(in) :: n integer :: i i = 0 do while (a(i) > 0.) c(i) = sqrt(a(i)) * b(i) i = i + 1 enddo end subroutine d_15043_2
Fortran Example 3: The loop contains a subroutine or function that prevents vectorization.
subroutine d_15043_3(a,b,c,n) implicit none real, intent(in ), dimension(n) :: a, b real, intent(out), dimension(n) :: c integer, intent(in) :: n integer :: i do i=1,n call my_sub(a(i),b(i),c(i)) enddo end subroutine d_15043_3
Recommendations |
do i=1,n if(a(i) > 0.) c(i) = sqrt(a(i)) * b(i) enddo
If necessary, the iteration count can be pre-computed.
Read More:
Cause: The compiler detected or assumed a vector dependence in the loop.
C++ Example:
int foo(float *A, int n) { int inx = 0; float max = A[0]; int i; for (i=0;i < n;i++) { if (max < A[i]) { max = A[i]; inx = i*i; } } return inx; }
Fortran Example:
integer function foo(a, n) implicit none integer, intent(in) :: n real, intent(inout) :: a(n) real :: max integer :: inx, i max = a(0) do i=1,n if (max < a(i)) then max = a(i) inx = i*i endif end do foo = inx end function
Recommendations |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source Loop | #pragma simd or #pragma omp simd | !DIR$ SIMD or !$OMP SIMD |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source Loop | #pragma ivdep | !DIR$ IVDEP |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source Loop | #pragma simd vectorlength(k) | !DIR$ SIMD VECTORLENGTH(k) |
Causes:
#include <iostream> #include <complex> using namespace std; int main() { float c[10]; c[:] = 0.f; for(int i = 0; i < 10; i++) cout << c[i] << "\n"; return 0; }
Recommendations |
Target | ICL/ICC/ICPC Construct |
---|---|
Source function | #pragma omp declare simd |
Source function | __declspec(vector) (Windows OS) or __attribute__((vector)) (Linux OS) |
Cause: A function call inside the loop is preventing auto-vectorization.
Fortran Example:
Program foo implicit none integer, parameter :: nx = 100000000 real(8) :: x, xp, sumx integer :: i interface real(8) function bar(x, xp) real(8), intent(in) :: x, xp end end interface sumx = 0. xp = 1. do i = 1,nx x = 1.D-8*real(i,8) sumx = sumx + bar(x,xp) enddo print *, 'Sum =',sumx end real(8) function bar(x, xp) implicit none real(8), intent(in) :: x, xp bar = 1. - 2.*(x-xp) + 3.*(x-xp)**2 - 1.5*(x-xp)**3 + 0.2*(x-xp)**4 bar = bar / sqrt(x**2 + xp**2) end
Recommendations |
Target | IFORT Construct |
---|---|
Source function | !$OMP DECLARE SIMD |
Source function | ELEMENTAL keyword or !DIR$ ATTRIBUTES VECTOR |
real(8) function bar(x, xp) !$OMP DECLARE SIMD (bar) UNIFORM(xp) implicit none real(8), intent(in) :: x, xp bar = 1. - 2.*(x-xp) + 3.*(x-xp)**2 - 1.5*(x-xp)**3 + 0.2*(x-xp)**4 bar = bar / sqrt(x**2 + xp**2) end
The code now generates a vectorized version of function bar(); however, the loop inside foo is still not vectorized because the compiler sees dependencies between loop iterations carried by both x and sumx.
Program foo implicit none integer, parameter :: nx = 100000000 real(8) :: x, xp, sumx integer :: i interface real(8) function bar(x, xp) !$OMP DECLARE SIMD (bar) UNIFORM(xp) real(8), intent(in) :: x, xp end end interface sumx = 0. xp = 1. !$OMP SIMD private(x) reduction(+:sumx) do i = 1,nx x = 1.D-8*real(i,8) sumx = sumx + bar(x,xp) enddo print *, 'Sum =',sumx end
The loop now vectorizes successfully, and running the application shows a performance speedup.
Causes:
void foo(float *A) { int i; int OuterCount = 90; while (OuterCount > 0) { for (i = 1; i < bar(int(A[0])); i++) { A[i] = i + 4; } OuterCount--; } }
C++ Example 2: The compiler cannot determine if there is aliasing between all the pointers used inside the loop and loop boundaries.
struct Dim { int x, y, z; }; Dim dim; double* B; void foo (double* A) { for (int i = 0; i < dim.x; i++) { A[i] = B[i]; } }
Recommendations |
void foo(float *A) { int i; int OuterCount = 90; int limit = bar(int(A[0])); while (OuterCount > 0) { for (i=1; i < limit; i++) { A[i] = i + 4; } OuterCount--; } }
Target | ICL/ICC/ICPC Directive |
---|---|
Source loop | #pragma simd or #pragma omp simd |
Target | ICL/ICC/ICPC Directive |
---|---|
Source loop | #pragma ivdep |
Cause: The loop iteration count is not available before the loop executes.
Fortran Example:
subroutine foo(a, n) implicit none integer, intent(in) :: n double precision, intent(inout) :: a(n) integer :: bar integer :: i i=0 100 CONTINUE a(i)=0 i=i+1 if (i < bar()) goto 100 end subroutine foo
Recommendations |
Cause: Any usage of volatile variables in the loop causes this diagnostic.
C++ Example:
volatile int32_t x; int32_t a[c_size]; for (int32_t i = 0; i < c_size; ++i) { a[i] = exp(x + i); x = a[i]; }
Recommendations |
Cause: Internal time limits for the optimization level prevented the compiler from determining a vectorization approach for this loop.
Recommendations |
Cause: The inner loop has an irregular structure. For example, it may have non-constant lower and higher bounds, a non-constant step for iterations, more than one entry, some assembly parts, volatile variables, long jumps, or complex switch clauses.
Recommendations |
Cause: The compiler vectorizer determined outer loop vectorization is not possible using auto-vectorization.
C++ Example:
void foo(float **a, float **b, int N) { int i, j; #pragma ivdep for (i = 0; i < N; i++) { float *ap = a[i]; float *bp = b[i]; for (j = 0; j < N; j++) { ap[j] = bp[j]; } } }
Fortran Example:
subroutine foo(a, n1, n) implicit none integer, intent(in) :: n, n1 real, intent(inout) :: a(n,n1) integer :: i, j do i=1,n do j=1,n a(j,i) = a(j-1,i)+1 end do end do end subroutine foo
Recommendations |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source Loop | #pragma simd or #pragma omp simd | !DIR$ SIMD or !$OMP SIMD |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source Loop | #pragma ivdep | !DIR$ IVDEP |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Source Loop | #pragma simd vectorlength(k) | !DIR$ SIMD VECTORLENGTH(k) |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Inner loop | #pragma novector | !DIR$ NOVECTOR |
Outer loop | #pragma vector always | !DIR$ VECTOR ALWAYS |
Cause: The inner loop in a nested loop is vectorized.
C++ Example:
#define N 1000 float A[N][N]; void foo(int n) { int i,j; for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { A[i][j]++; } } }
Fortran Example:
subroutine foo(a, n1, n) implicit none integer, intent(in) :: n, n1 real, intent(inout) :: a(n1,n1) integer :: i, j do i=1,n do j=1,n a(j,i) = a(j,i) + 1 end do end do end subroutine foo
Recommendations |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Outer loop | #pragma omp simd collapse(n), #pragma omp simd, or #pragma simd | !$OMP SIMD COLLAPSE(n), !$OMP SIMD, or !DIR$ SIMD |
Target | ICL/ICC/ICPC Directive | IFORT Directive |
---|---|---|
Inner loop | #pragma novector | !DIR$ NOVECTOR |
Outer loop | #pragma vector always | !DIR$ VECTOR ALWAYS |
Cause: The loop lacks sufficient iterations to benefit from vectorization.
C++ Example:
#define TTT char TTT A[15]; TTT foo(int n) { TTT sum=0; int i; for (i = 0; i < n; i++) { sum+=A[i]; } return sum; }
Fortran Example:
integer (kind=1) :: A(15), sum, i sum=0 do i=1,15 sum=sum+A(i) end do
Recommendations |
Target | ICL/ICC/ICPC Construct | IFORT Construct |
---|---|---|
Source loop | #pragma omp simd or #pragma simd | !$OMP SIMD or !DIR$ SIMD |
Cause: The compiler did not recognize a search idiom in a loop that may exit early. For example: The loop body contains:
void c15520(float a[], float b[], float c[], int n) { int i; for(i=0; i<n; i++) { if(a[i] < 0.) break; c[i] = sqrt(a[i]) * b[i]; } }
Exception
// For Compiler 16.1 and higher this example generates Diagnostic 15333 instead __attribute__((vector)) void f1(double); int main() { int n = 10000; double a[n]; #pragma simd for(int i = 0 ; i < n ; i++) f1(a[i]); }
Fortran Example:
subroutine f15520(a,b,c,n) implicit none real, intent(in ), dimension(n) :: a, b real, intent(out), dimension(n) :: c integer, intent(in) :: n integer :: i do i=1,n if(a(i).lt.0.) exit c(i) = sqrt(a(i)) * b(i) enddo end subroutine f15520
Recommendations |
void c15520(float a[], float b[], float c[]) { int i, j; for(i=0; i<1000; i++) { if(a[i] < 0.) break; } for(j=0; j<i-1; j++) { c[j] = sqrt(a[j]) * b[j]; } }
Mark the function in the loop as nothrow.
__attribute__((vector, nothrow)) void f1(double); int main() { int n = 10000; double a[n]; #pragma simd for(int i = 0 ; i < n ; i++) f1(a[i]); }
Fortran Example: Split the loop into a search loop and computational loop.
subroutine f15520(a,b,c,n) implicit none real, intent(in ), dimension(n) :: a, b real, intent(out), dimension(n) :: c integer, intent(in) :: n integer :: i, j do i=1,n if(a(i).lt.0.) exit enddo do j=1,i-1 c(j) = sqrt(a(j)) * b(j) enddo end subroutine f15520
Read More C++ Information: Read More Fortran Information:
Cause: The compiler automatically generates a try block for a program block (that is, code inside {}) when it allocates a large, local object or array on the heap (because the object is too big to allocate on the stack) and a function within the block could throw an exception.
C++ Example:
__attribute__((vector)) void f1(double); int main() { int n = 10000; double a[n]; #pragma simd for(int i = 0 ; i < n ; i++) f1(a[i]); }
Cause: The compiler doesn't get enough information from the code to create one version of the loop. In the example below, the compiler takes a defensive stand and generates both vectorized and non-vectorized versions of the loop because it assumes memory aliasing (the pointers could be pointing to overlapping memory locations).
C++ Example:
void foo(float *a, float *b, float *c){ for(int i = 0 ; i < 256; i++) c[i] = a[i] * b[i]; return; }
Cause: The compiler doesn't get enough information from the code to create one version of the loop. In the example below, the compiler takes a defensive stand and generates three versions of the loop: for k=0, k>0, and k<0. The version for k<0 cannot be safely vectorized because each later iteration may depend on the result of earlier iterations.
Fortran Example:
subroutine add(k, a) integer :: k real :: a(20) DO i = 1, 20 a(i) = a(i+k) * 2.0 end do end subroutine add