Causes:
/* Aggregate of four char members. */
struct char4 {
    char c1;
    char c2;
    char c3;
    char c4;
};
extern struct char4 *a;
// Stores the zero-initialized struct n into a[0] through a[1023], one
// whole-struct assignment per iteration, inside an OpenMP simd loop.
void vecmsg_testcore003 ()
{
int i;
const struct char4 n = {0, 0, 0, 0};
#pragma omp simd
for(i = 0; i < 1024; i++) {
a[i] = n;
}
}
| Recommendations |
// Copies x member by member into a local char4 and returns that copy.
// NOTE(review): nothing is stored into the left-hand object, and no enclosing
// class is visible here - presumably this is meant to be a member of char4;
// confirm.
inline char4 operator=(const char4 &x) {
char4 temp;
temp.c1 = x.c1;
temp.c2 = x.c2;
temp.c3 = x.c3;
temp.c4 = x.c4;
return temp;
}
Cause:
In nested loop structures, the compiler targets the innermost loop for vectorization. The outer loop, by default, is not a target for vectorization; however, it may be a target for parallelization.
C++ Example:
#include <iostream>
#define N 25
// Zero-fills row j of a in the inner loop and sets b[j] to 1 in the outer
// loop, then adds up all elements of a and b with __sec_reduce_add.
int main()
{
int a[N][N], b[N], i;
for(int j = 0; j < N; j++)
{
for(int i = 0; i < N; i++)
a[j][i] = 0;
b[j] = 1;
}
int sum = __sec_reduce_add(a[:][:]) + __sec_reduce_add(b[:]);
return 0;
}
| Recommendation |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Outer loop | #pragma omp simd collapse(n), #pragma omp simd, or #pragma simd | !$OMP SIMD COLLAPSE(n), !$OMP SIMD, or !DIR$ SIMD |
Cause: The compiler vectorizer determined the remainder loop will not benefit from vectorization.
C++ Example:
#include < iostream >
#define N 70
int main() {
static short tab1[N],
tab2[N];
int i, j;
static short const data[] = {32768, -256, -255, -128, -127, -1, 0, 1, 127, 128, 255, 256, 32767};
for (j = i = 0; i < N; i++)
{
tab1[i] = i;
tab2[i] = data[j++];
if (j > 12) j = 0;
}
int sum = __sec_reduce_add(tab1[:]) + __sec_reduce_add(tab2[:]);
return 0;
}
| Recommendations |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source loop | #pragma vector vecremainder | !DIR$ SIMD VECREMAINDER |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source loop | #pragma vector novecremainder | !DIR$ SIMD NOVECREMAINDER |
Cause: The compiler vectorizer determined the loop will not benefit from vectorization. Common reasons include:
#include <iostream>
#define N 100
// Three int fields accessed together in the loop below.
struct s1 {
    int a, b, c;
};
// Accumulates the a, b and c fields of every arr element into sum and prints
// the three totals separated by tabs.
// NOTE(review): arr and sum are declared without initializers and are read in
// the loop, so the printed values are indeterminate.
int main() {
    s1 arr[N], sum;
    for(int i = 0; i < N; i++) {
        sum.a += arr[i].a;
        sum.b += arr[i].b;
        sum.c += arr[i].c;
    }
    std::cout << sum.a << "\t" << sum.b << "\t" << sum.c << "\n";
    return 0;
}
| Recommendations |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source loop | #pragma vector or #pragma vector always | !DIR$ VECTOR or !DIR$ VECTOR ALWAYS |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source loop | #pragma simd or #pragma omp simd | !DIR$ SIMD or !$OMP SIMD |
Causes:
// Inside the simd loop, *x is written when A[i] > i and read into B[i]
// otherwise. After the loop one more element B[i] is stored from *x and the
// local pointer x is advanced.
void foo(int *A, int *restrict B, int n, int* x) {
int i;
#pragma omp simd
for (i = 0; i < n; i++)
{
if (A[i] > i)
*x = i;
else
B[i] = *x;
}
B[i] = *x++;
}
| Recommendations |
#include <stdlib.h>
#define N 70
// Reads the offset k from the first command-line argument and updates
// a[i] from a[i+k] for i = |k| .. N-1. k is only known at run time.
int main(int argc, char *argv[])
{
    // argv[1] is only valid when an argument was actually supplied.
    if (argc < 2)
        return 1;
    int k = atoi(argv[1]);
    int a[N], i;
    for(i = abs(k); i < N; i++)
        a[i] = a[i+k] + 1;
    return 0;
}
| Recommendations |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source Loop | #pragma simd or #pragma omp simd | !DIR$ SIMD or !$OMP SIMD |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source Loop | #pragma ivdep | !DIR$ IVDEP |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source Loop | #pragma simd vectorlength(k) | !DIR$ SIMD VECTORLENGTH(k) |
Causes:
// Stores b[i] * c[i] into a[i] for i = 0..99, stopping right after the first
// product that is negative (that product is still stored).
void no_vec(float a[], float b[], float c[])
{
    int i = 0;
    while (i < 100) {
        a[i] = b[i] * c[i];
        // this is a data-dependent exit condition:
        if (a[i] < 0.0)
            break;
        ++i;
    }
}
Exception: Loops searching for an array element, as in the example below, can be automatically vectorized when array a[i] is aligned.
// Search idiom: record the position of the first element equal to to_find
// and leave the loop.
for (i = 0; i < n; ++i) {
    if (a[i] == to_find) {
        index = i;
        break;
    }
}
C++ Example 2: A SIMD loop uses C++ exception handling or an OpenMP critical construct.
#define N 1000
// Every iteration of the simd loop throws the int 11 and catches it again.
// A caught value other than 11 prints a failure message and exits from
// inside an omp critical section. After the loop the function prints the
// pass message and calls exit(0) instead of returning.
int foo() {
#pragma omp simd
for (int i = 0; i < N; i++) {
try {
printf ("throw exception 11\n");
throw 11;
}
catch (int t) {
printf ("caught exception %d\n", t);
if (t != 11) {
#pragma omp critical
{
printf ("TEST FAILED\n");
exit (0);
}
}
}
}
printf ("TEST PASSED\n");
exit (0);
}
C++ Example 3: The compiler cannot determine which function is passed as a function parameter.
#include <iostream>
int a[100];
int b[100];
// Returns element i of the file-scope array b plus y.
int g(int i, int y) {
return b[i]+y;
}
// Calls the function parameter x(i, y) for i = 0..99 and stores each result
// in a[i]; the callee is only known through the parameter x.
__declspec(noinline) void doit1(int x(int,int), int y) {
int i;
#pragma parallel
for(i = 0; i < 100; i++)
a[i] = x(i,y);
}
| Recommendations |
Causes:
! Computes c(i) = sqrt(a(i)) * b(i) for i = 1..n, leaving the loop at the
! first negative a(i).
subroutine d_15043(a,b,c,n)
implicit none
real, intent(in ), dimension(n) :: a, b
real, intent(out), dimension(n) :: c
integer, intent(in) :: n
integer :: i
do i=1,n
if(a(i) < 0.) exit
c(i) = sqrt(a(i)) * b(i)
enddo
end subroutine d_15043
Fortran Example 2: The iteration count is data dependent.
! Computes c(i) = sqrt(a(i)) * b(i) from the first element onward for as long
! as a(i) > 0, so the iteration count depends on the data in a.
! NOTE(review): the loop condition does not compare i against n.
subroutine d_15043_2(a,b,c,n)
  implicit none
  real, intent(in ), dimension(n) :: a, b
  real, intent(out), dimension(n) :: c
  integer, intent(in) :: n
  integer :: i
  ! a, b and c are declared with bounds 1..n, so the scan starts at 1.
  i = 1
  do while (a(i) > 0.)
    c(i) = sqrt(a(i)) * b(i)
    i = i + 1
  enddo
end subroutine d_15043_2
Fortran Example 3: The loop contains a subroutine or function that prevents vectorization.
! Calls my_sub(a(i), b(i), c(i)) once per element, i = 1..n.
subroutine d_15043_3(a,b,c,n)
implicit none
real, intent(in ), dimension(n) :: a, b
real, intent(out), dimension(n) :: c
integer, intent(in) :: n
integer :: i
do i=1,n
call my_sub(a(i),b(i),c(i))
enddo
end subroutine d_15043_3
| Recommendations |
! Visits all n elements; c(i) is assigned only where a(i) > 0.
do i=1,n
if(a(i) > 0.) c(i) = sqrt(a(i)) * b(i)
enddo
If necessary, the iteration count can be pre-computed.
Read More:
Cause: The compiler detected or assumed a vector dependence in the loop.
C++ Example:
// Scans A[0..n-1] keeping a running maximum; returns i*i for the last index
// i at which the maximum was raised, or 0 if it never was.
int foo(float *A, int n) {
int inx = 0;
float max = A[0];
int i;
for (i=0;i < n;i++) {
if (max < A[i]) {
max = A[i];
inx = i*i;
}
}
return inx;
}
Fortran Example:
! Scans a(1..n) keeping a running maximum; returns i*i for the last index i
! at which the maximum was raised, or 0 if it never was.
integer function foo(a, n)
  implicit none
  integer, intent(in) :: n
  real, intent(inout) :: a(n)
  real :: max
  integer :: inx, i
  ! a is declared a(n), so its first element is a(1), not a(0).
  max = a(1)
  ! Gives the result a defined value when the condition below never holds.
  inx = 0
  do i=1,n
    if (max < a(i)) then
      max = a(i)
      inx = i*i
    endif
  end do
  foo = inx
end function
| Recommendations |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source Loop | #pragma simd or #pragma omp simd | !DIR$ SIMD or !$OMP SIMD |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source Loop | #pragma ivdep | !DIR$ IVDEP |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source Loop | #pragma simd vectorlength(k) | !DIR$ SIMD VECTORLENGTH(k) |
Causes:
#include <iostream>
#include <complex>
using namespace std;
// Zero-fills c with an array-notation assignment, then prints each element
// on its own line.
int main() {
    float c[10];
    c[:] = 0.f;
    for(int i = 0; i < 10; i++)
        cout << c[i] << "\n";
    return 0;
}
| Recommendations |
| Target | ICL/ICC/ICPC Construct |
|---|---|
| Source function | #pragma omp declare simd |
| Source function | __declspec(vector) (Windows OS) or __attribute__((vector)) (Linux OS) |
Cause: A function call inside the loop is preventing auto-vectorization.
Fortran Example:
! Accumulates bar(x, xp) into sumx for x = 1.D-8 * i, i = 1..nx, with xp
! fixed at 1, then prints the sum. bar is declared through the interface
! block.
Program foo
implicit none
integer, parameter :: nx = 100000000
real(8) :: x, xp, sumx
integer :: i
interface
real(8) function bar(x, xp)
real(8), intent(in) :: x, xp
end
end interface
sumx = 0.
xp = 1.
do i = 1,nx
x = 1.D-8*real(i,8)
sumx = sumx + bar(x,xp)
enddo
print *, 'Sum =',sumx
end
! Evaluates a fourth-degree polynomial in (x - xp) and divides the result by
! sqrt(x**2 + xp**2).
real(8) function bar(x, xp)
implicit none
real(8), intent(in) :: x, xp
bar = 1. - 2.*(x-xp) + 3.*(x-xp)**2 - 1.5*(x-xp)**3 + 0.2*(x-xp)**4
bar = bar / sqrt(x**2 + xp**2)
end
| Recommendations |
| Target | IFORT Construct |
|---|---|
| Source function | !$OMP DECLARE SIMD |
| Source function | ELEMENTAL keyword or !DIR$ ATTRIBUTES VECTOR |
! Evaluates a fourth-degree polynomial in (x - xp) and divides the result by
! sqrt(x**2 + xp**2). The OMP DECLARE SIMD directive names bar and declares
! xp UNIFORM.
real(8) function bar(x, xp)
!$OMP DECLARE SIMD (bar) UNIFORM(xp)
implicit none
real(8), intent(in) :: x, xp
bar = 1. - 2.*(x-xp) + 3.*(x-xp)**2 - 1.5*(x-xp)**3 + 0.2*(x-xp)**4
bar = bar / sqrt(x**2 + xp**2)
end
The code now generates a vectorized version of function bar(); however, the loop inside foo is still not vectorized because the compiler sees dependencies between loop iterations carried by both x and sumx. Declaring x private and sumx a reduction on the loop removes them:
! Accumulates bar(x, xp) into sumx for x = 1.D-8 * i, i = 1..nx, with xp
! fixed at 1, then prints the sum. The loop carries an OMP SIMD directive
! that makes x private and sumx a + reduction.
Program foo
  implicit none
  integer, parameter :: nx = 100000000
  real(8) :: x, xp, sumx
  integer :: i
  interface
    real(8) function bar(x, xp)
      !$OMP DECLARE SIMD (bar) UNIFORM(xp)
      real(8), intent(in) :: x, xp
    end
  end interface
  sumx = 0.
  xp = 1.
  !$OMP SIMD private(x) reduction(+:sumx)
  do i = 1,nx
    x = 1.D-8*real(i,8)
    sumx = sumx + bar(x,xp)
  enddo
  print *, 'Sum =',sumx
end
The loop now vectorizes successfully, and running the application shows a performance speedup.
Causes:
// Runs the inner fill loop 90 times. The inner loop's upper bound is the call
// bar(int(A[0])) written directly in the loop condition, and the loop body
// writes to A.
void foo(float *A) {
int i;
int OuterCount = 90;
while (OuterCount > 0) {
for (i = 1; i < bar(int(A[0])); i++) {
A[i] = i + 4;
}
OuterCount--;
}
}
C++ Example 2: The compiler cannot determine if there is aliasing between all the pointers used inside the loop and loop boundaries.
struct Dim { int x, y, z; };
Dim dim;
double* B;
// Copies B[0 .. dim.x-1] into A. The bound dim.x and the source pointer B
// are file-scope objects rather than parameters.
void foo (double* A) {
for (int i = 0; i < dim.x; i++) {
A[i] = B[i];
}
}
| Recommendations |
// Calls bar(int(A[0])) once, keeps the result in limit, and uses limit as
// the inner loop bound for all 90 outer iterations.
void foo(float *A) {
int i;
int OuterCount = 90;
int limit = bar(int(A[0]));
while (OuterCount > 0) {
for (i=1; i < limit; i++) {
A[i] = i + 4;
}
OuterCount--;
}
}
| Target | ICL/ICC/ICPC Directive |
|---|---|
| Source loop | #pragma simd or #pragma omp simd |
| Target | ICL/ICC/ICPC Directive |
|---|---|
| Source loop | #pragma ivdep |
Cause: The loop iteration count is not available before the loop executes.
Fortran Example:
! Zeroes elements of a in a goto-based loop that repeats while i < bar();
! bar() is called again on every pass, so the trip count is not known before
! the loop starts.
! NOTE(review): i starts at 0, so the first store goes to a(0), which is
! outside the declared bounds a(n) - confirm the intended start index.
subroutine foo(a, n)
implicit none
integer, intent(in) :: n
double precision, intent(inout) :: a(n)
integer :: bar
integer :: i
i=0
100 CONTINUE
a(i)=0
i=i+1
if (i < bar()) goto 100
end subroutine foo
| Recommendations |
Cause: Any usage of volatile variables in the loop causes this diagnostic.
C++ Example:
// Each iteration reads the volatile x to compute a[i], then writes a[i]
// back into x.
volatile int32_t x;
int32_t a[c_size];
for (int32_t i = 0; i < c_size; ++i) {
a[i] = exp(x + i);
x = a[i];
}
| Recommendations |
Cause: Internal time limits for the optimization level prevented the compiler from determining a vectorization approach for this loop.
| Recommendations |
Cause: The inner loop has an irregular structure. For example, it may have non-constant lower and higher bounds, a non-constant step for iterations, more than one entry, some assembly parts, volatile variables, long jumps, or complex switch clauses.
| Recommendations |
Cause: The compiler vectorizer determined outer loop vectorization is not possible using auto-vectorization.
C++ Example:
// For each row i, copies b[i][0..N-1] into a[i][0..N-1] through the row
// pointers ap and bp. The ivdep pragma is placed on the outer loop.
void foo(float **a, float **b, int N) {
int i, j;
#pragma ivdep
for (i = 0; i < N; i++) {
float *ap = a[i];
float *bp = b[i];
for (j = 0; j < N; j++) {
ap[j] = bp[j];
}
}
}
Fortran Example:
! Sets a(j,i) = a(j-1,i) + 1 for j = 1..n inside each i = 1..n, so every
! element depends on the one stored just before it in the inner loop.
! NOTE(review): for j = 1 this reads a(0,i), outside the declared bounds
! a(n,n1) - confirm the intended lower bound of j.
subroutine foo(a, n1, n)
implicit none
integer, intent(in) :: n, n1
real, intent(inout) :: a(n,n1)
integer :: i, j
do i=1,n
do j=1,n
a(j,i) = a(j-1,i)+1
end do
end do
end subroutine foo
| Recommendations |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source Loop | #pragma simd or #pragma omp simd | !DIR$ SIMD or !$OMP SIMD |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source Loop | #pragma ivdep | !DIR$ IVDEP |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Source Loop | #pragma simd vectorlength(k) | !DIR$ SIMD VECTORLENGTH(k) |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Inner loop | #pragma novector | !DIR$ NOVECTOR |
| Outer loop | #pragma vector always | !DIR$ VECTOR ALWAYS |
Cause: The inner loop in a nested loop is vectorized.
C++ Example:
#define N 1000
float A[N][N];
// Increments every A[i][j] for 0 <= i < n and 0 <= j < n.
void foo(int n) {
int i,j;
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
A[i][j]++;
}
}
}
Fortran Example:
! Adds 1 to a(j,i) for i = 1..n and j = 1..n; a is declared a(n1,n1).
subroutine foo(a, n1, n)
implicit none
integer, intent(in) :: n, n1
real, intent(inout) :: a(n1,n1)
integer :: i, j
do i=1,n
do j=1,n
a(j,i) = a(j,i) + 1
end do
end do
end subroutine foo
| Recommendations |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Outer loop | #pragma omp simd collapse(n), #pragma omp simd, or #pragma simd | !$OMP SIMD COLLAPSE(n), !$OMP SIMD, or !DIR$ SIMD |
| Target | ICL/ICC/ICPC Directive | IFORT Directive |
|---|---|---|
| Inner loop | #pragma novector | !DIR$ NOVECTOR |
| Outer loop | #pragma vector always | !DIR$ VECTOR ALWAYS |
Cause: The loop lacks sufficient iterations to benefit from vectorization.
C++ Example:
#define TTT char
TTT A[15];
// Returns the sum of A[0..n-1], accumulated in a variable of type TTT.
TTT foo(int n) {
TTT sum=0;
int i;
for (i = 0; i < n; i++) {
sum+=A[i];
}
return sum;
}
Fortran Example:
! Adds the 15 kind=1 integers of A into sum.
integer (kind=1) :: A(15), sum, i
sum=0
do i=1,15
sum=sum+A(i)
end do
| Recommendations |
| Target | ICL/ICC/ICPC Construct | IFORT Construct |
|---|---|---|
| Source loop | #pragma omp simd or #pragma simd | !$OMP SIMD or !DIR$ SIMD |
Cause: The compiler did not recognize a search idiom in a loop that may exit early. For example: The loop body contains:
// Computes c[i] = sqrt(a[i]) * b[i] for i = 0..n-1, leaving the loop at the
// first a[i] < 0.
void c15520(float a[], float b[], float c[], int n)
{
int i;
for(i=0; i<n; i++)
{
if(a[i] < 0.) break;
c[i] = sqrt(a[i]) * b[i];
}
}
Exception
// For Compiler 16.1 and higher this example generates Diagnostic 15333 instead
// f1 is declared as a vector function.
__attribute__((vector)) void f1(double);
// Calls f1 on every element of the 10000-element local array a inside a
// simd loop.
int main()
{
int n = 10000;
double a[n];
#pragma simd
for(int i = 0 ; i < n ; i++)
f1(a[i]);
}
Fortran Example:
! Computes c(i) = sqrt(a(i)) * b(i) for i = 1..n, leaving the loop at the
! first negative a(i).
subroutine f15520(a,b,c,n)
implicit none
real, intent(in ), dimension(n) :: a, b
real, intent(out), dimension(n) :: c
integer, intent(in) :: n
integer :: i
do i=1,n
if(a(i).lt.0.) exit
c(i) = sqrt(a(i)) * b(i)
enddo
end subroutine f15520
| Recommendations |
void c15520(float a[], float b[], float c[])
{
int i, j;
for(i=0; i<1000; i++)
{
if(a[i] < 0.) break;
}
for(j=0; j<i-1; j++)
{
c[j] = sqrt(a[j]) * b[j];
}
}
Mark the function in the loop as nothrow.
// f1 is declared as a vector function that is also marked nothrow.
__attribute__((vector, nothrow)) void f1(double);
// Calls f1 on every element of the 10000-element local array a inside a
// simd loop.
int main()
{
int n = 10000;
double a[n];
#pragma simd
for(int i = 0 ; i < n ; i++)
f1(a[i]);
}
Fortran Example:
Split the loop into a search loop and computational loop.
! Split form: the first loop only looks for the first negative a(i), the
! second loop computes c(j) = sqrt(a(j)) * b(j) for j = 1..i-1.
subroutine f15520(a,b,c,n)
implicit none
real, intent(in ), dimension(n) :: a, b
real, intent(out), dimension(n) :: c
integer, intent(in) :: n
integer :: i, j
do i=1,n
if(a(i).lt.0.) exit
enddo
do j=1,i-1
c(j) = sqrt(a(j)) * b(j)
enddo
end subroutine f15520
Read More C++ Information:
Read More Fortran Information:
Cause: The compiler automatically generates a try block for a program block (that is, code inside {}) when it allocates a large, local object or array on the heap (because the object is too big to allocate on the stack) and a function within the block could throw an exception.
C++ Example:
// f1 is declared as a vector function.
__attribute__((vector)) void f1(double);
// Calls f1 on every element of the 10000-element local array a inside a
// simd loop.
int main()
{
int n = 10000;
double a[n];
#pragma simd
for(int i = 0 ; i < n ; i++)
f1(a[i]);
}
Cause: The compiler doesn't get enough information from the code to create one version of the loop. In the example below, the compiler takes a defensive stand and generates both vectorized and non-vectorized versions of the loop because it assumes memory aliasing (the pointers could be pointing to overlapping memory locations).
C++ Example:
// Stores a[i] * b[i] into c[i] for i = 0..255; none of the three pointers
// is restrict-qualified.
void foo(float *a, float *b, float *c){
for(int i = 0 ; i < 256; i++)
c[i] = a[i] * b[i];
return;
}
Cause: The compiler doesn't get enough information from the code to create one version of the loop. In the example below, the compiler takes a defensive stand and generates three versions of the loop: for k=0, k>0, and k<0. The version for k<0 cannot be safely vectorized because each later iteration may depend on the result of earlier iterations.
Fortran Example:
! Sets a(i) = a(i+k) * 2.0 for i = 1..20; the offset k is a run-time argument.
! i is not declared and there is no implicit none, so it is implicitly typed.
subroutine add(k, a)
integer :: k
real :: a(20)
DO i = 1, 20
a(i) = a(i+k) * 2.0
end do
end subroutine add