Intel® C++ Compiler 16.0 User and Reference Guide
Moving the block of code that consists of a function-call (line %d), if-condition (line %d), and an early return (line %d) to outside the loop may enable parallelization of the loop at line %d.
Move the function call and an associated return from inside the loop (perhaps by inserting them before the loop) to help parallelize the loop.
This kind of function-leading-to-return inside a loop usually handles some error condition inside the loop. If this error check can be done before starting the execution of the loop without changing the program semantics, the compiler may be able to parallelize the loop, thus improving performance.
Consider the following:
extern int num_nodes;

typedef struct TEST_STRUCT {
    // Coordinates of city1
    float latitude1;
    float longitude1;
    // Coordinates of city2
    float latitude2;
    float longitude2;
} test_struct;

extern int *mark_larger;
extern float *distances, **matrix;
extern test_struct** nodes;
extern test_struct ***files;
extern void init_node(test_struct *node, int i);
extern void process_nodes(int width); /* prototype agrees with the definition below */
float compute_max_distance(void);
extern int check_error_condition(int width);

#include <math.h>
#include <stdio.h>

/*
 * For each of the num_nodes entries of files[0]: compute a distance from the
 * entry's two coordinate pairs, fill matrix[k][0..width-1], and overwrite
 * distances[k] when the new distance exceeds it. Returns early, mid-loop,
 * as soon as check_error_condition(width) is nonzero.
 */
void process_nodes(int width) {
    float const R = 3964.0;
    float temp, lat1, lat2, long1, long2, result, pat2;
    int m, j, temp1 = num_nodes;
    nodes = files[0];
    m = 1;
#pragma loop count min(4)
#pragma parallel
    for (int k=0; k < temp1; k++) {
        /* Function call guarding an early return inside the loop: this is
         * the pattern the diagnostic reports. The argument does not depend
         * on k, but the call and return are still executed per iteration. */
        if (check_error_condition(width)) {
            return;
        }

        lat1 = nodes[k]->latitude1;
        lat2 = nodes[k]->latitude2;
        long1 = nodes[k]->longitude1;
        long2 = nodes[k]->longitude2;

        // Compute the distance between the two cities
        temp = sin(lat1) * sin(lat2) +
               cos(lat1) * cos(lat2) * cos(long1-long2);
        result = 2.0 * R * atan(sqrt((1.0-temp)/(1.0+temp)));

        /* Sum distances[0..width-1] and fill row k of matrix. */
        pat2 = 0;
        for(j=0; j<width; j++) {
            pat2 += distances[j];
            matrix[k][j] = distances[k]+j;
        }

        // Store the distance computed in the distances array
        if (result > distances[k]) {
            distances[k] = result + pat2;
        }
    }
}
In this case, the compiler is unable to parallelize the loop at line 38.
If you determine it is safe to do so, you can modify the above code as follows:
extern int num_nodes;

typedef struct TEST_STRUCT {
    // Coordinates of city1
    float latitude1;
    float longitude1;
    // Coordinates of city2
    float latitude2;
    float longitude2;
} test_struct;

extern int *mark_larger;
extern float *distances, **matrix;
extern test_struct** nodes;
extern test_struct ***files;
extern void init_node(test_struct *node, int i);
extern void process_nodes(int width); /* prototype agrees with the definition below */
float compute_max_distance(void);
extern int check_error_condition(int width);

#include <math.h>
#include <stdio.h>

/*
 * For each of the num_nodes entries of files[0]: compute a distance from the
 * entry's two coordinate pairs, fill matrix[k][0..width-1], and overwrite
 * distances[k] when the new distance exceeds it. Calls
 * check_error_condition(width) once up front and returns before the loop
 * runs if it is nonzero.
 */
void process_nodes(int width) {
    float const R = 3964.0;
    float temp, lat1, lat2, long1, long2, result, pat2;
    int m, j, temp1 = num_nodes;
    nodes = files[0];
    m = 1;

    /* Error check performed once, ahead of the loop, so the loop body has
     * no function-call-plus-return exit. Note that the check now runs even
     * when temp1 <= 0 and the loop would execute zero times. */
    if (check_error_condition(width)) {
        return;
    }

#pragma loop count min(4)
#pragma parallel
    for (int k=0; k < temp1; k++) {
        lat1 = nodes[k]->latitude1;
        lat2 = nodes[k]->latitude2;
        long1 = nodes[k]->longitude1;
        long2 = nodes[k]->longitude2;

        // Compute the distance between the two cities
        temp = sin(lat1) * sin(lat2) +
               cos(lat1) * cos(lat2) * cos(long1-long2);
        result = 2.0 * R * atan(sqrt((1.0-temp)/(1.0+temp)));

        /* Sum distances[0..width-1] and fill row k of matrix. */
        pat2 = 0;
        for(j=0; j<width; j++) {
            pat2 += distances[j];
            matrix[k][j] = distances[k]+j;
        }

        // Store the distance computed in the distances array
        if (result > distances[k]) {
            distances[k] = result + pat2;
        }
    }
}
Confirm that the function call does not rely on any computation inside the loop and that restructuring the code as suggested above retains the original program semantics.