Strange behaviour in OpenMP nested loop


In the following program I get different results (serial vs. OpenMP). What is the reason? At the moment I can only think that perhaps the loop is too "large" for the threads, or that I should write it in another way, but I am not sure. Any hints?

compilation: g++-4.2 -fopenmp main.c functions.c -o main_elec_gcc.exe

    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    #include <omp.h>
    #include <math.h>

    #define nrack 64
    #define nstars 1024

    double mysumallatomic_serial(float rocks[nrack][3], float moon[nstars][3],
                                 float qr[nrack], float ql[nstars]) {
        int j, i;
        float temp_div = 0., temp_sqrt = 0.;
        float difx, dify, difz;
        float mod2x, mod2y, mod2z;
        double s2 = 0.;
        for(j = 0; j < nrack; j++){
            for(i = 0; i < nstars; i++){
                difx = rocks[j][0] - moon[i][0];
                dify = rocks[j][1] - moon[i][1];
                difz = rocks[j][2] - moon[i][2];
                mod2x = difx*difx;
                mod2y = dify*dify;
                mod2z = difz*difz;
                temp_sqrt = sqrt(mod2x + mod2y + mod2z);
                temp_div = 1/temp_sqrt;
                s2 += ql[i]*temp_div*qr[j];
            }
        }
        return s2;
    }

    double mysumallatomic(float rocks[nrack][3], float moon[nstars][3],
                          float qr[nrack], float ql[nstars]) {
        float temp_div = 0., temp_sqrt = 0.;
        float difx, dify, difz;
        float mod2x, mod2y, mod2z;
        double s2 = 0.;

    #pragma omp parallel shared(s2)
        for(int j = 0; j < nrack; j++){
            for(int i = 0; i < nstars; i++){
                difx = rocks[j][0] - moon[i][0];
                dify = rocks[j][1] - moon[i][1];
                difz = rocks[j][2] - moon[i][2];
                mod2x = difx*difx;
                mod2y = dify*dify;
                mod2z = difz*difz;
                temp_sqrt = sqrt(mod2x + mod2y + mod2z);
                temp_div = 1/temp_sqrt;
                float myterm = ql[i]*temp_div*qr[j];
    #pragma omp atomic
                s2 += myterm;
            }
        }
        return s2;
    }

    int main(int argc, char *argv[]) {
        float rocks[nrack][3], moon[nstars][3];
        float qr[nrack], ql[nstars];
        int i, j;

        for(j = 0; j < nrack; j++){
            rocks[j][0] = j;
            rocks[j][1] = j + 1;
            rocks[j][2] = j + 2;
            qr[j] = j*1e-4 + 1e-3;
            //qr[j] = 1;
        }

        for(i = 0; i < nstars; i++){
            moon[i][0] = 12000 + i;
            moon[i][1] = 12000 + i + 1;
            moon[i][2] = 12000 + i + 2;
            ql[i] = i*1e-3 + 1e-2;
            //ql[i] = 1;
        }
        printf(" serial: %f\n", mysumallatomic_serial(rocks, moon, qr, ql));
        printf(" openmp: %f\n", mysumallatomic(rocks, moon, qr, ql));
        return(0);
    }

I think you should use a reduction instead of a shared variable and remove the #pragma omp atomic, like this:

    #pragma omp parallel for reduction(+:s2)

It should also work faster, because there is no need for atomic operations, which are quite painful in terms of performance and thread synchronization.
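
For illustration only, here is a minimal sketch of how the parallel function could look with a reduction (the name mysumallreduction is hypothetical, and it assumes the same nrack/nstars definitions and headers as in the question). The temporaries are declared inside the loop, so each thread works on its own copies:

    double mysumallreduction(float rocks[nrack][3], float moon[nstars][3],
                             float qr[nrack], float ql[nstars]) {
        double s2 = 0.;

        // Work-sharing loop with a reduction: each thread accumulates a
        // private partial sum of s2, and OpenMP combines them at the end.
    #pragma omp parallel for reduction(+:s2)
        for(int j = 0; j < nrack; j++){
            for(int i = 0; i < nstars; i++){
                // Loop-local temporaries are private to each thread by construction.
                float difx = rocks[j][0] - moon[i][0];
                float dify = rocks[j][1] - moon[i][1];
                float difz = rocks[j][2] - moon[i][2];
                float temp_sqrt = sqrtf(difx*difx + dify*dify + difz*difz);
                s2 += ql[i]*qr[j]/temp_sqrt;
            }
        }
        return s2;
    }

You would call it from main exactly like mysumallatomic.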

Update

You can also get a difference in the results because of the order of operations: floating-point addition is not associative, so summing the terms in a different order (which is what happens when several threads accumulate partial sums) can give a slightly different total:

\sum_{i=1}^{100} x_i \neq \sum_{i=1}^{50} x_i + \sum_{i=51}^{100} x_i
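
As a quick illustration (not from the original post, just standard C): summing the same float values forward and backward usually prints two slightly different totals, because rounding happens in a different order:

    #include <stdio.h>

    #define N 1000000

    int main(void) {
        static float x[N];
        for(int i = 0; i < N; i++)
            x[i] = 1.0f/(i + 1);          /* terms of very different magnitudes */

        /* Forward sum: small terms are added to an already large total. */
        float forward = 0.f;
        for(int i = 0; i < N; i++)
            forward += x[i];

        /* Backward sum: the small terms are accumulated first. */
        float backward = 0.f;
        for(int i = N - 1; i >= 0; i--)
            backward += x[i];

        /* The two orders typically disagree in the last digits. */
        printf("forward:  %.7f\nbackward: %.7f\n", forward, backward);
        return 0;
    }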

