MPI uses more processors but costs more time

Dear all,
I am trying to use MPI to parallelize my program, mainly to reduce its running time, but I have found that as the number of processors increases, the run time increases as well. I cannot solve this problem, so I would like to ask for your advice.
This is my 8-CPU code:

// 8 processors
if (mpisize != 8) {
    cout << " sorry, number of processors != 8 " << endl;
    exit(1);
}
verbosity = 3;

int n = 48;            // z-planes z0-z47
real[int] z(n);        // z array
real h = 1./(n-1);
for (int i = 0; i <= n-1; i++)
    { z[i] = i*h; }    // define the grid points in a loop

mesh Th = square(n-1, n-1, [x, y]);
real m = (n-1)*(n-1);  // 1/h^2

fespace Vh(Th, P2);
Vh u, v, f, yyy, zzz, hhhh;

problem Poisson(u, v) =   // Poisson equation, defined in variational form
    int2d(Th)(dx(u)*dx(v) + dy(u)*dy(v) + 2*m*u*v)          // u is the unknown function, 2D here
    - int2d(Th)(f*v + m*(yyy+zzz)*v) + on(1, 2, 3, 4, u=0); // yyy stands for u[i-1], zzz for u[i+1]

Vh[int] up(n+1), un(n+1);
for (int i = 0; i <= n; i++) {
    un[i] = 0;
}

// initial sweep
if (mpirank == 0) {
    for (int i = 1; i < 6; i++) {
        f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
        yyy = 0; zzz = 0;   // for each u[i], u[i-1] is known
        Poisson; un[i] = u;
        up[i] = un[i];
    }
}
else if (mpirank == 7) {
    for (int i = 42; i < 47; i++) {
        f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
        yyy = 0; zzz = 0;
        Poisson; un[i] = u;
        up[i] = un[i];
    }
}
else {
    for (int i = mpirank*6; i < (mpirank+1)*6; i++) {
        f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
        yyy = 0; zzz = 0;
        Poisson; un[i] = u;
        up[i] = un[i];
    }
}

//mpiBarrier(mpiCommWorld);
// first iteration *********************************************
int k = 1;
for (int jj = 0; jj < 900; jj++) {
    // exchange boundary slices with the neighbouring ranks
    if (mpirank < 7) {
        processor(mpirank+1) << un[5+mpirank*6][];
        processor(mpirank+1) >> hhhh[];
        up[(mpirank+1)*6] = hhhh;
    }
    if (mpirank > 0) {
        processor(mpirank-1) << un[mpirank*6][];
        processor(mpirank-1) >> hhhh[];
        up[mpirank*6-1] = hhhh;
    }

    /* processor(1-mpirank) << un[0.5*n+mpirank-1][];
       processor(1-mpirank) >> hhhh[];
       up[0.5*n-mpirank] = hhhh; */

    // solve
    if (mpirank == 0) {
        for (int i = 1; i < 6; i++) {
            f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
            yyy = up[i-1]; zzz = up[i+1];
            Poisson; un[i] = u;
        }
    }
    else if (mpirank == 7) {
        for (int i = 42; i < 47; i++) {
            f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
            yyy = up[i-1]; zzz = up[i+1];
            Poisson; un[i] = u;
        }
    }
    else {
        for (int i = mpirank*6; i < (mpirank+1)*6; i++) {
            f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
            yyy = up[i-1]; zzz = up[i+1];
            Poisson; un[i] = u;
        }
    }

    /* for (int i = mpirank*0.5*n+1-mpirank; i < mpirank*0.5*n+0.5*n-mpirank; i++) {
        f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
        yyy = up[i-1]; zzz = up[i+1];  // for each u[i], u[i-1] is known
        Poisson; un[i] = u;
        //up[i] = un[i];
    } */

    // error
    real[int] error(8);
    for (int i = mpirank*6; i < (mpirank+1)*6; i++) {
        error[mpirank] = error[mpirank] + int2d(Th)((un[i]-up[i])^2)*h;
        //cout << "i*****" << i << endl;
    }
    for (int i = 0; i < 8; i++) {
        broadcast(processor(i), error[i]);
    }
    real err, error2;
    /* for (int i = 0; i < 8; i++) {
        error2 = error2 + error[i];
    } */
    err = sqrt(error[0]+error[1]+error[2]+error[3]+error[4]+error[5]+error[6]+error[7]);
    cout << " **********************err = " << err << " *********** " << endl;
    if (err < 0.00001) break;

    // update
    for (int i = mpirank*6; i < (mpirank+1)*6; i++) {
        up[i] = un[i];
    }
    k++;
    cout << "***************" << k << endl;
}

and this is my 16-CPU code:

// 16 processors
// 2023-05-01 17:13:16

if (mpisize != 16) {
    cout << " sorry, number of processors != 16 " << endl;
    exit(1);
}
verbosity = 3;

int n = 48;            // z-planes z0-z47
real[int] z(n);        // z array
real h = 1./(n-1);
for (int i = 0; i <= n-1; i++)
    { z[i] = i*h; }    // define the grid points in a loop

mesh Th = square(n-1, n-1, [x, y]);
real m = (n-1)*(n-1);  // 1/h^2

fespace Vh(Th, P2);
Vh u, v, f, yyy, zzz, hhhh;

problem Poisson(u, v) =   // Poisson equation, defined in variational form
    int2d(Th)(dx(u)*dx(v) + dy(u)*dy(v) + 2*m*u*v)          // u is the unknown function, 2D here
    - int2d(Th)(f*v + m*(yyy+zzz)*v) + on(1, 2, 3, 4, u=0); // yyy stands for u[i-1], zzz for u[i+1]

Vh[int] up(n+1), un(n+1);
for (int i = 0; i <= n; i++) {
    un[i] = 0;
}

// initial sweep
if (mpirank == 0) {
    for (int i = 1; i < 3; i++) {
        f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
        yyy = 0; zzz = 0;   // for each u[i], u[i-1] is known
        Poisson; un[i] = u;
        up[i] = un[i];
    }
}
else if (mpirank == 15) {
    for (int i = 45; i < 47; i++) {
        f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
        yyy = 0; zzz = 0;
        Poisson; un[i] = u;
        up[i] = un[i];
    }
}
else {
    for (int i = mpirank*3; i < (mpirank+1)*3; i++) {
        f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
        yyy = 0; zzz = 0;
        Poisson; un[i] = u;
        up[i] = un[i];
    }
}

//mpiBarrier(mpiCommWorld);
// first iteration *********************************************
int k = 1;
for (int jj = 0; jj < 1000; jj++) {
    // exchange boundary slices with the neighbouring ranks
    if (mpirank < 15) {
        processor(mpirank+1) << un[2+mpirank*3][];
        processor(mpirank+1) >> hhhh[];
        up[(mpirank+1)*3] = hhhh;
    }
    if (mpirank > 0) {
        processor(mpirank-1) << un[mpirank*3][];
        processor(mpirank-1) >> hhhh[];
        up[mpirank*3-1] = hhhh;
    }

    /* processor(1-mpirank) << un[0.5*n+mpirank-1][];
       processor(1-mpirank) >> hhhh[];
       up[0.5*n-mpirank] = hhhh; */

    // solve
    if (mpirank == 0) {
        for (int i = 1; i < 3; i++) {
            f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
            yyy = up[i-1]; zzz = up[i+1];
            Poisson; un[i] = u;
        }
    }
    else if (mpirank == 15) {
        for (int i = 45; i < 47; i++) {
            f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
            yyy = up[i-1]; zzz = up[i+1];
            Poisson; un[i] = u;
        }
    }
    else {
        for (int i = mpirank*3; i < (mpirank+1)*3; i++) {
            f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
            yyy = up[i-1]; zzz = up[i+1];
            Poisson; un[i] = u;
        }
    }

    /* for (int i = mpirank*0.5*n+1-mpirank; i < mpirank*0.5*n+0.5*n-mpirank; i++) {
        f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
        yyy = up[i-1]; zzz = up[i+1];  // for each u[i], u[i-1] is known
        Poisson; un[i] = u;
        //up[i] = un[i];
    } */

    // error
    real[int] error(16);
    for (int i = mpirank*3; i < (mpirank+1)*3; i++) {
        error[mpirank] = error[mpirank] + int2d(Th)((un[i]-up[i])^2)*h;
        //cout << "i*****" << i << endl;
    }
    for (int i = 0; i < 16; i++) {
        broadcast(processor(i), error[i]);
    }
    real err, error2;
    for (int i = 0; i < 16; i++) {
        error2 = error2 + error[i];
    }
    err = sqrt(error2);

    cout << " **********************err = " << err << " *********** " << endl;
    if (err < 0.00001) break;

    // update
    for (int i = mpirank*3; i < (mpirank+1)*3; i++) {
        up[i] = un[i];
    }
    k++;
    cout << "***************" << k << endl;
}



These are my CPU times, for n = 48:

8 processors:
**********************err = 9.9474e-06 ***********
times: compile 0.272s, execution 1177.91s, mpirank:7

16 processors:
CodeAlloc : nb ptr 4745, size :566376 mpirank: 15
######## We forget of deleting 63955 Nb pointer, 0Bytes , mpirank 3, memory leak =0
**********************err = 9.9474e-06 ***********
times: compile 0.063s, execution 1632.11s, mpirank:15

That is to be expected. You should not use MPI for such a distribution of the workload, but instead use it to parallelize in space, along the lines of the sketch below.
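For context, "parallelize in space" means keeping one global mesh, distributing it across the ranks, and solving a single system with a domain-decomposition preconditioner. Here is a minimal sketch using FreeFEM's PETSc interface, for illustration only; it assumes a FreeFEM build with PETSc, a unit right-hand side, and a run such as ff-mpirun -np 8 script.edp:

load "PETSc"                 // PETSc interface shipped with FreeFEM
macro dimension()2// EOM     // tells macro_ddm.idp we work in 2D
include "macro_ddm.idp"

mesh Th = square(200, 200);  // global mesh, distributed over the ranks
Mat A;
createMat(Th, A, P2)         // build the distributed matrix structure
fespace Vh(Th, P2);
varf vPoisson(u, v) = int2d(Th)(dx(u)*dx(v) + dy(u)*dy(v))
                    + int2d(Th)(v)           // right-hand side f = 1, for illustration
                    + on(1, 2, 3, 4, u = 0);
A = vPoisson(Vh, Vh);        // assemble the local contribution
real[int] rhs = vPoisson(0, Vh);
set(A, sparams = "-ksp_type cg -pc_type gamg");  // CG + algebraic multigrid
Vh u;
u[] = A^-1 * rhs;            // parallel solve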

I didn't actually look at the details, but I don't see a problem with splitting up tasks that don't need to interact much; that should be ideal for parallelism. The problem occurs when the tasks are small and the overhead is large, or when there are not enough physical cores available. For smaller problems you may benefit from multiple threads, as they share an address space and you don't have to do a lot of copies, messaging, etc.

It's not hard to make parallelism counterproductive. To investigate this case I would try to make the code more flexible and pick the mpirank assignments as a function of mpisize (or whatever the variable is for the number of processes). Something like (n % mpisize) == mpirank, I guess; see the sketch below.
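To make that concrete, here is a hypothetical sketch reusing n, z, f, Poisson, and un from the posted script: slice i is owned by rank i % mpisize, so the same script runs unchanged for any number of processes.

for (int i = 1; i < n-1; i++) {       // interior slices only
    if (i % mpisize == mpirank) {     // cyclic ownership: rank = i % mpisize
        f = 3*pi*pi*sin(pi*x)*sin(pi*y)*sin(pi*z[i]);
        Poisson; un[i] = u;           // solve the 2D problem for slice i
    }
}

Note that with a cyclic assignment the neighbours u[i-1] and u[i+1] live on other ranks, so for the Jacobi scheme in this thread a contiguous block per rank keeps the exchange smaller; the point here is only to make the distribution a function of mpisize instead of hard-coding 8 or 16.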

Thank you very much, but I don't understand your words. My work is to convert a 3D Poisson equation into 48 2D Poisson equations and solve them iteratively. I try to solve several 2D Poisson equations with each processor, so the slice solutions need to be transferred between the individual processors. I am sorry, I do not know how to parallelize in space :dizzy_face:

Here is one tutorial: Pierre Jolivet introduction to parallel FreeFEM part 1 - YouTube. Furthermore, you can solve the Poisson equation extremely efficiently with multigrid methods. This is not possible with the problem keyword; you'll have to switch to a varf, as in the sketch below.
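To illustrate the switch, here is a hedged sketch of the posted problem rewritten with varf (names as in the original script). The immediate gain, even without multigrid, is that the matrix does not change between Jacobi iterations, so it can be assembled and factorized once, whereas the problem keyword re-assembles and re-solves from scratch at every call:

// bilinear form: constant across iterations
varf vA(uh, vh) = int2d(Th)(dx(uh)*dx(vh) + dy(uh)*dy(vh) + 2*m*uh*vh)
                + on(1, 2, 3, 4, uh = 0);
// linear form: rebuilt each iteration, since f, yyy, zzz change
varf vL(uh, vh) = int2d(Th)(f*vh + m*(yyy+zzz)*vh)
                + on(1, 2, 3, 4, uh = 0);
matrix A = vA(Vh, Vh, solver = sparsesolver);  // factorized once, outside the loop
// ... then, inside the Jacobi loop, per slice:
real[int] b = vL(0, Vh);
u[] = A^-1 * b;                                // reuses the factorization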

I guess ultimately you need to look at the algorithm and estimate how much work can be done independently and how long it takes compared to the overhead. With shared memory the tradeoffs can be a lot different. But even there it's not hard for threads to wipe out each other's memory caches, etc.

Hello, everyone.
My job was to convert the 3D Poisson equation into a series of 2D Poisson equations. In my code, I converted the 3D equation into 48 2D Poisson equations, which are solved iteratively. Solving each equation requires the two neighbouring slice solutions from the previous iteration, i.e. ordinary Jacobi iteration. I split the equations between different processors, so two-dimensional solutions need to be passed between processors: each processor that is not at the head or the tail has to send and receive one two-dimensional solution on each side.


h is a constant; here n = 47, f is given, and the initial iteration value is set to 0.

[image: the semi-discretized equation for each z-slice]
For example, to solve u2 we need to know u1 and u3; we just plug in the values from the last iteration. Suppose each processor solves six equations. For a slice on a processor boundary such as u5 (counting from 0), this processor needs to receive the previous iteration's u6 from the next processor, and the next processor needs to receive u5 from this processor in order to solve u6. (processor 0: u0, u1, u2, u3, u4, u5; processor 1: u6, u7, u8, u9, u10, u11; processor 2: u12, u13, u14, u15, u16, u17; ...; processor 7: u42, u43, u44, u45, u46, u47.)

For processor 1, solving u11 requires receiving u12, solved in the previous iteration and sent by processor 2; that is the principle. Finally, an error tolerance is set to terminate the iteration.
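As a side note, the halo exchange described above can be written once for any number of ranks instead of hard-coding 8 or 16. A hypothetical sketch, assuming mpisize divides n and reusing un, up, hhhh from the posted script:

int slices = n / mpisize;        // contiguous block of slices per rank
int lo = mpirank*slices;         // first slice owned by this rank
int hi = lo + slices - 1;        // last slice owned by this rank
if (mpirank < mpisize-1) {       // exchange with the right neighbour
    processor(mpirank+1) << un[hi][];
    processor(mpirank+1) >> hhhh[];
    up[hi+1] = hhhh;
}
if (mpirank > 0) {               // exchange with the left neighbour
    processor(mpirank-1) << un[lo][];
    processor(mpirank-1) >> hhhh[];
    up[lo-1] = hhhh;
}

The per-rank error sums could likewise be combined with a single mpiAllReduce(local, global, mpiCommWorld, mpiSUM) instead of the loop of broadcasts.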

This is a very inefficient strategy.


Thank you. I will study the video you recommended. I need a VPN to watch YouTube videos in my country, and I could not watch it yesterday. I have a VPN today, so I am going to learn from it.