cdvmLDe (1158334), страница 8
Текст из файла (страница 8)
xi,jnew = ( w / 4 ) * ( xi-1,jnew + xi,j-1new + xi+1,jold + xi,j+1old ) + (1-w) * xi,jold
By using “red-black” coloring of variables each step of SOR consists of two half Jacobi steps. One processes “red” variables and the other processes “black” variables. Coloring of variables allows to overlap calculation and communication.
Example 1. The Gauss elimination algorithm
This is a program to solve a system of linear equations Ax = B by Gauss elimination method. The coefficient matrix A is represented by an array section A[0:N-1][0:N-1], and the vector B is represented by the section A[0:N-1][N] of the same array.
/* GAUSS program */
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define DVM(dvmdir)
#define DO(v,l,h,s) for(v=l; v<=h; v+=s)
#define N 10
int main (int argn, char **args)
{
long i, j, k;
/* declaration of dynamic distributed arrays */
DVM(DISTRIBUTE [BLOCK] []) float (*A)[N+1];
DVM(DISTRIBUTE [BLOCK]) float (*X);
/* creation of arrays */
A = malloc( N*(N+1)*sizeof(float));
X = malloc( N*sizeof(float));
/* initialize array A*/
DVM(PARALLEL [i][j] ON A[i][j])
DO(i,0,N-1,1)
DO(j,0,N,1)
if (i==j || j==N) A[i][j] = 1.f;
else A[i][j]=0.f;
/* elimination */
for (i=0; i<N-1; i++)
{
DVM(PARALLEL [k][j] ON A[k][j]; REMOTE_ACCESS A[i][])
DO (k,i+1,N-1,1)
DO (j,i+1,N,1)
A[k][j] = A[k][j]-A[k][i]*A[i][j]/A[i][i];
}
/* reverse substitution */
X[N-1] = A[N-1][N]/A[N-1][N-1];
for (j=N-2; j>=0; j-=1)
{
DVM(PARALLEL [k] ON A[k][]; REMOTE_ACCESS X[j+1])
DO (k,0,j,1)
A[k][N] = A[k][N]-A[k][j+1]*X[j+1];
X[j]=A[j][N]/A[j][j];
DVM(REMOTE_ACCESS X[j])
printf(“j=%4i X[j]=%3.3E\n”,j,X[j]);
}
return 0;
}
Example 2. Jacobi algorithm
/* JACOBI program */
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define Max(a,b) ((a)>(b)?(a): (b))
#define DVM(dvmdir)
#define DO(v,l,h,s) for(v=l; v<=h; v+=s)
#define L 8
#define ITMAX 20
int i,j,it,k;
double eps;
double MAXEPS = 0.5;
FILE *f;
/* 2D arrays block distributed along 2 dimensions */
DVM(DISTRIBUTE [BLOCK][BLOCK]) double A[L][L];
DVM(ALIGN[i][j] WITH A[i][j]) double B[L][L];
int main(int argn, char **args)
{
/* 2D loop with base array A */
DVM(PARALLEL [i][j] ON A[i][j])
DO(i,0,L-1,1)
DO(j,0,L-1,1)
{A[i][j]=0.;
B[i][j]=1.+i+j;
}
/****** iteration loop *************************/
DO(it,1,ITMAX,1)
{
eps= 0.;
/* Parallel loop with base array A */
/* calculating maximum in variable eps */
DVM(PARALLEL [i][j] ON A[i][j]; REDUCTION MAX(eps))
DO(i,1,L-2,1)
DO(j,1,L-2,1)
{eps = Max(fabs(B[i][j]-A[i][j]),eps);
A[i][j] = B[i][j];
}
/* Parallel loop with base array B and */
/* with prior updating shadow elements of array A */
DVM(PARALLEL[i][j] ON B[i][j]; SHADOW_RENEW A)
DO(i,1,L-2,1)
DO(j,1,L-2,1)
B[i][j] = (A[i-1][j]+A[i+1][j]+A[i][j-1]+A[i][j+1])/4.;
printf(“it=%4i eps=%3.3E\n”, it,eps);
if (eps < MAXEPS) break;
}/*DO it*/
f=fopen("jacobi.dat","wb");
fwrite(B,sizeof(double),L*L,f);
return 0;
}
Example 3. Jacobi Algorithm (asynchronous version)
/* Asynchronous JACOBI */
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define Max(a,b) ((a)>(b)? (a): (b))
#define DVM(dvmdir)
#define DO(v,l,u,s) for(v=l; v<=u; v+=s)
#define L 8
#define ITMAX 20
int i,j,it,k;
double eps;
double MAXEPS = 0.5;
FILE *f;
/* declare groups for shadow and reduction operations */
DVM(SHADOW_GROUP) void *grshad;
DVM(REDUCTION_GROUP) void *emax;
/* 2D arrays block distributed along 2 dimensions */
DVM(DISTRIBUTE [BLOCK][BLOCK]) double A[L][L];
DVM(ALIGN [i][j] WITH A[i][j]) double B[L][L];
int main(int argn, char **args)
{
/* 2D parallel loop with base array A */
DVM(PARALLEL [i][j] ON A[i][j])
DO(i,0,L-1,1)
DO(j,0,L-1,1)
{A[i][j]=0.;
B[i][j]=1.+i+j;}
/* Create group for shadow operation */
DVM(CREATE_SHADOW_GROUP grshad: A);
/************ iteration loop *************************/
DO(it,1,ITMAX,1)
{
eps= 0.;
/* Parallel loop with base array A: */
/* at first elements of array A exported by the */
/* neighbor processor are calculated and sent and */
/* then internal elements of array A are calculated */
DVM(PARALLEL [i][j] ON A[i][j];
SHADOW_START grshad; REDUCTION emax : MAX(eps))
DO(i,1,L-2,1)
DO(j,1,L-2,1)
{eps = max(fabs(B[i][j]-A[i][j]),eps);
A[i][j] = B[i][j];
}
/* Start asynchronous calculation of maximum */
DVM(REDUCTION_START emax);
/* Parallel loop with base array B: */
/* internal elements of array B are calculated at first */
/* then completion of array A shadow edges update is */
/* awaited and the loop iterations, requiring shadow */
/* elements of array A are calculated */
DVM(PARALLEL[i][j] ON B[i][j]; SHADOW_WAIT grshad)
DO(i,1,L-2,1)
DO(j,1,L-2,1)
B[i][j] = (A[i-1][j]+A[i+1][j]+A[i][j-1]+A[i][j+1])/4;
/* Wait for completion of reduction */
DVM(REDUCTION_WAIT emax);
printf( "it=%4i eps=%3.3E\n”, it,eps);
if (eps < MAXEPS) break;
}/*DO it*/
f=fopen("jacobi.dat","wb");
fwrite(B,sizeof(double),L*L,f);
return 0;
}
Example 4. Red-Black Successive Over-Relaxation
The idea of the method is to paint the points of the grid (the variables) in two colors - red and black - as on a chess-board. Non-rectangular index space is defined by the special form of a loop header: DO (var, a+b%2, ub, 2)
/* RED-BLACK program */
#include <stdlib.h>
#include <math.h>
#define DVM(dvmdir)
#define DO(v,l,h,s) for(v=l; v<=h; v+=s)
#define Max(a,b) ((a)>(b)?(a):(b))
#define ITMAX 10
int main(int an, char ** as)
{long i, j, it, irb;
float eps=0.;
float MAXEPS = 0.5E-5;
float w = 0.5;
/* Groups of asynchronous operations */
DVM(SHADOW_GROUP) void *sha;
DVM(REDUCTION_GROUP) void *rg;
int N;
/* redefine N as constant */
#define N 8
DVM(DISTRIBUTE [BLOCK] [BLOCK]) float (*A)[N];
#undef N /* restore N */
/* "Calculate" of array size (=8!). Create array */
N = 8;
A = malloc( N*N*sizeof(float));
/* Specification of members of shadow group */
DVM(CREATE_SHADOW_GROUP sha : A);
/* Initialization: parallel loop with surpassing */
/* calculation and sending of exported data */
DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha)
DO (i,0,N-1,1)
DO (j,0,N-1,1)
if(i==0 || i==N-1 || j==0 || j==N-1) A[i][j]=0.;
else A[i][j] = 1.+i+j;
DVM(SHADOW_WAIT sha);
/* iteration loop */
DO (it,0,ITMAX,1)
{
eps = 0.;
/* Parallel loop with reduction on RED points */
DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha;
REDUCTION rg: MAX(eps) )
DO (i,1,N-2,1)
DO (j,1+i%2,N-2,2)
{float s;
s = A[i][j];
A[i][j] = (w/4)*(A[i-1][j]+A[i+1][j]+
A[i][j-1]+A[i][j+1])+(1-w)*A[i][j];
eps = Max (fabs(s-A[i][j]), eps);
}
/* Start red reduction -- as early as possible */
/* Then wait shadows */
DVM(REDUCTION_START rg);
DVM(SHADOW_WAIT sha);
/* Parallel loop on BLACK points (without reduction) */
DVM(PARALLEL [i][j] ON A[i][j]; SHADOW_START sha)
DO (i,1,N-2,1)
DO (j,1+(i+1)%2,N-2,2)
{A[i][j] = (w/4)*(A[i-1][j]+A[i+1][j]+
A[i][j-1]+A[i][j+1])+(1-w)*A[i][j];
}
/* Wait shadows, then */
/* wait red point's reduction -- as late as possible */
DVM(SHADOW_WAIT sha);
DVM(REDUCTION_WAIT rg);
printf("it=%d eps=%e\n",it,eps);
if (eps < MAXEPS) break;
} /* DO it */
free(A);
return 0;
}
Example 5. Multigrid method program
/* MULTIGRID program */
#include <stdio.h>
#include <stdlib.h>
#define DVM(dvm)
#define DO(v,l,h,s) for(v=(l); v<=(h); v+=(s))
void oper(
DVM(*DISTRIBUTE [*][*][*]) float *AA,
int AAN, int AAM, int AAK,
DVM(*DISTRIBUTE [*][*][*]) float *BB,
int BBN, int BBM, int BBK)
/*Parameters: two distributed 3D arrays and */
/* their actual dimensions */
{
#define AA(i,j,k) AA[((i)*AAM+(j))*AAK+(k)]
#define BB(i,j,k) AA[((i)*BBM+(j))*BBK+(k)]
int i, j,k;
/* Alignment of array BB with array AA using stretching*/
DVM(REALIGN BB[i][j][k] WITH AA[2*i][2*j][2*k] NEW);
/* forming array BB from elements of array AA with even indexes */
DVM(PARALLEL [i][j][k] ON BB[i][j][k])
FOR(i,BBN)
FOR(j,BBM)
FOR(k,BBK)
BB(i,j,k)=AA(i*2,j*2,k*2);
#undef AA
#undef BB
}
int main(int argn, char **args)
{
int N1=8,M1=12,K1=16;
int N2,M2,K2;
int Narr=5,Nit=5;
int grid=0;
int ACM,ACK;
int i,j,k;
int step_grid=1;
/* Up to 20 distributed arrays */
DVM(DISTRIBUTE[BLOCK][BLOCK][]) float *A[20];
/* Pointer to current distributed array */
DVM(*DISTRIBUTE[*][*][*]) float *AC;
/* creation of array A[0] */
A[0]=malloc(N1*M1*K1*sizeof(float));
AC=A[0];
ACM=M1;
ACK=K1;
#define AC(i,j,k) AC[((i)*ACM+(j))*ACK+(k)]
/* initialization of source array */
DVM(PARALLEL [i][j][k] ON AC[i][j][k])
DO(i,0,N1-1,1)
DO(j,0,M1-1,1)
DO(k,0,K1-1,1)
AC(i,j,k)=1.+i+j+k ;
#undef AC
do{
printf("N1=%d M1=%d K1=%d \n”,N1,M1,K1);
N2=N1/2;
M2=M1/2;
K2=K1/2;
grid++;
/* creation of array A[grid] */
A[grid]=malloc(N2*M2*K2*sizeof(float));
oper(A[grid-1],N1,M1,K1,A[grid],N2,M2,K2);
N1=N2;
M1=M2;
K1=K2;
} while (N2>2 && grid<Narr)
for(i=0;i<=grid;i++)
free(A[i]);
return 0;
}
Example 6. Task Parallelism for Multiblock Code
/* PROGRAM TASKS */
/* rectangular grid is distributed on two blocks */
/* */
/* <------- A2,B2 -----> */
/* 0 1 2 ... N2 */
/* <--- A1,B1 --------> */
/* 0 1 ... N1-1 N1 N1+1 ... N1+N2-1 */
/*------------------------------------------------*/
/* 0 | . . . . . ... . */
/* ... | */
/* K-1 | . . . . . ... . */
/**************************************************/
#include <stdlib.h>
#include <math.h>
#define DVM(dvmdir)
#define DO(v,l,h,s) for(v=l; v<=h; v+=s)
#define FOR(v,n) for(v=0; v<n; v++)
#define Max(a,b) ((a)>(b)?(a):(b))
#define NUMBER_OF_PROCESSORS() 1
#define K 8
#define N1 4
#define N2 K-N1
#define ITMAX 10
DVM(PROCESSORS) void * P[NUMBER_OF_PROCESSORS()];
DVM(TASK) void * MB[2];
double eps0;
DVM(DISTRIBUTE) float (*A1)[K], (*A2)[K];
DVM(ALIGN) float (*B1)[K], (*B2)[K];
int main(int argn, char** args)
{
int i,j, it;
int NP;
printf("---------- starting --------------\n");
DVM(DEBUG 1 -d0)
{
NP = NUMBER_OF_PROCESSORS() / 2;
}
DVM(MAP MB[0] ONTO P[0:(NP?NP-1:0)]);
DVM(MAP MB[1] ONTO P[NP:(NP?2*NP-1:NP)]);
A1=malloc((N1+1)*K*sizeof(float));
DVM(REDISTRIBUTE A1[][BLOCK] ONTO MB[0]);
B1=malloc((N1+1)*K*sizeof(float));
DVM(REALIGN B1[i][j] WITH A1[i][j]);
A2=malloc((N2+1)*K*sizeof(float));
DVM(REDISTRIBUTE A2[][BLOCK] ONTO MB[1]);
B2=malloc((N2+1)*K*sizeof(float));
DVM(REALIGN B2[i][j] WITH A2[i][j]);
/* Initialization */
DVM(PARALLEL [i][j] ON A1[i][j])
FOR(i,N1+1)
FOR(j,K)
{if (i==0 || j==0 || j==K-1) {
A1[i][j] = 0.f;