Monday, March 15, 2010

Parallel Source code in C : Matrix-Vector Multiplication

The program implements Matrix Vector Multiplication, C=A*b using MPI. The program uses 9 processes(3*3 mesh) and 2-D partitioning of matrix A. The Matrix A is a 15*15 matrix that has the following elements { 1,2,3...14,15} in all the rows. The elements of the matrix A are locally generated by each processor(5*5 block).The size of the vector b is 15 and all its elements is equal to 1. The vector b is also locally generated in the last column of three processes.
The processors along the last column(=2) send their data to the diagonal processors. The diagonal processors perform a column-wise broadcast. Each processor performs the local matrix multiplication.A sum-reduction is performed along the rows to add up the partial dot-products. The output vector C is stored in the last column of three processes.

#include
#include
#include
#include
#include
#include

main(int argc,char *argv[])
{
int i,j,k,row_offset,nlocal;
int *local_a,*local_b,*px,*product_matrix;
int p,rank,n,sender;
int ROW=0,COL=1;
int npes,dims[2],periods[2],keep_dims[2];
int other_rank,coords[2];
int last_column_coords;
int my2drank,mycoords[2];
int dim_size;
MPI_Comm comm_2d,comm_row,comm_col;
MPI_Status status;

MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&p);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);

n = 15;

nlocal = n/sqrt(p);
dim_size = sqrt(p);

local_a = (int *)malloc((nlocal*nlocal)*sizeof(int));
local_b = (int *)malloc((nlocal)*sizeof(int));
px = (int *)malloc((nlocal)*sizeof(int));
product_matrix = (int *)malloc((nlocal)*sizeof(int));

/*Compute the size of square grid*/
dims[ROW] = dims[COL] = sqrt(p);

/*Set up the catesian topology and get the rank and coordinates of the processin this topology*/
periods[ROW]=periods[COL]=1; /*set the periods for wrap-around connections*/

MPI_Cart_create(MPI_COMM_WORLD,2,dims,periods,1,&comm_2d);
MPI_Comm_rank(comm_2d,&my2drank); /*get the rank in new topology*/
MPI_Cart_coords(comm_2d,my2drank,2,mycoords); /*get the coordinates*/

/*Create row-based topology*/
keep_dims[ROW] = 0;
keep_dims[COL] = 1;
MPI_Cart_sub(comm_2d,keep_dims,&comm_row);

/*create column based topology*/
keep_dims[ROW] = 1;
keep_dims[COL] = 0;
MPI_Cart_sub(comm_2d,keep_dims,&comm_col);

/*************Generate partial matrix ai at each process*******************/
printf("Generating Local Matrix A at process %d\n",rank);

for(i=0;i
{
for(j=0;j
{
local_a[i*nlocal+j]=(mycoords[COL]*nlocal)+j+1;
printf("%d\t",local_a[i*nlocal+j]);
}
printf("\n");
}
/**********************************************************************/

/****Generate partial vector b at the processes in the last column****/
last_column_coords = (dim_size - 1);
if(mycoords[COL] == last_column_coords)
{
printf("Generating Vector B at process %d\n",rank);

for(i=0;i
{
local_b[i]=1;
printf("%d\n",local_b[i]);
}
}
/*Redistribute the vector b. */

/*the processors along the last column send their data to diagonal processors*/
if(mycoords[COL] == last_column_coords && mycoords[ROW] != last_column_coords)
{
coords[ROW] = mycoords[ROW];
coords[COL] = mycoords[ROW];
MPI_Cart_rank(comm_2d,coords,&other_rank);
printf("Distributing the partial vector B to Diagonal Process %d from Process %d\n",other_rank,rank);
MPI_Send(local_b,nlocal,MPI_INT,other_rank,1,comm_2d);
}

if(mycoords[ROW] == mycoords[COL] && mycoords[ROW] != last_column_coords)
{
coords[ROW]=mycoords[ROW];
coords[COL]= last_column_coords;
MPI_Cart_rank(comm_2d,coords,&other_rank);
printf("Receiving the partial vector B at Diagonal Process at %d from Process %d\n",rank,other_rank);
MPI_Recv(local_b,nlocal,MPI_INT,other_rank,1,comm_2d,&status);

}

/*the diagonal processors perform a column wise broadcast*/
coords[0] = mycoords[COL];
MPI_Cart_rank(comm_col,coords,&other_rank);
MPI_Bcast(local_b,nlocal,MPI_INT,other_rank,comm_col);

/*Main Computation Loop*/
for(i=0;i
{
px[i]=0;
for(j=0;j
px[i] += local_a[i*nlocal+j]*local_b[j];
}

/*perform the sum reduction along the rows to add up the partial dot-products and store the result in the last column of processes*/
coords[0] = last_column_coords;
MPI_Cart_rank(comm_row,coords,&other_rank);
MPI_Reduce(px,product_matrix,nlocal,MPI_INT,MPI_SUM,other_rank,comm_row);

if(mycoords[COL] == last_column_coords)
{
printf("Product Vector at process %d\n",rank);

for(i=0;i
{
printf("%d\n",product_matrix[i]);
}
}
MPI_Comm_free(&comm_2d);
MPI_Comm_free(&comm_row);
MPI_Comm_free(&comm_col);

free(px);
free(local_a);
free(local_b);
free(product_matrix);

MPI_Finalize();
}//main

Parallel source code in C: Prim's Minimum Spanning Tree

The program implements Prim's Minimum Spanning Tree using p=3 processors and n=6 vertices. The weighted Adjacency Matrix is stored in the form of a 3D array.The parameter source stores the vertex from which we want to compute the MST. The parameter d points to a vector that will store the minimum length of the spanning tree from source
The computation works as follows:
Each process finds the locally stored vertex in V0 that has the smallest distance from the source. Next the vertex that has the smallest distance over all processes is determined at Process 0 using Reduce operation. This vertex is then broadcast from process 0 to all other processes. Last,all the processes update their distances to reflect the inclusion of the new vertex in Vc.


#include
#include
#include
#include
#include
#include
#define MAX_INT 65535

main(int argc,char *argv[])
{
int input_AL[3][6][2] = {
{{0,1},{1,0},{3,5},{MAX_INT,1},{MAX_INT,MAX_INT},{2,MAX_INT}},
{{3,MAX_INT},{5,1},{0,2},{2,0},{1,4},{MAX_INT,MAX_INT}},
{{MAX_INT,3},{MAX_INT,MAX_INT},{1,MAX_INT},{4,MAX_INT},{0,5},{5,0}}
};
int rank,n,p,nlocal;
int i,j,firstvtx,lastvtx;
int source;
int lminpair[2],gminpair[2];
int *u_vector,u;
int *d,*marker;
MPI_Status status;

MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&p);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);

n = 6; //number of vertices
source = 1;//vertex b

nlocal = n/p; //number of vertices stored locally


firstvtx = rank*nlocal;//index number of the first vertex stored locally
lastvtx = firstvtx + nlocal - 1;//index of the last vertex stored locally

printf("Input Adjacency List at process %d\n",rank);
for(i=0;i<6;i++)
{
for(j=0;j<2;j++)
{
printf("%d\t", input_AL[rank][i][j]);
}
printf("\n");
}

printf("\n\n");

/*Set the initial distances from source to all other vertices*/
d = (int *)malloc(nlocal *(sizeof(int)));
for(j=0;j d[j]=input_AL[rank][source][j];

printf("Step 1 :: Minimum Spanning Tree :: Vector d at Process %d\n",rank);
for(j=0;j printf("%d\t",d[j]);
printf("\n\n");

/*This array is used to indicate if the shortest path to a vertex has been found or not.If marker[v] is zero, then the shortest path to v has been found,otherwise it has not*/
marker = (int *)malloc(nlocal *(sizeof(int)));
for(j=0;j marker[j]=1;

/*the process that stores the source vertex,marks it as being seen*/
if(source >= firstvtx && source <= lastvtx)
{
marker[source-firstvtx] = 0;
}
for(i=1;i {
/*Find the local vertex that is at the smallest distance from source*/
lminpair[0] = MAX_INT;
lminpair[1] = -1;
for(j=0;j {
if(marker[j]&&d[j] {
lminpair[0] = d[j];
lminpair[1]=firstvtx+j;
}
}

u_vector =(int *)malloc(2*sizeof(int));
/*Compute the global minimum vertex at Process 0 and insert in into Vc*/
MPI_Reduce(lminpair,gminpair,1,MPI_2INT,MPI_MINLOC,0,MPI_COMM_WORLD);

if(rank == 0)
{
u_vector[0]=gminpair[1];
u=gminpair[1];
printf("Step %d :: GLOBAL MINIMUM VERTEX %d at Process %d\n",i+1,gminpair[1],rank);
}
/*Broadcast global minimum vertex from Process 0 to all other processes*/
MPI_Bcast(u_vector,2,MPI_INT,0,MPI_COMM_WORLD);

u=u_vector[0];
printf("Vertex received at Process %d after Broadcast %d\n",rank,u);

/*the process that stores the minimum vertex,marks it as being seen*/
if(u==lminpair[1])
marker[u-firstvtx] = 0;

/*Update the distances given that u got inserted into Vc*/
for(j=0;j {
if(marker[j] && input_AL[rank][u][j] < d[j])
{
d[j] = input_AL[rank][u][j];
}
}

printf("Step %d :: Minimum Spanning Tree :: Vector d at Process %d\n",i+1,rank);
for(j=0;j printf("%d\t",d[j]);
printf("\n\n\n");
}//for(i=1;i
printf("---------------------------------------------------------\n");
printf("Minimum Spanning Tree :: Final Vector d at Process %d\n",rank);
for(j=0;j printf("%d\t",d[j]);
printf("\n---------------------------------------------------------\n\n\n");

MPI_Finalize();
}//main

PARALLEL BITONIC SORT IMPLEMENTED ON HYPERCUBE SOURCE CODE in C

The program implements Bitonic sort on a hypercube using 8 processors and input size array of 64.The unsorted array is initially statically assigned to each processor. Each of the processors sorts the local array in increasing order using built-in quick sort api of mpi-interface. Once the processors complete local sorting, they communicate and exchange data in a certain fashion to merge the local subsequences using compare-split operations instead of compare-exchange operations. The first (d^2 - d)/2=3 steps1 are for obtaining a full bitonic sequence and the last (d=3) steps are for sorting this Bitonic sequence.

#include
#include
#include
#include
#include
#include

int IncOrder(const void *e1,const void *e2)
{
return (*((int *)e1) - *((int *)e2));
}

CompareSplit(int nlocal,int elements[][8],int rank,int *relements,int *wspace,int keepsmall)
{
int i,j,k;
for(i=0;i wspace[i]=elements[rank][i];

if(keepsmall)
{
i=0;
j=0;
for(k=0;k {
if(j == nlocal || (i < nlocal && (wspace[i] < relements[j])) )
elements[rank][k] = wspace[i++];
else
elements[rank][k] = relements[j++];
}
}
else
{
i=nlocal-1;
j=nlocal-1;
for(k=nlocal-1;k>=0;k--)
{
if((i >=0 && (wspace[i] >= relements[j])) )
elements[rank][k] = wspace[i--];
else
elements[rank][k] = relements[j--];
}
}
}

main(int argc,char *argv[])
{
int i,j,nlocal,k;
int p,rank,n,partner,temp;
int *relements,*wspace;
int msb_i1,lsb_j,test_msb,test_lsb,keepsmall=0;
int elements[8][8]={
{55,26,51,60,43,64,54,33},
{22,7,13,18,2,17,1,14},
{56,49,29,39,37,35,53,48},
{20,6,10,24,15,9,21,3},
{28,44,63,34,62,38,58,40},
{16,19,23,4,11,12,5,8},
{36,50,41,47,57,25,32,31},
{45,42,30,46,61,27,52,59} };
MPI_Status status;

MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&p);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);

n = 64;

nlocal = n/p;

relements = (int *)malloc(nlocal*sizeof(int));
wspace = (int *)malloc(nlocal*sizeof(int));

/*Locally sort array of block of n/p elements at each process using built-in sort*/
qsort(elements[rank],nlocal,sizeof(int),IncOrder);

printf("Locally sorted array at Process %d\n",rank);
for(i=0;i printf("%d\t",elements[rank][i]);
printf("\n\n");

for(i=0;i<3;i++)
{
for(j=i;j>=0;j--)
{
temp = pow(2,j);
/*compute the Processor with which Communication will take place*/
partner = (rank ^ temp);

/*Extract the (i+1)th bit of Process-id (rank in our case)*/
test_msb = rank>>(i+1);
msb_i1=test_msb&1;

/*Extract the jth bit of Process-id (rank in our case)*/
test_lsb = rank>>j;
lsb_j=test_lsb&1;

if(msb_i1==lsb_j)
keepsmall = 1;
else
keepsmall = 0;

MPI_Send(elements[rank],nlocal,MPI_INT,partner,1,MPI_COMM_WORLD);
MPI_Recv(relements,nlocal,MPI_INT,partner,1,MPI_COMM_WORLD,&status);
CompareSplit(nlocal,elements,rank,relements,wspace,keepsmall);

}

}

printf("--------------Final Sorted array at Process %d--------------\n",rank);
for(i=0;i printf("%d\t",elements[rank][i]);
printf("\n");
printf("------------------------------------------------------------\n\n");

MPI_Finalize();
}//main