#include <stdio.h>
#include <stdlib.h>

#include <mpi.h>
#include <omp.h>

// Number of elements in a[]. The expansion is parenthesized so that SIZE
// behaves as a single value inside larger expressions (x % SIZE, n / SIZE).
#define SIZE         (2048*8)

// Data array; every process fills and sums its own copy.
int a[SIZE];

// Sum of a[] computed by the local process.
int final_sum = 0;
// Sum of final_sum over all processes; only meaningful on the reduction root.
int very_final_sum;

int main (int argc, char *argv[])
{
   int rc;
   int t;
   int i;
   int numprocs, mpi_id, namelen;
   char processor_name[MPI_MAX_PROCESSOR_NAME];



   MPI_Init(&argc,&argv);
   MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD,&mpi_id);
   MPI_Get_processor_name(processor_name,&namelen);

   printf("Process %d of %d on %s\n", mpi_id, numprocs, processor_name);
   fflush(stdout);


   // Initalize a[]
   for (i=0; i<SIZE; i++)
     a[i] = mpi_id+i*2;

   // set the number of threads with the bash shell command that sets 
   // an environment variable:
   //         export OMP_NUM_THREADS=<integer> 
   // or with the clause:
   //         num_threads(<integer>)
   // of the OpenMP directive "parallel"
   #pragma omp parallel
   {
     printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads());
     #pragma omp parallel for schedule(dynamic) shared(a) private(i) reduction(+:final_sum)
     for (i=0; i<SIZE; i++)
       final_sum += a[i];
   } 
  
   printf("FINAL SUM from process %d: %d\n", mpi_id, final_sum);
   fflush(stdout);

   MPI_Reduce(&final_sum, &very_final_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

   if (mpi_id == 0) {
     printf("VERY FINAL SUM: %d\n", very_final_sum);
     fflush(stdout);
   }

   MPI_Finalize();
}

