#include <stdio.h>
#include <stdlib.h>

#include <pthread.h>
#include <mpi.h>

/* Number of worker threads started by each MPI process. */
#define NUM_THREADS  2
/* Number of elements in a[]. Parenthesized so the macro expands as a single
 * value inside larger expressions (e.g. x / SIZE, x % SIZE). */
#define SIZE         (2048*8)
/* Number of index ranges (jobs) a[] is split into. */
#define N_JOBS       8
/* Per-process input data that the worker threads sum up. */
int a[SIZE];

/*
 * Fixed-capacity job queue. Job k covers the half-open index range
 * [from[k], to[k]) of a[]. Jobs are claimed from the highest index downward:
 * a worker decrements njobs and takes the entry at the new njobs.
 */
typedef struct {
  // first index of each job's range (inclusive)
  int from[N_JOBS];
  // end index of each job's range (exclusive)
  int to[N_JOBS];
  // number of jobs that have not been claimed yet
  int njobs;
} queue;

// Shared job queue; worker threads hold mutex_q while reading or updating it.
queue q;
pthread_mutex_t mutex_q;

// Per-process total of a[]; worker threads add their partial sums to it
// while holding mutex_sum.
int final_sum = 0;
pthread_mutex_t mutex_sum;

// Destination of the MPI_Reduce over every process's final_sum;
// printed by the process with rank 0.
int very_final_sum;

/*
 * Thread start routine: self-schedules jobs from the shared queue q until it
 * is empty. Each job is a half-open index range [from, to) of a[]; the range
 * is summed into a thread-local accumulator and then added to final_sum.
 *
 * threadid: pointer passed by the thread creator; not used by the computation.
 * Returns NULL once no jobs are left.
 */
void *partial_sum(void *threadid)
{
  (void) threadid;

  while (1) {
    int from = 0;
    int to = 0;
    int have_job = 0;
    int local_sum = 0;
    int i;

    // access the queue to self-schedule a job
    pthread_mutex_lock(&mutex_q);
    if (q.njobs > 0) {   // "> 0" so a corrupt negative count cannot index out of bounds
      q.njobs--;
      from = q.from[q.njobs];
      to = q.to[q.njobs];
      have_job = 1;
    }
    pthread_mutex_unlock(&mutex_q);

    if (!have_job)
      break;

    // sum outside of any lock so the threads work in parallel
    for (i = from; i < to; i++)
      local_sum += a[i];

    // only the short update of the shared total is serialized
    pthread_mutex_lock(&mutex_sum);
    final_sum += local_sum;
    pthread_mutex_unlock(&mutex_sum);
  }

  // returning from a start routine terminates the thread like pthread_exit()
  return NULL;
}


int main (int argc, char *argv[])
{
   pthread_t threads[NUM_THREADS];
   int myid[NUM_THREADS];
   int rc;
   int t;
   int i, delta;
   int numprocs, mpi_id, namelen;
   char processor_name[MPI_MAX_PROCESSOR_NAME];



   MPI_Init(&argc,&argv);
   MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD,&mpi_id);
   MPI_Get_processor_name(processor_name,&namelen);

   printf("Process %d of %d on %s\n", mpi_id, numprocs, processor_name);
   fflush(stdout);


   // Initalize a[]
   for (i=0; i<SIZE; i++)
     a[i] = mpi_id+i*2;

   // Initalize q[]
   delta = SIZE/N_JOBS;
   for (i=0; i<N_JOBS; i++) {
     q.from[i] = i * delta;
     q.to[i] = q.from[i] + delta;
   }
   q.njobs = N_JOBS;



   pthread_mutex_init(&mutex_sum, NULL);
   pthread_mutex_init(&mutex_q, NULL);


   for(t=0; t<NUM_THREADS; t++) {
     myid[t] = t;
     rc = pthread_create(&threads[t], NULL, partial_sum, (void *) &myid[t]);
     if (rc) {
       printf("ERROR; return code from pthread_create() is %d\n", rc);
       exit(0);
     }
   }

   
   for (t=0; t < NUM_THREADS; t++)
     pthread_join(threads[t], NULL);


   pthread_mutex_destroy(&mutex_sum);
   pthread_mutex_destroy(&mutex_q);
 
   printf("FINAL SUM from process %d: %d\n", mpi_id, final_sum);
   fflush(stdout);

   MPI_Reduce(&final_sum, &very_final_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

   if (mpi_id == 0) {
     printf("VERY FINAL SUM: %d\n", very_final_sum);
     fflush(stdout);
   }

   /* Last thing that main() should do */
   pthread_exit(NULL);

   MPI_Finalize();
}

