/* Code to sort on a parallel machine. NOTE: This is a specialised version adapted for both internal and external sorting. A more general internal-only sorting routine can be found at ftp://nimbus.anu.edu.au/pub/tridge/sorting/par_sort/ Written by Andrew Tridgell July 1995 based on earlier work by Andrew Tridgell and Richard Brent. This program is Copyright Andrew Tridgell 1995. There is no warranty with this product. last updated November 1995 contact: Andrew Tridgell Department of Computer Science Australian National University GPO BOX 4, Canberra City, A.C.T, 2601, Australia Tel: (06) 249-5691 Fax: (06) 249-0010 EMail: Andrew.Tridgell@anu.edu.au */ int debug_level = 1; int internal_timing = 1; #define DEBUG(l,x) ((l)<=debug_level?printf x:0) static double io_time = 0.0; static double other_time = 0.0; static double local_sort_time = 0.0; static double internal_sort_time = 0.0; static double primary_sort_time = 0.0; static double cleanup_sort_time = 0.0; static double simple_merge_sort_time = 0.0; /* which column algorithm ? */ #define SHEAR 1 #define BRSHEAR 0 #define ABRSHEAR 0 #define BDM 0 #define SWAPBR 0 #if GNU_SORT #define sort gnu_qsort #elif RADIX /* should I use a radix sort for the local sort? NOTE: This won't work for a general data type. It is only recommended for types such as 64 bit integers */ #define sort radix_sort #else #define sort qsort #endif #ifndef SIMPLE #define SIMPLE 0 #endif /* shall I use a hyper pre-sort ? */ #ifndef HYPER_SORT #define HYPER_SORT 1 #endif /* shall I try fast find techniques in the merge ? */ #ifndef FAST_MERGE #define FAST_MERGE 1 #endif /* shall I try to use an unbalanced merge ? */ #ifndef UNBALANCED_MERGE #define UNBALANCED_MERGE 1 #endif /* shall I assume integer data only ? */ #ifndef INTEGER #define INTEGER 0 #endif /* shall I assume element data only ? */ #ifndef FIXED_ELEMENT #define FIXED_ELEMENT 0 #endif #if FIXED_ELEMENT #include "element.h" #endif /* this file defines the following function */ int parallel_sort(int fd,int el_size,int (*comparison)()); #include #include #include #include "mimd.h" #include "sort_util.h" void *memalign(int alignment,size_t size); #define WRAP(p) ((p)%NUM_NODES) /* these macros provide all the element manipulation. Special purpose ones are used for integer sorts */ #if INTEGER int iii; #define COMPARE(p1,p2) (*((int *)(p1)) - *((int *)(p2))) #define ELEMENT(ptr,i) ((void *)((int *)(ptr) + (i))) #define COPYEL(p1,p2) (*((int *)p1) = *((int *)p2)) #define COPYEL_INC(p1,p2) \ (COPYEL(p1,p2),p1=ELEMENT(p1,1),p2=ELEMENT(p2,1)) #define COPYELS(p1,p2,n) memmove(p1,p2,(n)*EL_SIZE) #define EL_SIZE sizeof(int) #elif FIXED_ELEMENT static inline int comp(element *i1,element *i2) { int i=0,d; do { if ((d=(i1->i[i] - i2->i[i]))) return(d); } while (++i0; p/=2) { q = initq; r = 0; d = p; do { for (x=0; x= 8) { DOONE(d0); DOONE(d1); DOONE(d2); DOONE(d3); DOONE(d4); DOONE(d5); DOONE(d6); DOONE(d7); d[0] = d0;d[1] = d1;d[2] = d2;d[3] = d3; d[4] = d4;d[5] = d5;d[6] = d6;d[7] = d7; d+=8;N-=8; } while (N--) DOONE(*d++); /* simple_merge_sort_time += MIMD_Uptime() - tt; */ return(ss1-(int *)src1); #undef DOONE } #elif FIXED_ELEMENT #define DOONE (COMPARE(ss1,ss2)<0? (*d++ = *ss1++) : (*d++ = *ss2++)) { register int N = n; element *ss1=src1,*ss2=src2,*d=dest; while (N >= 4) { DOONE; DOONE; DOONE; DOONE; N -= 4; } while (N--) DOONE; /* simple_merge_sort_time += MIMD_Uptime() - tt; */ return(ss1-(element *)src1); } #else { int N = n; int elsize = EL_SIZE; int ii; int (*cmp)() = PSI.comparison; if (elsize%sizeof(double)==0 && ((int)src1)%sizeof(double)==0 && ((int)src2)%sizeof(double)==0 && ((int)dest)%sizeof(double)==0) { double *ss1=src1,*ss2=src2,*d=dest; elsize /= sizeof(double); switch (elsize) { case 1: while (N--) *d++ = (cmp(ss1,ss2)<0?*ss1++:*ss2++); break; case 2: { while (N--) { if (cmp(ss1,ss2)<0) { d[0] = ss1[0];d[1] = ss1[1]; ss1+=2; } else { d[0] = ss2[0];d[1] = ss2[1]; ss2+=2; } d+=2; } } break; default: while (N--) if (cmp(ss1,ss2)<0) {ii=elsize;while (ii--) *d++ = *ss1++;} else {ii=elsize;while (ii--) *d++ = *ss2++;} break; } /* simple_merge_sort_time += MIMD_Uptime() - tt; */ return(((char *)ss1 - (char *)src1)/EL_SIZE); } if (elsize==sizeof(float) && ((int)src1)%sizeof(float)==0 && ((int)src2)%sizeof(float)==0 && ((int)dest)%sizeof(float)==0) { float *ss1=src1,*ss2=src2,*d=dest; elsize /= sizeof(float); while (N--) *d++ = (cmp(ss1,ss2)<0?*ss1++:*ss2++); /* simple_merge_sort_time += MIMD_Uptime() - tt; */ return(((char *)ss1 - (char *)src1)/EL_SIZE); } { char *ss1=src1,*ss2=src2,*d=dest; while (N--) { if (cmp(ss1,ss2)<0) { memmove(d,ss1,elsize); d+=elsize; ss1+=elsize; } else { memmove(d,ss2,elsize); d+=elsize; ss2+=elsize; } } /* simple_merge_sort_time += MIMD_Uptime() - tt; */ return(((char *)ss1 - (char *)src1)/EL_SIZE); } } #endif } /******************************************************************* shift some elements - may overlap! ********************************************************************/ static void ShiftEls(void *dest,void *src,int N) { memmove(PTR(dest,0),PTR(src,0),N*EL_SIZE); } /******************************************************************* unbalanced merge of two lists - best when L1 or L2 is small ********************************************************************/ static void UnbalancedMerge(void *data,int L1,int L2,int gap) { int *slots; int N = L1 + L2; int smallL = MIN(L1,L2); int bigL = MAX(L1,L2); void *smallP,*bigP; slots = memalign(EL_SIZE,sizeof(int)*smallL); if (L1 < L2) { smallP = data; bigP = PTR(data,L1+gap); } else { bigP = data; smallP = PTR(data,L1+gap); } /* create the slots list - this says where each element in smallList will fit into the big list */ { int start=0; int i; for (i=0;i=0) { int todo = bigL-slots[slot]-num_taken; ShiftEls(PTR(data,N-num_done-todo), PTR(bigP,bigL-num_taken-todo), todo); num_done += todo; num_taken += todo; COPYEL(PTR(data,N-num_done-1), PTR(TempBuf,smallL-(num_done-num_taken)-1)); num_done++; slot--; } ShiftEls(data,bigP,N-num_done); free(TempBuf); } free(slots); } /******************************************************************* merge 2 lists into one with minimum memory overhead ********************************************************************/ static void insitu_merge(void *data,int L1,int L2,int gap) { int N = L1 + L2; int B; int bListLen1,bListLen2,bListLen3; int NumTemp=3; int TailSize; BOOL TailDone; typedef struct { int start,length; } bListStruct; bListStruct *bList1,*bList2,*bList3; void *TailBuf; void *TempBuf; double sqrt(double ); #define COPYP(p1,p2) COPYEL(p1,p2) #define COPYP_INC(p1,p2) COPYEL_INC(p1,p2) #define COPYPN(p1,p2,n) COPYELS(p1,p2,n) #define LESSTHANP(p1,p2) (COMPARE(p1,p2)<0) if (L2==0) return; if (L1==0 && gap>0) { ShiftEls(data,PTR(data,gap),L2); return; } B = MIN(N,(int)sqrt((double)N)*4); /* B = MIN(B,(128*1024/EL_SIZE)/4); */ #if UNBALANCED_MERGE if (L1 <= B || L2 <= B) { UnbalancedMerge(data,L1,L2,gap); return; } #endif /* INITIALISATION: allocate memory */ TempBuf = el_vector(NumTemp*B); bList1 = (bListStruct *)memalign(EL_SIZE,(L1/B + 2)*sizeof(bListStruct)); bList2 = (bListStruct *)memalign(EL_SIZE,(L2/B + 2)*sizeof(bListStruct)); bList3 = (bListStruct *)memalign(EL_SIZE,(L1/B + L2/B + 5)*sizeof(bListStruct)); TailSize = L1 % B; TailBuf = el_vector(TailSize); TailDone = (TailSize == 0); /* STAGE 1: Create the 2 source block lists and initialise the 3rd */ { int L,S,pos; bListStruct *list; list=bList1; L=L1 - TailSize; S=0; pos=0; while (L) { list[pos].start = S + pos * B; list[pos].length = MIN(B,L); L -= list[pos].length; pos++; } bListLen1=pos; /* copy the tail from list 1 */ COPYPN(TailBuf,PTR(data,bListLen1*B),TailSize); list=bList2; L=L2; S=L1+gap; pos=0; while (L) { list[pos].start = S + pos * B; list[pos].length = MIN(B,L); L -= list[pos].length; pos++; } bListLen2=pos; bListLen3=0; } /* STAGE 2: merge the two lists into list3 */ { void *pL1,*pL2,*pL3; int avail1,avail2; int space3; int sPos1,sPos2; int availPos3,dPos3; BOOL empty1,empty2; /* select the first blocks in list 1 and 2 */ if (bListLen1 > 0) { pL1 = PTR(data,bList1[0].start); avail1 = bList1[0].length; sPos1 = 0; empty1 = False; } else { pL1 = TailBuf; avail1 = TailSize; TailDone = True; sPos1 = 0; empty1 = False; } pL2 = PTR(data,bList2[0].start); avail2 = bList2[0].length; sPos2 = 0; empty2 = False; /* initially select the TempBuf */ bList3[0].start = -1; bList3[0].length = NumTemp*B; availPos3 = 1; pL3 = TempBuf; space3 = NumTemp*B; dPos3 = 0; while (!empty1 || !empty2) { int MinToDo = space3; if (!empty1) MinToDo = MIN(MinToDo,avail1); if (!empty2) MinToDo = MIN(MinToDo,avail2); space3 -= MinToDo; if (empty2 || empty1) { if (empty1) { COPYPN(pL3,pL2,MinToDo); pL2 = PTR(pL2,MinToDo); avail2 -= MinToDo; } else { COPYPN(pL3,pL1,MinToDo); pL1 = PTR(pL1,MinToDo); avail1 -= MinToDo; } pL3 = PTR(pL3,MinToDo); } else { int FrompL1 = SimpleMerge(pL3,pL1,pL2,MinToDo); avail1 -= FrompL1; avail2 -= (MinToDo-FrompL1); pL3 = PTR(pL3,MinToDo); pL1 = PTR(pL1,FrompL1); pL2 = PTR(pL2,MinToDo-FrompL1); } /* have I emptied list 1 ? */ if (avail1==0 && !empty1) { /* make the just emptied block available */ if (TailSize == 0 || !TailDone) { /* int k = dPos3+1; */ int k = availPos3; memmove(&bList3[k+1],&bList3[k],(availPos3-k)*sizeof(bList3[k])); bList3[k].start = bList1[sPos1].start; bList3[k].length = bList1[sPos1].length; availPos3++; sPos1++; } /* are there no blocks left ? */ if (sPos1 == bListLen1) { if (!TailDone) { pL1 = TailBuf; avail1 = TailSize; TailDone = True; } else empty1 = True; } else { /* start using the next block */ pL1 = PTR(data,bList1[sPos1].start); avail1 = bList1[sPos1].length; } } /* have I emptied list 2 ? */ if (avail2==0 && !empty2) { /* int k = dPos3+1; */ int k = availPos3; memmove(&bList3[k+1],&bList3[k],(availPos3 - k)*sizeof(bList3[k])); /* make the just emptied block available */ bList3[k].start = bList2[sPos2].start - (TailSize + gap); bList3[k].length = bList2[sPos2].length; if (sPos2 == (bListLen2-1)) bList3[k].length += TailSize; if (bList3[k].length > B) { bList3[availPos3+1].length = bList3[k].length - B; bList3[availPos3+1].start = bList3[k].start + B; bList3[k].length = B; availPos3++; } /* don't make odd sized blocks available */ if (bList3[availPos3].length == B) availPos3++; sPos2++; /* are there no blocks left ? */ if (sPos2 == bListLen2) empty2 = True; else { /* start using the next block */ pL2 = PTR(data,bList2[sPos2].start); avail2 = bList2[sPos2].length; } } /* have I filled list 3 ? */ if (space3 == 0) { /* start using the next block */ dPos3++; if (dPos3 == availPos3) printf("\n%d merge error - dPos3==availPos3! %d\n",NODE_NUMBER,dPos3); pL3 = PTR(data,bList3[dPos3].start); space3 = bList3[dPos3].length; } } /* we may not have filled list 3 completely */ bList3[dPos3].length -= space3; /* move the last elements to the last block in list 2 */ COPYPN(PTR(data,N-bList3[dPos3].length), PTR(pL3,-bList3[dPos3].length), bList3[dPos3].length); bList3[dPos3].start = N - bList3[dPos3].length; bListLen3 = dPos3; } /* STAGE 3: re-arrange to give final result */ { typedef struct { int where_hiding; int what_now; BOOL occupied; } block_struct; block_struct *blocks; int num_blocks = N / B; int i; BOOL all_done = False; blocks = (block_struct *)memalign(EL_SIZE,num_blocks * sizeof(block_struct)); for (i=0;i= 0) { COPYPN(PTR(data,i*B),PTR(data,from*B),B); blocks[from].occupied = False; } else COPYPN(PTR(data,i*B),PTR(TempBuf,-(from+1)*B),B); blocks[i].occupied = True; blocks[i].what_now = i; blocks[i].where_hiding = i; if (from >= 0) i = from; } } } for (i=0;i 0) max = proposed - 1; else min = proposed; } l1_kept = min; l2_kept = L2 - (L1 - l1_kept); } memswp(p2,PTR(p1,l1_kept),(l1 - l1_kept)*EL_SIZE); insitu_merge(p1,l1_kept,l2-l2_kept,0); insitu_merge(p2,l1-l1_kept,l2_kept,0); t2 = MIMD_Uptime(); DEBUG(3,("local_merge_exchange l1=%d l2=%d kept=%d took %g secs\n", l1,l2,l1_kept,t2-t1)); } /******************************************************************* a 2 node version of batcher_makesmaller ********************************************************************/ static void batcher_makesmaller(int n1,int n2) { double t1,t2; void *data=NULL; int local_size=0,other_size=0,other_node=0; int is_low_node=0; int swap_size; n1 = WRAP(n1); n2 = WRAP(n2); if (NODE_NUMBER != n1 && NODE_NUMBER != n2) return; t1 = MIMD_Uptime(); if (NODE_NUMBER == n1) { other_node = n2; is_low_node = 1; } else { other_node = n1; is_low_node = 0; } data = PSI.data; other_size = local_size = PSI.N; VnodeSwap(other_node,&other_size,sizeof(other_size),sizeof(other_size)); if (local_size + other_size <= 1) return; if (local_size == 0 || other_size == 0) return; /* this implements the find_exact algorithm */ if (!is_low_node) { int proposed=1; int N = other_size; while (proposed>0) { VnodeRecv(other_node,&proposed,sizeof(int),0); if (proposed > 0) VnodeSend(other_node,PTR(data,N-proposed),EL_SIZE,1); } swap_size = other_size + proposed; } else { int L1 = local_size,L2 = other_size; int min=MAX(L1-L2,0),max=L1,proposed=-1; void *TempEl; TempEl = memalign(EL_SIZE,EL_SIZE); while (min < max) { if (proposed==-1) proposed = max; else proposed = (min+max)/2 + 1; VnodeSend(other_node,&proposed,sizeof(int),0); VnodeRecv(other_node,TempEl,EL_SIZE,0); if (COMPARE(PTR(data,proposed-1),TempEl) > 0) max = proposed - 1; else min = proposed; } free(TempEl); swap_size = local_size - min; proposed = -min; VnodeSend(other_node,&proposed,sizeof(int),0); } if (swap_size) { if (is_low_node) VnodeSwap(other_node,PTR(data,local_size-swap_size), swap_size*EL_SIZE,swap_size*EL_SIZE); else VnodeSwap(other_node,data, swap_size*EL_SIZE,swap_size*EL_SIZE); if (is_low_node) insitu_merge(data,local_size-swap_size,swap_size,0); else insitu_merge(data,swap_size,local_size-swap_size,0); } t2 = MIMD_Uptime(); DEBUG(3,("(%d/%d) swap-size with node %d is %d (took %g secs) (%d ops)\n", n1,n2, other_node,swap_size,t2-t1,0)); } /********************************************************************** sort some local data **********************************************************************/ static void local_sort(void *data,int N) { if (PSI.local_done) return; if (N > 1) { double tt = MIMD_Uptime(); sort(data,N,EL_SIZE,(QSORT_CAST)PSI.comparison); local_sort_time += MIMD_Uptime() - tt; } } int internal_sort(void *data,int baseP,int P,int N,int el_size,int (*comparison)()) { double t1,t2; PSI.comparison = comparison; PSI.el_size = el_size; PSI.data = data; PSI.N = N; /* sort the local node */ t1 = MIMD_Uptime(); local_sort(data,N); t2 = MIMD_Uptime(); DEBUG(2,("local sort took %g secs\n",t2-t1)); fflush(stdout); #if HYPER_SORT t1 = MIMD_Uptime(); hyper_sort(baseP,P,batcher_makesmaller); t2 = MIMD_Uptime(); primary_sort_time += t2 - t1; DEBUG(3,("hyper sort took %g secs\n",t2-t1)); #endif t1 = MIMD_Uptime(); batchers_sort(baseP,P,batcher_makesmaller); t2 = MIMD_Uptime(); cleanup_sort_time += t2 - t1; DEBUG(2,("batchers sort took %g secs\n",t2-t1)); t1 = t2; fflush(stdout); t2 = MIMD_Uptime(); DEBUG(2,("waiting took %g secs\n",t2-t1)); return(0); } struct slice { int slice; int start; int N; int row,col; int p; void *smallest; void *largest; }; int maxC=0; void *load_slice(int fd,struct slice *slice,int el_size) { double t; void *data = NULL; extern void *data_ptr; if (!slice->N) return(NULL); if (data_ptr) { void *d = (void *)realloc(data_ptr,slice->N*el_size + EL_SIZE); data = d; if (((int)data)%EL_SIZE) { data = ((char *)data + (EL_SIZE - (((int)data)%EL_SIZE))); } if (!data) { free(data_ptr); data_ptr = NULL; } else { data_ptr = d; } } if (!data) { data_ptr = data = (void *)memalign(EL_SIZE,slice->N*el_size); } if (!data) { printf("Failed to malloc for slice\n"); exit(0); } lseek(fd,slice->start*el_size,SEEK_SET); t = MIMD_Uptime(); read(fd,data,slice->N*el_size); io_time += MIMD_Uptime() - t; { extern int worst_case; if (worst_case && !PSI.local_done) { char *p = data; int N = slice->N; while (N--) { *p = col_num(slice->col,slice->row,maxC,0); p += el_size; } } } return(data); } void save_slice(int fd,char *data,struct slice *slice,int el_size) { double t; if (!slice->N) return; memcpy(slice->smallest,data,el_size); memcpy(slice->largest,data+(slice->N-1)*el_size,el_size); lseek(fd,slice->start*el_size,SEEK_SET); t = MIMD_Uptime(); write(fd,data,slice->N*el_size); io_time += MIMD_Uptime() - t; } void sort_slice(int fd,int baseP,int P,struct slice *slices,int s, int el_size,int (*comparison)()) { double tt; void *data; DEBUG(2,("Sorting slice %d\n",s)); data = load_slice(fd,&slices[s],el_size); if (internal_timing) MIMD_Barrier(); tt = MIMD_Uptime(); internal_sort(data,baseP,P,slices[s].N,el_size,comparison); if (internal_timing) MIMD_Barrier(); internal_sort_time += MIMD_Uptime() - tt; save_slice(fd,data,&slices[s],el_size); } int slices_by_smallest(struct slice *slice1,struct slice *slice2) { int ret; if (!slice1->N || !slice2->N) return(slice1->slice - slice2->slice); ret = PSI.comparison(slice1->smallest,slice2->smallest); return(ret); } int slices_by_largest(struct slice *slice1,struct slice *slice2) { int ret; if (!slice1->N || !slice2->N) return(slice1->slice - slice2->slice); ret = PSI.comparison(slice1->largest,slice2->largest); return(ret); } int slices_by_slice(struct slice *slice1,struct slice *slice2) { return(slice1->slice - slice2->slice); } int collapse_slices(struct slice *slices,int S, int el_size,int (*comparison)()) { double tt; int n=1; int s,low=0,high=S-1; int done = 0; static int pass = 0; tt = MIMD_Uptime(); while (n=0;high--) if (slices[high].N) break; cum_smallest[high] = slices[high].smallest; for (s=high-1;s>=low;s--) { if (comparison(slices[s].smallest,cum_smallest[s+1])>0) cum_smallest[s] = cum_smallest[s+1]; else cum_smallest[s] = slices[s].smallest; } cum_largest[low] = slices[low].largest; for (s=low+1;s<=high;s++) { if (comparison(slices[s].largest,cum_largest[s-1])<0) cum_largest[s] = cum_largest[s-1]; else cum_largest[s] = slices[s].largest; } for (s=low;s<=high;s++) { if (slices[s].N==0) continue; if ((s==low || comparison(slices[s].smallest,cum_largest[s-1])>=0) && (s==high || comparison(slices[s].largest,cum_smallest[s+1])<=0)) { slices[s].N = 0; done++; } } printf("pass %d done %d of %d slices\n", pass, done, S); pass++; done = 0; for (low=0;low%d ",low,high-1); MIMD_Broad(&low,sizeof(low)); MIMD_Broad(&high,sizeof(high)); low = high-1; } printf("\n"); low = high = -1; MIMD_Broad(&low,sizeof(low)); MIMD_Broad(&high,sizeof(high)); free(cum_smallest); free(cum_largest); } else { done = 0; do { MIMD_Broad_Recv(0,&low,sizeof(low)); MIMD_Broad_Recv(0,&high,sizeof(high)); if (low != high) { done += high - low; for (s=low;s= 0) sort_slice(fd,baseP,countP,slices,my_slice,el_size,comparison); } } void sort_rows(int fd,struct slice *slices,int S,int R,int i, int el_size,int (*comparison)()) { int countP,my_slice,baseP; int P = 0; int r,s; for (r=0;r= 0) sort_slice(fd,baseP,countP,slices,my_slice,el_size,comparison); } } int sort_slices(int fd,struct slice *slices,int S,int R,int C, int el_size,int (*comparison)()) { int s,P=0; int iteration=0; DEBUG(1,("Starting loop at %g\n",MIMD_Uptime())); while (1) { sort_columns(fd,slices,S,C,iteration,el_size,comparison); PSI.local_done = True; if (collapse_slices(slices,S,el_size,comparison) == 0) break; sort_rows(fd,slices,S,R,iteration,el_size,comparison); DEBUG(1,("done loop at %g\n",MIMD_Uptime())); if (collapse_slices(slices,S,el_size,comparison) == 0) break; iteration++; } return(0); } int parallel_sort(int fd,int el_size,int (*comparison)()) { double t; extern int mem_limit; int N = lseek(fd,0,SEEK_END) / el_size; int P,k,p,L,n,m,M,S,R,C,s; struct slice *slices = NULL; io_time = 0.0; local_sort_time = 0.0; simple_merge_sort_time = 0.0; internal_sort_time = 0.0; primary_sort_time = 0.0; cleanup_sort_time = 0.0; t = MIMD_Uptime(); /* number of elements that can fit in a cell */ m = (mem_limit/el_size); PSI.local_done = False; PSI.el_size = el_size; P = NUM_NODES; /* total memory */ M = P * m; /* initial guess for k */ k = (N+(M-1))/M; /* ceil(N/M) */ if (k<=1) { R = k = 1; C = S = p = P; n = (N+(P-1))/P; } else { /* now try to find suitable values for the other numbers */ do { R = C = k; p = P/k; L = (N+(k*k-1))/(k*k); /* ceil(N/k^2) */ n = (L+(p-1))/p; /* ceil(L/p) */ if (n>m) k++; } while (n > m); } P = k*p; L = p*n; S = p*k*k; printf("S=%d n=%d k=%d L=%d P=%d p=%d R=%d C=%d\n", S,n,k,L,P,p,R,C); fflush(stdout); maxC = C; slices = (struct slice *)memalign(EL_SIZE,sizeof(*slices)*S); bzero(slices,sizeof(*slices)*S); for (s=0;s 0) { printf("Sort failed\n"); } other_time = MIMD_Uptime() - t; MIMD_Barrier(); other_time -= internal_sort_time + io_time; MIMD_Sum_Double(io_time,&io_time); MIMD_Sum_Double(local_sort_time,&local_sort_time); MIMD_Sum_Double(internal_sort_time,&internal_sort_time); MIMD_Sum_Double(primary_sort_time,&primary_sort_time); MIMD_Sum_Double(cleanup_sort_time,&cleanup_sort_time); MIMD_Sum_Double(other_time,&other_time); MIMD_Sum_Double(simple_merge_sort_time,&simple_merge_sort_time); if (NODE_NUMBER == 0) { DEBUG(1,("io_time was %g\n",io_time/NUM_NODES)); DEBUG(1,("local_sort_time was %g\n",local_sort_time/NUM_NODES)); DEBUG(1,("internal_sort_time was %g\n",internal_sort_time/NUM_NODES)); DEBUG(1,("primary_sort_time was %g\n",primary_sort_time/NUM_NODES)); DEBUG(1,("cleanup_sort_time was %g\n",cleanup_sort_time/NUM_NODES)); DEBUG(1,("simple_merge_sort_time was %g\n",simple_merge_sort_time/NUM_NODES)); DEBUG(1,("other_time was %g\n",other_time/NUM_NODES)); } return(0); }