/***********************************************************************
ESPRIT-Forest: Parallel Clustering of Massive Amplicon Sequence Data in Subquadratic Time 
by: Yunpeng Cai, Yijun Sun, Wei Zheng, Jin Yao and Yujie Yang  (C) 2016
Please kindly cite [Y.Cai et.al PLOS Comp. Biol. 2016]

THE LICENSED WORK IS PROVIDED UNDER THE TERMS OF THE ADAPTIVE PUBLIC LICENSE ("LICENSE") AS FIRST COMPLETED BY: _Yunpeng Cai, Yijun Sun, Wei Zheng, Jin Yao, Yujie Yang_ [Insert the name of the Initial Contributor here]. ANY USE, PUBLIC DISPLAY, PUBLIC PERFORMANCE, REPRODUCTION OR DISTRIBUTION OF, OR PREPARATION OF DERIVATIVE WORKS BASED ON, THE LICENSED WORK CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS LICENSE AND ITS TERMS, WHETHER OR NOT SUCH RECIPIENT READS THE TERMS OF THE LICENSE. "LICENSED WORK" AND "RECIPIENT" ARE DEFINED IN THE LICENSE. A COPY OF THE LICENSE IS LOCATED IN THE TEXT FILE ENTITLED "LICENSE.TXT" ACCOMPANYING THE CONTENTS OF THIS FILE. IF A COPY OF THE LICENSE DOES NOT ACCOMPANY THIS FILE, A COPY OF THE LICENSE MAY ALSO BE OBTAINED AT THE FOLLOWING WEB SITE: http://www.acsu.buffalo.edu/~yijunsun/lab/ESPRIT-Forest.html [Insert Initial Contributor's Designated Web Site here]

Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific language governing rights and limitations under the License.
*/

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <set>
#include <vector>
#include <fstream>
#include <time.h>
#include <math.h>
#include <omp.h>

#include "FASTA.h"
#include "util.h"
#include "global.h"
#include "ProbModel.h"
#include "TreeClust.h"
#include "MinHeap.h"

// ---- Data defined in other translation units -------------------------------
// Raw sequence strings, indexed by sequence id (0 .. SeqNum-1). In this file
// they are handed to ProbString::FromString() and to the aligner.
extern char **SeqStrs;
// Per-sequence counts, indexed by sequence id; used here as the initial
// NumSeqs of each record and as the weight when k-mer tables are combined.
extern int *Freq;

// Number of input sequences. Ids below SeqNum denote input sequences; ids at
// or above SeqNum denote clusters created in this file.
extern int SeqNum;
// K-mer tables indexed by unit id. This file also stores entries for newly
// created clusters (KmerTabs[ClsTop], KmerTabs[mergeid]).
extern int **KmerTabs;
// Per-sequence k-mer data passed through to Kmer::KmerComp (sequence ids only).
extern int **KmerSeqs;
// External identifier written to the output streams for each sequence id.
extern int *SeqID;
// Per-sequence lengths; KmerDist uses them to pass the shorter sequence first.
extern int *SeqLens;

// Declared for completeness; not referenced in this part of the file.
extern bool verbose;

// ---- State defined in this file ---------------------------------------------
// Profile ("probabilistic") sequences of clusters, indexed by (unit id - SeqNum).
// Allocated with SeqNum entries in MPI_Cluster.
ProbString *ProbSeqs;

// Id that the next created cluster will receive; set to SeqNum by DoPreCluster.
int ClsTop;
// Counters reported in the progress messages: number of alignments (NumAl) and
// of k-mer comparisons (NumKmer). Updated under "#pragma omp atomic".
long int NumAl=0;
long int NumKmer=0;

// Distance ceiling. MPI_Cluster overwrites the initial value with
// Global::level_max+Global::level_step; units whose nearest-neighbour distance
// reaches it are flagged as outliers.
float MAX_DIST=0.15;
// Current number of clusters (starts at SeqNum, decremented on every merge).
int NumCls;

// Per-unit records; MPI_Cluster allocates 2*SeqNum of them on every rank.
ClustRec *ClRec;

// Nearest-neighbour bookkeeping per unit; allocated on rank 0 only.
SwapRec *NNRec;

// Log of performed merges (id1, id2, dist); allocated on rank 0 only.
PairRec *MergeRec;

// Message tokens placed in combuff[0] of the control broadcast sent by rank 0.
// TOK_VOID is not used in this part of the file.
#define TOK_VOID  -1
// Clustering is finished; workers leave their receive loop.
#define TOK_FINISH 0
// Merge two units (combuff[3] >= 0) or delete an outlier (combuff[3] < 0).
#define TOK_DELMERGE 1
// A batch of nearest-neighbour updates follows.
#define TOK_UPD 2
// Number of ints in the control broadcast buffer.
#define COMM_BUF_LEN 4


// K-mer based dissimilarity between two units, computed as
// 1 - Global::kmer->KmerComp(...).
//
// uid1, uid2: unit ids. Ids below SeqNum are input sequences, larger ids are
//             clusters (profiles live in ProbSeqs[id-SeqNum]).
// Returns:    1 minus the similarity reported by the matching KmerComp overload.
//
// Every call bumps the global NumKmer counter atomically.
float KmerDist(int uid1, int uid2)
{
	#pragma omp atomic
		NumKmer++;

	const bool firstIsSeq = (uid1 < SeqNum);
	const bool secondIsSeq = (uid2 < SeqNum);

	// Sequence vs. sequence: the strictly shorter one is passed first; on equal
	// lengths uid2 takes the first slot.
	if (firstIsSeq && secondIsSeq)
	{
		const int lead = (SeqLens[uid1] < SeqLens[uid2]) ? uid1 : uid2;
		const int other = (SeqLens[uid1] < SeqLens[uid2]) ? uid2 : uid1;
		return 1 - Global::kmer->KmerComp(KmerTabs[lead], KmerTabs[other], KmerSeqs[lead],SeqLens[lead]);
	}

	// Sequence vs. cluster: the sequence is always the first operand.
	if (firstIsSeq)
		return 1 - Global::kmer->KmerComp(KmerTabs[uid1], KmerTabs[uid2], KmerSeqs[uid1],SeqLens[uid1],ProbSeqs[uid2-SeqNum].Len(),ClRec[uid2].NumSeqs);

	if (secondIsSeq)
		return 1 - Global::kmer->KmerComp(KmerTabs[uid2], KmerTabs[uid1], KmerSeqs[uid2],SeqLens[uid2],ProbSeqs[uid1-SeqNum].Len(),ClRec[uid1].NumSeqs);

	// Cluster vs. cluster.
	return 1 - Global::kmer->KmerComp(KmerTabs[uid1],KmerTabs[uid2],ClRec[uid1].NumSeqs,ClRec[uid2].NumSeqs);
}
// Looks up a previously stored distance between two units.
//
// uid1, uid2: unit ids.
// dist:       receives the distance when the function returns true.
// Returns:    true when Global::cache already holds the pair, false otherwise.
//
// The original body contained a weighted-average estimator (deriving the
// distance of a merged cluster from the cached distances of the two units it
// was merged from) placed after an unconditional "return false", so it never
// ran. It is kept here behind ESPRIT_QUICKDIST_ESTIMATE, which is off by
// default so the behaviour is unchanged. All of its cache stores are now
// inside the CRI_CACHE critical section; the third branch previously was not.
bool Quickdist(int uid1, int uid2, float &dist)
{
	if (Global::cache->Get(uid1,uid2,dist))
		return true;

#ifdef ESPRIT_QUICKDIST_ESTIMATE
	float dist1,dist2,dist3,dist4;

	// uid1 is a merged cluster and both of its parts have a cached distance to uid2.
	if (ClRec[uid1].fromid1 >=0)
	{
		if (Global::cache->Get(ClRec[uid1].fromid1,uid2,dist1) && Global::cache->Get(ClRec[uid1].fromid2,uid2,dist2))
		{
			dist=(dist1*ClRec[ClRec[uid1].fromid1].NumSeqs+dist2*ClRec[ClRec[uid1].fromid2].NumSeqs)/ClRec[uid1].NumSeqs;
			#pragma omp critical (CRI_CACHE)
			Global::cache->Store(uid1,uid2,dist);
			return true;
		}
	}
	// Same, with the roles of uid1 and uid2 swapped.
	if (ClRec[uid2].fromid1 >=0)
	{
		if (Global::cache->Get(ClRec[uid2].fromid1,uid1,dist1) && Global::cache->Get(ClRec[uid2].fromid2,uid1,dist2))
		{
			dist=(dist1*ClRec[ClRec[uid2].fromid1].NumSeqs+dist2*ClRec[ClRec[uid2].fromid2].NumSeqs)/ClRec[uid2].NumSeqs;
			#pragma omp critical (CRI_CACHE)
			Global::cache->Store(uid1,uid2,dist);
			return true;
		}
	}
	// Both are merged clusters and all four part-to-part distances are cached.
	if (ClRec[uid1].fromid1 >=0 && ClRec[uid2].fromid1 >=0)
	{
		if (Global::cache->Get(ClRec[uid1].fromid1,ClRec[uid2].fromid1,dist1) && Global::cache->Get(ClRec[uid1].fromid2,ClRec[uid2].fromid1,dist2) &&
		  Global::cache->Get(ClRec[uid1].fromid1,ClRec[uid2].fromid2,dist3) && Global::cache->Get(ClRec[uid1].fromid2,ClRec[uid2].fromid2,dist4))
		{
			dist=(dist1*ClRec[ClRec[uid1].fromid1].NumSeqs*ClRec[ClRec[uid2].fromid1].NumSeqs+dist2*ClRec[ClRec[uid1].fromid2].NumSeqs*ClRec[ClRec[uid2].fromid1].NumSeqs+
			  dist3*ClRec[ClRec[uid1].fromid1].NumSeqs*ClRec[ClRec[uid2].fromid2].NumSeqs+dist4*ClRec[ClRec[uid1].fromid2].NumSeqs*ClRec[ClRec[uid2].fromid2].NumSeqs)/
			  (ClRec[uid1].NumSeqs*ClRec[uid2].NumSeqs);
			#pragma omp critical (CRI_CACHE)
			Global::cache->Store(uid1,uid2,dist);
			return true;
		}
	}
#endif

	return false;
}

float NeedleDist(int uid1, int uid2, float level)
{
	float dist;
	if (Quickdist(uid1,uid2,dist))
		return dist;
	//if (Global::cache->Get(uid1,uid2,dist))
		//return dist;

	#pragma omp atomic
		NumAl++;
	char buf1[65535];
	char buf2[65535];
	if (uid1 <SeqNum && uid2 < SeqNum)
	{
		Global::needle->Align(SeqStrs[uid1],SeqStrs[uid2],buf1,buf2,min(level,Global::DiagRate));
		dist=calcDistance(buf1,buf2);
	}
	else if (uid1 < SeqNum)
	{
		ProbString al;
		Global::needle->Align(SeqStrs[uid1],ProbSeqs[uid2-SeqNum],buf1,al,min(level,Global::DiagRate));
		dist=al.AveDist(buf1);
	}
	else if (uid2 < SeqNum)
	{
		ProbString al;
		Global::needle->Align(SeqStrs[uid2],ProbSeqs[uid1-SeqNum],buf1,al,min(level,Global::DiagRate));
		dist=al.AveDist(buf1);
	}
	else
	{
		ProbString al1,al2;
		Global::needle->Align(ProbSeqs[uid1-SeqNum],ProbSeqs[uid2-SeqNum],al1,al2,min(level,Global::DiagRate));
		dist=al1.AveDist(al2);
	}
	#pragma omp critical (CRI_CACHE)
		Global::cache->Store(uid1,uid2,dist);
	return dist;
}

// Resets the first `tot` cluster records to their "unclustered" state.
//
// clRec: array of at least `tot` records.
// tot:   number of records to reset.
//
// Each record gets its own index as ID, no tree node, no parents (fromid1/2),
// no owning cluster (clsid), zero distErr, tagscan 0, is_outlier false and
// empty Seqlist/NNlist containers. NumSeqs is not touched.
//
// NOTE(review): the caller in this file passes storage obtained from Malloc();
// confirm that the container members are in a valid state before clear() is
// invoked on them.
void InitRecords(ClustRec *clRec,int tot)
{
	int uid=0;
	while (uid < tot)
	{
		ClustRec &rec=clRec[uid];

		// identity and hierarchy links
		rec.ID=uid;
		rec.Node=NULL;
		rec.clsid=-1;
		rec.fromid1=-1;
		rec.fromid2=-1;

		// scan state
		rec.distErr=0.0;
		rec.tagscan=0;
		rec.is_outlier=false;

		// membership and inverse nearest-neighbour lists
		rec.Seqlist.clear();
		rec.NNlist.clear();

		++uid;
	}
}

void MergeSeqs(int id1, int id2, int mergeid, ClustRec *clRec)
{
	ProbString pr1,pr2,al1,al2;
	int *ktab;
	
	if (id1 < SeqNum) 
	{
		pr1.FromString(SeqStrs[id1]); 
		ktab=Global::kmer->KmerCopy(KmerTabs[id1],Freq[id1]);
	}
	else 
	{
		pr1=ProbSeqs[id1-SeqNum];
		ktab=Global::kmer->KmerCopy(KmerTabs[id1],1);
	}
	if (id2 < SeqNum) 
	{
		pr2.FromString(SeqStrs[id2]); 
		Global::kmer->KmerAdd(ktab,KmerTabs[id2],Freq[id2]);
	}
	else 
	{
		pr2=ProbSeqs[id2-SeqNum];
		Global::kmer->KmerAdd(ktab,KmerTabs[id2],1);
	}
	KmerTabs[mergeid]=ktab;
	clRec[mergeid].NumSeqs=clRec[id1].NumSeqs+clRec[id2].NumSeqs;
	clRec[mergeid].fromid1=id1;
	clRec[mergeid].fromid2=id2;
	
	Global::needle->Align(pr1,pr2,al1,al2,MAX_DIST);
	float wt1,wt2;
	wt1=(clRec[id1].NumSeqs+0.0)/clRec[mergeid].NumSeqs;
	wt2=(clRec[id2].NumSeqs+0.0)/clRec[mergeid].NumSeqs;
	al1*=wt1;
	al2*=wt2;
	al1 +=al2;
	
	if (clRec[mergeid].NumSeqs > 100.0/Global::ErrorRate) 
		al1.Rectify(Global::ErrorRate);
	ProbSeqs[mergeid-SeqNum]=al1;
}

// Creates the PBP tree and merges the bottom-layered nodes.
//
// myrank:       MPI rank of the caller. Rank 0 builds the tree and broadcasts
//               a compact description (one TreeRec per sequence); all other
//               ranks rebuild an identical tree from that description.
// treeclust:    tree to populate.
// clRec:        record array (at least SeqNum entries plus one per created
//               bottom-level cluster).
// clusterlevel: the band passed to the aligner while bottom-level profiles are
//               built is 2*clusterlevel.
//
// Side effects: sets ClsTop and NumCls, fills KmerTabs/ProbSeqs for every
// bottom-level cluster that holds more than one leaf, and updates clRec
// (Node, NumSeqs, Seqlist, clsid).
//
// Changes over the previous version:
//  * bRec is released before returning (it used to leak);
//  * an invalid hierarchy / tree record now aborts the MPI job instead of
//    dereferencing a NULL parent or indexing clRec with a bad parent id;
//  * the unused locals and the parallel region that only filled one of them
//    are gone, and the bottom-node loop compares signed with signed.
void DoPreCluster(int myrank, TreeClust &treeclust,ClustRec *clRec, float clusterlevel)
{
	Tree *node;
	
	node=treeclust.AddSeq(0);
	clRec[0].Node=node;
	// One (parID, linklevel) pair per sequence; broadcast below as 2*SeqNum ints.
	TreeRec *bRec=(TreeRec *)Malloc(SeqNum*sizeof(TreeRec));
	bRec[0].parID=-1;
	bRec[0].linklevel=-1;
	for (int i=0;i <SeqNum;i++)
	{
		clRec[i].NumSeqs=Freq[i];
		clRec[i].Seqlist.insert(i);
	}
	

	if (myrank==0)
	{
		// Index of the last sequence whose insertion has been completed.
		int procseq=1;
		Tree **TParent=(Tree **)Malloc(SeqNum*sizeof(Tree *));
		for (int i=0;i <SeqNum;i++)
			TParent[i]=NULL;
		
		// The parent lookup runs in parallel; the insertion itself is
		// serialised in index order by the ordered region.
		#pragma omp parallel for schedule(dynamic) ordered shared(procseq)
		for (int i=1;i <SeqNum;i++)
		{

			int myproc;
			#pragma omp flush(procseq)
			myproc=procseq;
			TParent[i]=treeclust.FindInsertParent(i);
			#pragma omp ordered
			{
				if (TParent[i]->BottomLevel())  // the node is merged with existing branch, no further comparison needed
				{
					node=treeclust.AddSeqAt(i,TParent[i]);
					clRec[i].Node=node;
				}
				else
				{
					// Sequences after the procseq snapshot taken above are
					// compared directly when they share the same parent.
					bool hasCl=false;
					for (int j=myproc+1;j<i;j++)
					{
						if (TParent[i]==TParent[j])     // node i may be potentially merged with node j
							if (Kdist2Ndist(KmerDist(i,j)) < TParent[i]->FirstChild()->GetThres())
							{
								node=treeclust.AddSeqFrom(i,j,TParent[i]);
								clRec[i].Node=node;
								hasCl=true;
								// NOTE(review): `par` is walked upwards but its final
								// value is never used; TParent[i] receives the direct
								// parent. Confirm which of the two was intended.
								Tree *par=node->GetParent();
								while (par->NumChildren()==1 && par->GetParent() !=NULL)
									par=par->GetParent();
								TParent[i]=node->GetParent();
								break;
							}
					}
					if (!hasCl)
					{
						node=treeclust.AddSeqAt(i,TParent[i]);
						clRec[i].Node=node;
					}
				}
				procseq=i;
			}
		}
		free(TParent);

		// Describe every leaf by the UID of the first ancestor carrying a
		// different UID and by the number of same-UID levels in between.
		for (int i=1;i <SeqNum;i++)
		{
			Tree *ptr=clRec[i].Node;
			bRec[i].linklevel=0;
			while (ptr->UID==ptr->GetParent()->UID)
			{
				bRec[i].linklevel++;
				ptr=ptr->GetParent();
				if (ptr->GetParent()==NULL) 
				{
					fprintf(stderr, "Invalid Hierarchy %d\n", ptr->UID);
					fflush(stderr);
					// The record cannot be described; the code used to carry on
					// and dereference the NULL parent below.
					MPI::COMM_WORLD.Abort(1);
				}
			}
			bRec[i].parID=ptr->GetParent()->UID;
		}
		MPI::COMM_WORLD.Bcast(bRec, 2*SeqNum, MPI::INT, 0);
	}
	else
	{

		MPI::COMM_WORLD.Bcast(bRec, 2*SeqNum, MPI::INT, 0);
		for (int i=1;i<SeqNum;i++)
		{
			if (bRec[i].parID <0 || bRec[i].parID >=i)
			{
				fprintf(stderr," Invalid Tree Record %d\n", i);
				fflush(stderr);
				// clRec[parID] would be out of range or not yet attached.
				MPI::COMM_WORLD.Abort(1);
			}
			Tree *parnode=clRec[bRec[i].parID].Node->GetParent();
			for (int j=0;j<bRec[i].linklevel;j++)
				parnode=parnode->GetParent();
			clRec[i].Node=treeclust.AddSeqAt(i,parnode);
		}
	}
	free(bRec);
	
	// Merge bottom nodes and create prob-sequences

	ClsTop=SeqNum;
	NumCls=SeqNum;


	vector<Tree *> tvec;

	treeclust.ListBottom(tvec);
	const int tvecsize = static_cast<int>(tvec.size());
	
	#pragma omp parallel
	{
		// Per-thread scratch objects, reused across iterations.
		ProbString TCenter;
		ProbString al1,al2;
		int *ktab;
		char buf[65535];
		vector<Tree *> tchild;
		vector<Tree *>::iterator it2;
		set<unsigned int> mylist;
		
		#pragma omp for schedule(dynamic) ordered
		for (int i=0;i<tvecsize;i++)
		{
			mylist.clear();
			
			Tree *cur=tvec[i];
			tchild.clear();
			cur->ListLeaf(tchild);
			int NumLeaf=tchild.size();
			if (NumLeaf >1)
			{
				// Seed the profile and the k-mer table with the first leaf.
				it2=tchild.begin();
				TCenter.FromString(SeqStrs[(*it2)->UID]);
				float wt1,wt2;
				int totseq=Freq[(*it2)->UID];
				
				ktab=Global::kmer->KmerCopy(KmerTabs[(*it2)->UID],Freq[(*it2)->UID]);
				mylist.insert((*it2)->UID);
				clRec[(*it2)->UID].Seqlist.clear();
				it2++;
				
				// Fold the remaining leaves in, weighting by sequence counts.
				for (;it2!=tchild.end();it2++)
				{
					Tree *leaf=*it2;
					mylist.insert(leaf->UID);
					clRec[leaf->UID].Seqlist.clear();
					clRec[leaf->UID].Node=NULL;
					Global::kmer->KmerAdd(ktab,KmerTabs[leaf->UID],Freq[leaf->UID]);
					wt1=(totseq+0.0)/(totseq+Freq[leaf->UID]);
					wt2=(Freq[leaf->UID]+0.0)/(totseq+Freq[leaf->UID]);
					Global::needle->Align(SeqStrs[leaf->UID],TCenter,buf,al1,2*clusterlevel);
					#pragma omp atomic
							NumAl++;
					al1*=wt1;
					al2.FromString(buf);
					al2*=wt2;
					al1+=al2;
					TCenter=al1;
					totseq+=Freq[leaf->UID];	
				}
				// Cluster ids are handed out in loop order, one thread at a time.
				#pragma omp ordered
				{
					KmerTabs[ClsTop]=ktab;
					ProbSeqs[ClsTop-SeqNum]=TCenter;
					clRec[ClsTop].Node=treeclust.Condense(cur,ClsTop);
					clRec[ClsTop].NumSeqs=totseq;
					clRec[ClsTop].Seqlist.clear();
					clRec[ClsTop].Seqlist.insert(mylist.begin(),mylist.end());
					for (it2=tchild.begin();it2!=tchild.end();it2++)
						clRec[(*it2)->UID].clsid=ClsTop;
					ClsTop++;
					NumCls-=(NumLeaf-1);
				}
			}
		}
	}
	
}

// Re-orders UpdList in place so that its entries are dealt round-robin to
// `numproc` contiguous segments.
//
// UpdList: array holding `updptr` entries.
// updptr:  number of valid entries.
// numproc: number of segments (one per MPI process).
//
// Segment p starts at p*seglen+min(p,segext), where seglen=updptr/numproc and
// segext=updptr%numproc; this is the same split the callers use to derive
// mystart/myend. Entry k of the input ends up as element k/numproc of segment
// k%numproc, so every segment receives an evenly spread sample of the list.
//
// An empty list, a NULL pointer or a non-positive process count leaves the
// array untouched. The old code issued a zero-byte allocation for an empty
// list and divided by zero for numproc == 0.
void DoShuffle(int *UpdList,int updptr,int numproc)
{
	if (UpdList == NULL || updptr <= 0 || numproc <= 0)
		return;

	const int seglen=updptr/numproc;
	const int segext=updptr-seglen*numproc;

	std::vector<int> shuffled(updptr);
	for (int src=0;src<updptr;src++)
	{
		const int owner=src%numproc;   // segment receiving this entry
		const int slot=src/numproc;    // position inside that segment
		const int segstart=owner*seglen+(owner < segext ? owner : segext);
		shuffled[segstart+slot]=UpdList[src];
	}

	for (int i=0;i<updptr;i++)
		UpdList[i]=shuffled[i];
}

void MPI_Cluster(int myrank,int numthreads, int numproc, float clusterlevel,fstream &flist,fstream &fstat,fstream &fgroup,fstream &ftree)
{

	ClRec=(ClustRec *)Malloc(2*SeqNum*sizeof(ClustRec));
	SwapRec * UpdRec=(SwapRec *)Malloc(2*SeqNum*sizeof(SwapRec));
	int *UpdList=(int *)Malloc(SeqNum*sizeof(int));	
	
	MAX_DIST=Global::level_max+Global::level_step;
	
	ProbSeqs= new ProbString[SeqNum];
	TreeClust treeclust(max(clusterlevel,Global::level_min/Global::level_inc),Global::level_max);
	InitRecords(ClRec,2*SeqNum);

	if (myrank==0)
	{
		NNRec=(SwapRec *)Malloc(2*SeqNum*sizeof(SwapRec));
		for (int i=0;i<2*SeqNum;i++)
		{
			NNRec[i].NNseq=-1;
			NNRec[i].NNseq2=-1;
			NNRec[i].NNdist=-1;
			NNRec[i].NNdist2=-1;
		}
		MergeRec=(PairRec *)Malloc(SeqNum*sizeof(PairRec));
	}

    
	printf("Starting PreCluster\n");
	double w1=omp_get_wtime();
	DoPreCluster(myrank,treeclust,ClRec,clusterlevel);
	double w2=omp_get_wtime();
	printf("Node %d PreClustering Finished with %.6lf secs %d Clusters AL %ld KM %ld\n", myrank, w2-w1,NumCls, NumAl,NumKmer);
	
	if (myrank==0)
	{
		fstat.precision(5);
		for (int i=0; i<SeqNum;i++)
		{
			if (ClRec[i].clsid >0)
				ftree << SeqID[i] << "\t -1 \t" << ClRec[i].clsid << "\t" << fixed << clusterlevel<<endl;
		}
	}
	
	//FILE *flog;
	//char fname[50];
	//sprintf(fname,"log-%d.log",myrank);
	//flog=fopen(fname,"w");
	
	printf("Starting Find NN\n");
	int updcnt=0;
	for (int i=0;i<ClsTop;i++)
	{
		if (ClRec[i].clsid <0)
			UpdList[updcnt++]=i;
	}
	
	w1=omp_get_wtime();
	int seglen=updcnt/numproc;
	int segext=updcnt-seglen*numproc;
	int mystart=myrank*seglen+min(myrank,segext);
	int myend=(myrank+1)*seglen+min(myrank+1,segext);
	int *rcounts=(int *)Malloc(numproc*sizeof(int));
	int *rpos=(int *)Malloc(numproc*sizeof(int));
	
	rpos[0]=0;
	rcounts[0]=4*(seglen+min(1,segext));
	for (int i=1;i<numproc;i++)
	{
		rpos[i]=rpos[i-1]+rcounts[i-1];
		rcounts[i]=4*(seglen+min(i+1,segext)-min(i,segext));
	}
	
	#pragma omp parallel for schedule(dynamic)
	for (int i=mystart;i<myend;i++)
	{
		treeclust.EstimateNN(UpdList[i],ClRec,UpdRec+(i-mystart));    
		treeclust.FindNN(UpdList[i],UpdRec+(i-mystart),clusterlevel);
	}
	
	w2=omp_get_wtime();
	printf("Node %d FindNN Finished with %.6lf secs AL %ld KM %ld\n", myrank, w2-w1,NumAl,NumKmer);
	fflush(stdout);
	if (myrank==0)
	{
		MPI::COMM_WORLD.Gatherv(MPI_IN_PLACE,rcounts[myrank],MPI::INT,UpdRec,rcounts,rpos,MPI::INT,0);
		MPI::COMM_WORLD.Barrier();

		for (int i=0;i<updcnt;i++)
		{
			NNRec[UpdList[i]].NNseq=UpdRec[i].NNseq;
			NNRec[UpdList[i]].NNseq2=UpdRec[i].NNseq2;
			NNRec[UpdList[i]].NNdist=UpdRec[i].NNdist;
			NNRec[UpdList[i]].NNdist2=UpdRec[i].NNdist2;
			ClRec[UpdList[i]].tagscan=1;
		}
	}
	else
	{
		MPI::COMM_WORLD.Gatherv(UpdRec,rcounts[myrank],MPI::INT,UpdRec,rcounts,rpos,MPI::INT,0);
		MPI::COMM_WORLD.Barrier();
	}
	
	w1=omp_get_wtime();
	printf("Node %d Starting Clustering\n",myrank);
	fflush(stdout);
	
	int combuff[COMM_BUF_LEN];
	int nntail=0;
	set<unsigned int>::iterator itr;
	int Num_Outlier=0;
	
	if (myrank==0)
	{
		MinHeap mergeHeap;
		MinHeap WaitHeap;
		MinHeap findHeap;
		
		int num_merge;
		findHeap.Clear();
		WaitHeap.Clear();
		float cutoff;
		int updptr=0;
		volatile int findSize;
		volatile int isFindEmpty;
		
		do{
			for (int i=0;i<ClsTop;i++)
				ClRec[i].NNlist.clear();
			
			//rebuild NNlist
			for (int i=0;i<ClsTop;i++)
			{
				if(ClRec[i].clsid <0 && NNRec[i].NNseq >=0 && (!ClRec[i].is_outlier))
					ClRec[NNRec[i].NNseq].NNlist.insert(i);
			}	
			
			
			for (int i=0;i<ClsTop;i++)
			{
				//fprintf(flog,"%d - cls %d tag %d NN %d NNdist %f OT %d\n",i,ClRec[i].clsid,ClRec[i].tagscan,NNRec[i].NNseq,NNRec[i].NNdist,ClRec[i].is_outlier);
				//fflush(flog);
				if (ClRec[i].is_outlier || ClRec[i].clsid>=0 ||  ClRec[i].tagscan==0) continue;
				// remove isolated nodes to save search time
				if (NNRec[i].NNdist >=MAX_DIST)            
				{
					if (NNRec[i].NNseq >=0)
					{
						ClRec[NNRec[i].NNseq].NNlist.erase(i);
						NNRec[i].NNseq=-1;
					}
					ClRec[i].is_outlier=true;
					//BCast Delete node
					combuff[0]=TOK_DELMERGE;
					combuff[1]=i;
					combuff[2]=-1;
					combuff[3]=-1;
					MPI::COMM_WORLD.Bcast(combuff, COMM_BUF_LEN, MPI::INT, 0);
					MPI::COMM_WORLD.Barrier();
					treeclust.RemoveNode(ClRec[i].Node);
					//fprintf(flog,"New Outlier %d\n",i);
					//fflush(flog);
					ClRec[i].Node=NULL;
					Num_Outlier++;
					continue;
				}
				//perform tie-breaking to gain more mergable pairs
				if (ClRec[i].NNlist.find(NNRec[i].NNseq) == ClRec[i].NNlist.end())  
				{
					for (itr=ClRec[i].NNlist.begin();itr != ClRec[i].NNlist.end();itr++)
					{
						if (ClRec[*itr].tagscan==0) continue;
						
						if (ClRec[*itr].clsid >=0)
						{
							fprintf(stderr,"NN list error here\n");
						}
						if (NNRec[*itr].NNdist <= NNRec[i].NNdist)
						{
							if (NNRec[i].NNseq >=0)
								ClRec[NNRec[i].NNseq].NNlist.erase(i);
							NNRec[i].NNseq=*itr;
							ClRec[*itr].NNlist.insert(i);
							break;
						}
					}
				}
			}

			isFindEmpty=findHeap.Empty();
			volatile bool isWaitEmpty=WaitHeap.Empty();	

			cutoff=MAX_DIST;
			if (!isFindEmpty)
				cutoff=min(findHeap.Top(),cutoff);
			if (!isWaitEmpty)
				cutoff=min(WaitHeap.Top(),cutoff);
			
			mergeHeap.Clear();   //enroll all mergable pairs
		
			for (int i=0;i<ClsTop;i++)
			{
				if (ClRec[i].is_outlier || ClRec[i].clsid >=0) continue;
				if (NNRec[i].NNseq <0) continue;
				if (ClRec[i].tagscan==0 || ClRec[NNRec[i].NNseq].tagscan==0) continue;
				if (NNRec[NNRec[i].NNseq].NNseq ==i && NNRec[i].NNseq >i  && NNRec[i].NNdist <= cutoff)  
				{
					mergeHeap.Add((void *)(ClRec+i),NNRec[i].NNdist);
				}
			}
		
			volatile float mergedist;
			float minmergedist;
			num_merge=0;
			volatile bool ismergeEmpty;
			
			do{
				ismergeEmpty=mergeHeap.Empty();
				if (!ismergeEmpty)
				{
					ClustRec *thisRec;
					int id1,id2;
					mergedist=mergeHeap.Pop((void *&)thisRec);
					if (num_merge ==0)
						minmergedist=mergedist;
					if (mergedist <=Global::level_max)
					{
						num_merge++;
						id1=thisRec->ID;
						id2=NNRec[id1].NNseq;
					
						if (id2 <0 || ClRec[id2].clsid >=0)
						{
							fprintf(stderr,"Illegal pair here %d-%d\n", id1,id2);
							fflush(stderr);
						}
						//#BCast MERGE & DELETE
						combuff[0]=TOK_DELMERGE;
						combuff[1]=id1;
						combuff[2]=id2;
						combuff[3]=ClsTop;
						MPI::COMM_WORLD.Bcast(combuff, COMM_BUF_LEN, MPI::INT, 0);
						MPI::COMM_WORLD.Barrier();

						MergeSeqs(id1,id2,ClsTop, ClRec);
						MergeRec[nntail].id1=id1;
						MergeRec[nntail].id2=id2;
						MergeRec[nntail++].dist=NNRec[id1].NNdist;
						NumCls--;
						if (!ClRec[id1].is_outlier)
						{
							treeclust.RemoveNode(ClRec[id1].Node);
						}
						if (!ClRec[id2].is_outlier)
						{
							treeclust.RemoveNode(ClRec[id2].Node);
						}
						ClRec[id1].Node=NULL;
						ClRec[id2].Node=NULL;
						ClRec[id1].clsid=ClsTop;
						ClRec[id2].clsid=ClsTop;
						Tree *node=treeclust.AddSeq(ClsTop);
						ClRec[ClsTop].Node=node;
						ClRec[ClsTop].ID=ClsTop;
						ClRec[ClsTop].clsid=-1;
						ClRec[ClsTop].NNlist.clear();
						ClRec[ClsTop].Seqlist.clear();
						NNRec[ClsTop].NNseq=-1;
						NNRec[ClsTop].NNseq2=-1;
						NNRec[ClsTop].NNdist=MAX_DIST;
						NNRec[ClsTop].NNdist2=MAX_DIST;
						treeclust.EstimateNN(ClsTop,ClRec,NNRec+ClsTop);
						//fprintf(flog,"Merge %d + %d -> %d\n",id1,id2,ClsTop);
						//fflush(flog);

						if (NNRec[ClsTop].NNseq >= 0)
							ClRec[NNRec[ClsTop].NNseq].NNlist.insert(ClsTop);

						WaitHeap.Add((void *)(ClRec+ClsTop),NNRec[id1].NNdist);
						ClRec[ClsTop].tagscan=0;

						//enroll the inverse NN list of id1 to the wait list
						for (itr=ClRec[id1].NNlist.begin(); itr !=ClRec[id1].NNlist.end();itr++)
						{
							if (*itr==id2) continue;
							float olddist=NNRec[*itr].NNdist;
							int nseq=NNRec[*itr].NNseq2;
							if(nseq >=0 && ClRec[nseq].clsid <0)
							{
								NNRec[*itr].NNseq=nseq;
								NNRec[*itr].NNdist=NNRec[*itr].NNdist2;
								ClRec[nseq].NNlist.insert(*itr);
							}
							else
							{
								NNRec[*itr].NNseq=-1;
								NNRec[*itr].NNdist=MAX_DIST;
							}
							NNRec[*itr].NNseq2=-1;
							NNRec[*itr].NNdist2=MAX_DIST;
							ClRec[*itr].tagscan=0;
							WaitHeap.Add((void *)(&ClRec[*itr]),olddist);
							/*							
							// estimate possible NN dists for new node
							float rdist,cdist;
							if (Global::cache->Get(*itr,id2,cdist))
							{
								rdist=(ClRec[id1].NumSeqs*NNRec[*itr].NNdist+ClRec[id2].NumSeqs*cdist)/(ClRec[id1].NumSeqs+ClRec[id2].NumSeqs);
								Global::cache->Store(ClsTop,*itr,rdist);
								if (rdist < NNRec[*itr].NNdist)
								{
									if (NNRec[*itr].NNseq >=0)
									{
										ClRec[NNRec[*itr].NNseq].NNlist.erase(*itr);
									}
									NNRec[*itr].NNseq2=NNRec[*itr].NNseq;
									NNRec[*itr].NNdist2=NNRec[*itr].NNdist;
									NNRec[*itr].NNdist=rdist;
									NNRec[*itr].NNseq=ClsTop;
									ClRec[ClsTop].NNlist.insert(*itr);
								}
								else if (rdist <NNRec[*itr].NNdist2)
								{
									NNRec[*itr].NNdist2=rdist;
									NNRec[*itr].NNseq2=ClsTop;
								}
							
								if (rdist <NNRec[ClsTop].NNdist)
								{
									if (NNRec[ClsTop].NNseq >=0)
									{
										ClRec[NNRec[ClsTop].NNseq].NNlist.erase(ClsTop);
									}
									NNRec[ClsTop].NNseq2=NNRec[ClsTop].NNseq;
									NNRec[ClsTop].NNdist2=NNRec[ClsTop].NNdist;
									NNRec[ClsTop].NNdist=rdist;
									NNRec[ClsTop].NNseq=*itr;
									ClRec[*itr].NNlist.insert(ClsTop);
								}
								else if (rdist < NNRec[ClsTop].NNdist2)
								{
									NNRec[ClsTop].NNdist2=rdist;
									NNRec[ClsTop].NNseq2=*itr;
								}
							}*/
						}

						for (itr=ClRec[id2].NNlist.begin(); itr !=ClRec[id2].NNlist.end();itr++)
						{
							if (*itr==id1) continue;
							float olddist=NNRec[*itr].NNdist;
							int nseq=NNRec[*itr].NNseq2;
							if(nseq >=0 && ClRec[nseq].clsid <0)
							{
								NNRec[*itr].NNseq=nseq;
								NNRec[*itr].NNdist=NNRec[*itr].NNdist2;
								ClRec[nseq].NNlist.insert(*itr);
							}
							else
							{
								NNRec[*itr].NNseq=-1;
								NNRec[*itr].NNdist=MAX_DIST;
							}
							NNRec[*itr].NNseq2=-1;
							NNRec[*itr].NNdist2=MAX_DIST;
							ClRec[*itr].tagscan=0;
							WaitHeap.Add((void *)(&ClRec[*itr]),olddist);
							
							/*// estimate possible NN dists for new node
							float rdist,cdist;
							if (Global::cache->Get(*itr,id1,cdist))
							{
								rdist=(ClRec[id2].NumSeqs*NNRec[*itr].NNdist+ClRec[id1].NumSeqs*cdist)/(ClRec[id1].NumSeqs+ClRec[id2].NumSeqs);
								Global::cache->Store(ClsTop,*itr,rdist);
								if (rdist < NNRec[*itr].NNdist)
								{
									if (NNRec[*itr].NNseq >=0)
									{
										ClRec[NNRec[*itr].NNseq].NNlist.erase(*itr);
									}
									NNRec[*itr].NNseq2=NNRec[*itr].NNseq;
									NNRec[*itr].NNdist2=NNRec[*itr].NNdist;
									NNRec[*itr].NNdist=rdist;
									NNRec[*itr].NNseq=ClsTop;
									ClRec[ClsTop].NNlist.insert(*itr);
								}
								else if (rdist <NNRec[*itr].NNdist2)
								{
									NNRec[*itr].NNdist2=rdist;
									NNRec[*itr].NNseq2=ClsTop;
								}
							
								if (rdist <NNRec[ClsTop].NNdist)
								{
									if (NNRec[ClsTop].NNseq >=0)
									{
										ClRec[NNRec[ClsTop].NNseq].NNlist.erase(ClsTop);
									}
									NNRec[ClsTop].NNseq2=NNRec[ClsTop].NNseq;
									NNRec[ClsTop].NNdist2=NNRec[ClsTop].NNdist;
									NNRec[ClsTop].NNdist=rdist;
									NNRec[ClsTop].NNseq=*itr;
									ClRec[*itr].NNlist.insert(ClsTop);
								}
								else if (rdist < NNRec[ClsTop].NNdist2)
								{
									NNRec[ClsTop].NNdist2=rdist;
									NNRec[ClsTop].NNseq2=*itr;
								}
							}*/
						}
					}
				}
				ClsTop++;
				ismergeEmpty=mergeHeap.Empty();
				if (ismergeEmpty)
				{
					isWaitEmpty=WaitHeap.Empty();
					while (!isWaitEmpty)
					{
						ClustRec *tRec;
						float checkdist=WaitHeap.Pop((void *&)tRec);
						if (tRec->tagscan==0)
						{
							findHeap.Add((void *)tRec,checkdist);
						}
						isWaitEmpty=WaitHeap.Empty();
					}
				}
				else
				{
					volatile float checkdist=WaitHeap.Top();
					float compdist=min(mergeHeap.Top(),Global::level_max);
					isWaitEmpty=WaitHeap.Empty();
					while (!isWaitEmpty && checkdist *0.95 < compdist)
					{
						volatile ClustRec *tRec;
						checkdist=WaitHeap.Pop((void *&)tRec);
						if (tRec->tagscan==0)
						{
							findHeap.Add((void *)tRec,checkdist);
						}
						checkdist=WaitHeap.Top();
						isWaitEmpty=WaitHeap.Empty();
					}
				}
				findSize=findHeap.Size();
			}while (findSize < numthreads*numproc *40 && (!ismergeEmpty));	
			
			//fprintf(flog,"NumMerge %d findHeap %d WaitHeap %d MergeHeap %d Cls %d\n",num_merge,findHeap.Size(), WaitHeap.Size(),mergeHeap.Size(),NumCls);
		//	fflush(flog);
			updptr=0;
			
			volatile ClustRec *tRec;
			isFindEmpty=findHeap.Empty();
			do 
			{
				
				if (!isFindEmpty)
				{
					float popdist=findHeap.Pop((void *&)tRec);
					if (tRec->tagscan==0)
					{
						UpdList[updptr++]=tRec->ID;
						tRec->tagscan=1;
					}
				}
				isFindEmpty=findHeap.Empty();
			}while (updptr < 20*numproc*numthreads && (!isFindEmpty));
			//#BCAST SENDUPD
			combuff[0]=TOK_UPD;
			combuff[1]=updptr;
			float *fptr=(float *)&combuff[2];
			*fptr=minmergedist;
			MPI::COMM_WORLD.Bcast(combuff, COMM_BUF_LEN, MPI::INT, 0);
			MPI::COMM_WORLD.Barrier();

			DoShuffle(UpdList,updptr,numproc);
			
			//#BCAST UPDLIST
			MPI::COMM_WORLD.Bcast(UpdList, updptr, MPI::INT, 0);
			MPI::COMM_WORLD.Barrier();
			
			seglen=updptr/numproc;
			segext=updptr-seglen*numproc;
			mystart=myrank*seglen+min(myrank,segext);
			myend=(myrank+1)*seglen+min(myrank+1,segext);
	
			rpos[0]=0;
			rcounts[0]=4*(seglen+min(1,segext));
			for (int i=1;i<numproc;i++)
			{
				rpos[i]=rpos[i-1]+rcounts[i-1];
				rcounts[i]=4*(seglen+min(i+1,segext)-min(i,segext));
			}

			for (int i=0;i<updptr;i++)
			{
				UpdRec[i].NNseq=NNRec[UpdList[i]].NNseq;
				UpdRec[i].NNseq2=NNRec[UpdList[i]].NNseq2;
				UpdRec[i].NNdist=NNRec[UpdList[i]].NNdist;
				UpdRec[i].NNdist2=NNRec[UpdList[i]].NNdist2;
			}
			MPI::COMM_WORLD.Scatterv(UpdRec,rcounts,rpos,MPI::INT, MPI_IN_PLACE,rcounts[myrank], MPI::INT, 0);
			MPI::COMM_WORLD.Barrier();
			
			//fprintf(flog,"Finding NN for %d seqs\n",updptr);
			//fflush(flog);
			#pragma omp parallel for schedule(dynamic)
			for (int i=mystart;i<myend;i++)
			{
				treeclust.FindNN(UpdList[i],UpdRec+(i-mystart),max(clusterlevel,minmergedist));
			}
			/*for (int i=mystart;i<myend;i++)
			{
				fprintf(flog,"%d FindNN Seq %d Dist %f\n",UpdList[i],UpdRec[i-mystart].NNseq,UpdRec[i-mystart].NNdist);
				fflush(flog);
			}*/
			
			MPI::COMM_WORLD.Gatherv(MPI_IN_PLACE,rcounts[myrank],MPI::INT,UpdRec,rcounts,rpos,MPI::INT,0);
			MPI::COMM_WORLD.Barrier();
			for (int i=0;i<updptr;i++)
			{
				NNRec[UpdList[i]].NNseq=UpdRec[i].NNseq;
				NNRec[UpdList[i]].NNseq2=UpdRec[i].NNseq2;
				NNRec[UpdList[i]].NNdist=UpdRec[i].NNdist;
				NNRec[UpdList[i]].NNdist2=UpdRec[i].NNdist2;
			}
			findSize=findHeap.Size();
		}while (num_merge > 0 || updptr + findSize > 0);
		combuff[0]=TOK_FINISH;
		MPI::COMM_WORLD.Bcast(combuff, COMM_BUF_LEN, MPI::INT, 0);
		MPI::COMM_WORLD.Barrier();
	}
	else
	{
		volatile bool is_finished=false;
		int token;
		int updptr;
		float minmergedist;
		
		do {
			MPI::COMM_WORLD.Bcast(combuff, COMM_BUF_LEN, MPI::INT, 0);
			MPI::COMM_WORLD.Barrier();
			token=combuff[0];
			switch (token) {
			case TOK_FINISH:
				is_finished=true;
				break;
			case TOK_DELMERGE:
				if (combuff[3] >=0)
				{
					ClsTop=combuff[3];
					int id1=combuff[1];
					int id2=combuff[2];
					MergeSeqs(id1,id2,ClsTop, ClRec);
					if (ClRec[id1].Node !=NULL)
					{
						treeclust.RemoveNode(ClRec[id1].Node);
					}
					if (ClRec[id2].Node !=NULL)
					{
						treeclust.RemoveNode(ClRec[id2].Node);
					}
					ClRec[id1].Node=NULL;
					ClRec[id2].Node=NULL;
					ClRec[id1].clsid=ClsTop;
					ClRec[id2].clsid=ClsTop;
					Tree *node=treeclust.AddSeq(ClsTop);
					ClRec[ClsTop].Node=node;
					ClRec[ClsTop].ID=ClsTop;
					ClRec[ClsTop].clsid=-1;
					ClsTop++;
					NumCls--;
				}
				else
				{
					treeclust.RemoveNode(ClRec[combuff[1]].Node);
					ClRec[combuff[1]].Node=NULL;
					Num_Outlier++;
				}
				break;
			case TOK_UPD:
				updptr = combuff[1];
				float *fptr=(float *)&combuff[2];
				minmergedist = *fptr;
				MPI::COMM_WORLD.Bcast(UpdList, updptr, MPI::INT, 0);
				MPI::COMM_WORLD.Barrier();
				seglen=updptr/numproc;
				segext=updptr-seglen*numproc;
				mystart=myrank*seglen+min(myrank,segext);
				myend=(myrank+1)*seglen+min(myrank+1,segext);
		
				rpos[0]=0;
				rcounts[0]=4*(seglen+min(1,segext));
				for (int i=1;i<numproc;i++)
				{
					rpos[i]=rpos[i-1]+rcounts[i-1];
					rcounts[i]=4*(seglen+min(i+1,segext)-min(i,segext));
				}
				
				MPI::COMM_WORLD.Scatterv(UpdRec,rcounts,rpos,MPI::INT,UpdRec,rcounts[myrank], MPI::INT, 0);
				MPI::COMM_WORLD.Barrier();
			
				#pragma omp parallel for schedule(dynamic)
				for (int i=mystart;i<myend;i++)
				{
					treeclust.FindNN(UpdList[i],UpdRec+(i-mystart),max(clusterlevel,minmergedist));
				}
				/*for (int i=mystart;i<myend;i++)
				{
					fprintf(flog,"%d FindNN Seq %d Dist %f\n",UpdList[i],UpdRec[i-mystart].NNseq,UpdRec[i-mystart].NNdist);
					fflush(flog);
				}*/
				MPI::COMM_WORLD.Gatherv(UpdRec,rcounts[myrank],MPI::INT,UpdRec,rcounts,rpos,MPI::INT,0);
				MPI::COMM_WORLD.Barrier();
				break;
			}
		}while (!is_finished);
	}
	
	w2=omp_get_wtime();
	printf("Node %d Clustering Finished with %.6lf secs NumCls %d NumOL %d AL %ld KM %ld\n", myrank, w2-w1,NumCls,Num_Outlier, NumAl,NumKmer);
	fflush(stdout);

	if (myrank==0)
	{
		printf("\nGenerating Outputs\n");
		fflush(stdout);
		
		float levelrep=Global::level_min;
		fgroup.precision(3);
		flist.precision(3);
		fstat.precision(3);
		int i;

		for (i=0;i<nntail;i++)
		{
			MergeRec[i].clstag=0;
			int mu1=MergeRec[i].id1 < SeqNum? SeqID[MergeRec[i].id1] : MergeRec[i].id1;
			int mu2=MergeRec[i].id2 < SeqNum? SeqID[MergeRec[i].id2] : MergeRec[i].id2;
			ftree << mu1 << "\t" << mu2 << "\t" << ClRec[MergeRec[i].id1].clsid << "\t" << fixed << MergeRec[i].dist <<endl;
		}
		do{
			for (i=0;i<nntail;i++)
			{
				if (MergeRec[i].clstag ==0 && MergeRec[i].dist <=levelrep)
				{
					if (ClRec[MergeRec[i].id1].clsid != ClRec[MergeRec[i].id2].clsid)
						fprintf(stderr,"Warning: Wrong Cluster Hierarchy!\n");
					if (!(ClRec[MergeRec[i].id1].Seqlist.empty() || ClRec[MergeRec[i].id2].Seqlist.empty()))
					{
						int mergeid=ClRec[MergeRec[i].id1].clsid;
						ClRec[mergeid].Seqlist.insert(ClRec[MergeRec[i].id1].Seqlist.begin(),ClRec[MergeRec[i].id1].Seqlist.end());
						ClRec[mergeid].Seqlist.insert(ClRec[MergeRec[i].id2].Seqlist.begin(),ClRec[MergeRec[i].id2].Seqlist.end());
						ClRec[MergeRec[i].id1].Seqlist.clear();
						ClRec[MergeRec[i].id2].Seqlist.clear();
						MergeRec[i].clstag=1;
					}
				}
			}
			int Clscnt=0;
			fgroup << fixed << levelrep << " |";
			fstat << fixed << levelrep << " ";
			for (i=0;i<ClsTop;i++)
			{
				if (!ClRec[i].Seqlist.empty())
				{
					Clscnt++;
					fstat << fixed << ClRec[i].NumSeqs<<" ";
					for (itr=ClRec[i].Seqlist.begin();itr!=ClRec[i].Seqlist.end();itr++)
					{
						if (itr !=ClRec[i].Seqlist.begin())
							fgroup <<" ";
						fgroup << SeqID[*itr];
					}
					fgroup << "|";
				}
			}
			
			fstat << endl;
			fgroup <<endl;	

			flist << fixed << levelrep << " " << Clscnt << endl;
			printf("Level %.3f\t OTUs %d\n",levelrep,Clscnt);
			levelrep+=Global::level_step;
		}while (levelrep <=Global::level_max);
	}
	
}



