/***********************************************************************
ESPRIT-Forest: Parallel Clustering of Massive Amplicon Sequence Data in Subquadratic Time 
by: Yunpeng Cai, Yijun Sun, Wei Zheng, Jin Yao and Yujie Yang  (C) 2016
Please kindly cite [Y.Cai et.al PLOS Comp. Biol. 2016]

THE LICENSED WORK IS PROVIDED UNDER THE TERMS OF THE ADAPTIVE PUBLIC LICENSE ("LICENSE") AS FIRST COMPLETED BY: _Yunpeng Cai, Yijun Sun, Wei Zheng, Jin Yao, Yujie Yang_ [Insert the name of the Initial Contributor here]. ANY USE, PUBLIC DISPLAY, PUBLIC PERFORMANCE, REPRODUCTION OR DISTRIBUTION OF, OR PREPARATION OF DERIVATIVE WORKS BASED ON, THE LICENSED WORK CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS LICENSE AND ITS TERMS, WHETHER OR NOT SUCH RECIPIENT READS THE TERMS OF THE LICENSE. "LICENSED WORK" AND "RECIPIENT" ARE DEFINED IN THE LICENSE. A COPY OF THE LICENSE IS LOCATED IN THE TEXT FILE ENTITLED "LICENSE.TXT" ACCOMPANYING THE CONTENTS OF THIS FILE. IF A COPY OF THE LICENSE DOES NOT ACCOMPANY THIS FILE, A COPY OF THE LICENSE MAY ALSO BE OBTAINED AT THE FOLLOWING WEB SITE: http://www.acsu.buffalo.edu/~yijunsun/lab/ESPRIT-Forest.html [Insert Initial Contributor's Designated Web Site here]

Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific language governing rights and limitations under the License.
*/


#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <set>
#include <time.h>
#include <math.h>
#include <omp.h>

#include "FASTA.h"
#include "util.h"
#include "global.h"
#include "TreeClust.h"
// Number of pointer slots main allocates for SeqStrs and LabelStrs.
int SeqCount=10000000;
// Size of MPI::COMM_WORLD and this process's rank in it (set in main).
int numproc,myrank;

// Sequence strings filled by MPI_READ_FASTA; reordered by SortSeqs/ArrangeSeqs.
char **SeqStrs;
// Label strings filled by MPI_READ_FASTA (not reordered in this file).
char **LabelStrs;
// strlen() of each entry of SeqStrs, filled by SortSeqs/ArrangeSeqs.
int *SeqLens;
// Per-sequence frequency; initialised to 1 in main, optionally overwritten by LoadFreq.
int *Freq;

// Number of sequences returned by MPI_READ_FASTA.
int SeqNum;
// Per-sequence tables and arrays allocated in BuildKmer.
int **KmerTabs;
int **KmerSeqs;
// SeqID[i] is the pre-sort index of the sequence now stored at position i.
int *SeqID;
// Frequency-weighted mean sequence length (computed in SortSeqs/ArrangeSeqs).
float AveSeqLen;
// Sum of all entries of Freq.
int SumFreq;
// Set by the -p option, then recomputed in main before MPI_Cluster is called.
float clusterlevel=0.01;
// Set by the -v option.
bool verbose=false;

// Chunk size used by MPI_READ_FASTA for file reads and for its label/sequence scratch buffers.
#define bufsize 65535
// Characters MPI_READ_FASTA accepts as sequence residues; everything else is skipped.
char CodeTable[]="ATCGatcg";


/**
 * Collectively reads a FASTA file through MPI-IO and splits it into labels
 * and sequences.
 *
 * The file is consumed in chunks of bufsize characters with Read_all, so every
 * rank of MPI::COMM_WORLD must call this function. Label lines start with '>'
 * and end at the first CR or LF; sequence data runs until the next '>'. Only
 * characters listed in CodeTable are kept, converted to upper case.
 *
 * @param filename  path of the FASTA file.
 * @param labels    receives one Malloc'ed label string per record.
 * @param seqs      receives one Malloc'ed sequence string per record.
 *                  Both arrays are expected to hold SeqCount slots, which is
 *                  how main allocates them.
 * @return number of sequences stored, or 0 if the file does not start with '>'.
 *
 * A trailing record whose sequence is empty is not counted. Labels or
 * sequences longer than bufsize characters are truncated (with a warning)
 * instead of overrunning the scratch buffers.
 */
int MPI_READ_FASTA(char *filename, char **&labels,char **&seqs)
{
	// Capacity of labels/seqs as allocated by main (file-level global).
	extern int SeqCount;

	char buffer[bufsize+1];
	char LabelBuf[bufsize+1];
	char SeqBuf[bufsize+1];
	int seqcount=0;
	MPI::Status status;
	
	MPI::File fobj= MPI::File::Open(MPI::COMM_WORLD, filename,MPI::MODE_RDONLY, MPI::INFO_NULL); 
	fobj.Set_view(0, MPI::CHAR,MPI::CHAR,"native", MPI::INFO_NULL);
    
	int phase=0;		// 0: inside a label line, 1: inside sequence data
	int ptr=0;		// write position in LabelBuf (phase 0) or SeqBuf (phase 1)
	int idx=0;		// number of chunks consumed so far
	bool clipped=false;	// a label or sequence did not fit in its scratch buffer
	bool full=false;	// labels/seqs have no free slot left

	while (!full && fobj.Get_position() <fobj.Get_size())
	{
		fobj.Read_all(buffer, bufsize, MPI::CHAR, status);

		if (idx==0 && '>' != buffer[0])
		{
			fprintf(stderr,"Invalid file format, expected '>' to start FASTA label\n");
			fobj.Close();
			return 0;
		}
		// 64-bit arithmetic: idx*bufsize overflows int once the file exceeds 2 GB.
		long long remain=(long long)fobj.Get_size()-(long long)idx*bufsize;
		int cnt=(remain<(long long)bufsize) ? (int)remain : bufsize;
		idx++;

		for (int i=0;i<cnt;i++)
		{
			const char ch=buffer[i];

			// End of a label line: store the label and switch to sequence data.
			if (phase==0 && ('\r' == ch || '\n' == ch))
			{
				if (seqcount>=SeqCount)
				{
					fprintf(stderr,"Warning: more than %d sequences in input, the rest is ignored\n",SeqCount);
					full=true;
					break;
				}
				LabelBuf[ptr]=0;
				char *label=(char *)Malloc((ptr+1)*sizeof(char));
				strcpy(label,LabelBuf);
				labels[seqcount]=label;
				ptr=0;
				phase=1;
				continue;
			}
			// Start of the next record: store the sequence collected so far.
			// NOTE(review): the EOF comparison matches a 0xFF byte where char is
			// signed; kept as in the original - confirm whether it is wanted.
			if (phase==1 && ('>' == ch || EOF == ch))
			{
				SeqBuf[ptr]=0;
				char *seq=(char *)Malloc((ptr+1)*sizeof(char));
				strcpy(seq,SeqBuf);
				seqs[seqcount++]=seq; 
				ptr=0;
				phase=0;
				continue;
			}
			if (phase==0)
			{			
				if (ch!='>')
				{
					if (ptr<bufsize)
						LabelBuf[ptr++]=ch;
					else
						clipped=true;
				}
			}
			else
			{
				// strchr also matches the terminating NUL, so reject NUL bytes explicitly.
				if (ch!=0 && strchr(CodeTable,ch)!=NULL)
				{
					if (ptr<bufsize)
						SeqBuf[ptr++]=(char) toupper(ch);
					else
						clipped=true;
				}
			}
		}
	}
	// Flush the last sequence; only valid when the parser is in sequence data,
	// otherwise ptr counts label characters and SeqBuf holds nothing useful.
	if (phase==1 && ptr >0)
	{
		SeqBuf[ptr]=0;
		char *seq=(char *)Malloc((ptr+1)*sizeof(char));
		strcpy(seq,SeqBuf);
		seqs[seqcount++]=seq; 
	}
	if (clipped)
	{
		fprintf(stderr,"Warning: a label or sequence longer than %d characters was truncated\n",bufsize);
	}
	fobj.Close();
	return seqcount;
}

/**
 * Root side of the sort-order exchange: interleaves SeqID and SeqFreq into one
 * buffer of (id, frequency) pairs and broadcasts it from rank 0 over
 * MPI::COMM_WORLD.
 *
 * @param SeqNum   number of sequences (the buffer holds 2*SeqNum ints).
 * @param SeqID    ids to send.
 * @param SeqFreq  frequencies to send.
 */
void MPI_BCAST_SORTSEQ(int SeqNum,int *SeqID,int *SeqFreq)
{
	int *packed=(int *)Malloc(SeqNum*2*sizeof(int));

	// Layout: packed[2k] = id, packed[2k+1] = frequency.
	for (int k=0;k<SeqNum;k++)
	{
		packed[2*k]=SeqID[k];
		packed[2*k+1]=SeqFreq[k];
	}

	MPI::COMM_WORLD.Bcast(packed, 2*SeqNum,MPI::INT, 0); 
	free(packed);
}

/**
 * Receiving side of the sort-order exchange: takes part in the broadcast
 * rooted at rank 0 and unpacks the (id, frequency) pairs into SeqID and
 * SeqFreq.
 *
 * @param SeqNum   number of sequences (the buffer holds 2*SeqNum ints).
 * @param SeqID    filled with the received ids.
 * @param SeqFreq  filled with the received frequencies.
 */
void MPI_RECV_SORTSEQ(int SeqNum,int *SeqID,int *SeqFreq)
{
	int *packed=(int *)Malloc(SeqNum*2*sizeof(int));
	MPI::COMM_WORLD.Bcast(packed, 2*SeqNum,MPI::INT, 0); 

	// Layout: packed[2k] = id, packed[2k+1] = frequency.
	for (int k=0;k<SeqNum;k++)
	{
		SeqID[k]=packed[2*k];
		SeqFreq[k]=packed[2*k+1];
	}
	free(packed);
}

// One sequence as seen by the sorting code (SortSeqs, ArrangeSeqs, CompSeq).
struct SeqRec
{
	char *seq;	// sequence string (points into SeqStrs, not owned)
	int len;	// length of seq
	int frq;	// frequency of the sequence
	int id;		// index of the sequence before sorting
};

/**
 * Picks the k-mer configuration file that matches the average sequence length
 * and loads it with Global::LoadKList.
 *
 * The file name is "<prefix><Kmer_Len>.krate", where the prefix depends on
 * AveSeqLen. It is written in place over the file-name part of mypath, i.e.
 * everything after the last '/' or '\\', so the file is looked up in the
 * directory part of mypath.
 *
 * @param mypath  writable buffer holding a path; its file-name part is
 *                overwritten and must leave room for the generated name.
 */
void LoadKmerPar(char mypath[])
{
	char *kfile=mypath;

	// Find the start of the file-name part with a forward scan. This is also
	// safe for an empty path, where scanning backwards from the last character
	// would read before the buffer.
	char *tail=kfile;
	for (char *p=kfile;*p!=0;p++)
	{
		if (*p=='/' || *p=='\\') tail=p+1;
	}

	const char *fmt;
	if (AveSeqLen <=100)
	{
		fmt="l50k";
	}
	else if (AveSeqLen <=200)
	{
		fmt="l100k";
	}
	else if (AveSeqLen <400)
	{
		fmt="l200k";
	}
	else if (AveSeqLen <800)
	{
		fmt="l400k";
	}
	else
	{
		fmt="fulk";
	}
	sprintf(tail,"%s%d.krate",fmt,Global::Kmer_Len);
	printf("Used Kmer configure file: %s\n",kfile);
	Global::LoadKList(kfile);
}

/**
 * Loads per-sequence frequencies from a text file into Freq.
 *
 * Each entry is a whitespace-separated "<token> <integer>" pair; the n-th
 * integer becomes Freq[n]. The token itself is read and discarded.
 *
 * @param freqfile  path of the frequency file, or NULL to leave Freq untouched.
 *
 * Reading stops at end of file, at the first entry that does not parse as a
 * token followed by an integer, or once SeqNum entries have been stored, so
 * Freq is never written past its end and never receives a stale value.
 */
void LoadFreq(char *freqfile)
{
	// Number of entries in Freq (file-level global).
	extern int SeqNum;

	int i,fq,got;
	FILE *fp;
	char buf[65535];
	
	if (freqfile==NULL)
		return;
	
	if ((fp=fopen(freqfile,"r"))==NULL)
	{
		fprintf(stderr,"Cannot find frequency file\n");
		return;
	}
	i=0;
	// The field width keeps the token inside buf (65534 characters + NUL).
	while ((got=fscanf(fp,"%65534s %d",buf,&fq))!=EOF)
	{
		if (got!=2)
		{
			fprintf(stderr,"Malformed entry %d in frequency file, remaining entries ignored\n",i+1);
			break;
		}
		if (i>=SeqNum)
		{
			fprintf(stderr,"Frequency file has more than %d entries, extra entries ignored\n",SeqNum);
			break;
		}
		Freq[i++]=fq;
	}			
	fclose(fp);
}


int CompSeq(const void * a, const void * b)
{
	SeqRec *Seq1=(SeqRec *)a;
	SeqRec *Seq2=(SeqRec *)b;
	if (Seq1->len > AveSeqLen && Seq1->frq > Seq2->frq) return -1;
	if (Seq2->len > AveSeqLen && Seq2->frq > Seq1->frq) return 1;
	
	if (Seq1->len >Seq2->len) return -1;
	if (Seq1->len <Seq2->len) return 1;
	return 0;	
}

/**
 * Computes sequence lengths, SumFreq and AveSeqLen, then sorts the sequences
 * with CompSeq.
 *
 * After the call SeqStrs, SeqLens and Freq are in sorted order and SeqID[i]
 * holds the pre-sort index of the sequence now at position i.
 *
 * The weighted length is summed in double: a float accumulator loses
 * precision on large inputs and depends on summation order, while
 * ArrangeSeqs sums the same values in a different order on the other ranks.
 */
void SortSeqs()
{
	SeqRec *AllRec=(SeqRec *)Malloc(SeqNum*sizeof(SeqRec));
	
	double weightedLen=0.0;
	SumFreq=0;
	for (int i=0;i<SeqNum;i++)
	{
		SeqLens[i]=strlen(SeqStrs[i]);
		SumFreq+=Freq[i];
		// Multiply in double so length*frequency cannot overflow int.
		weightedLen+=(double)SeqLens[i]*(double)Freq[i];
		
		AllRec[i].seq=SeqStrs[i];
		AllRec[i].len=SeqLens[i];
		AllRec[i].frq=Freq[i];
		AllRec[i].id=i;
	}
	// CompSeq reads AveSeqLen, so it must be final before qsort runs.
	// With no sequences (SumFreq == 0) the average is defined as 0 instead of NaN.
	AveSeqLen=(SumFreq!=0) ? (float)(weightedLen/SumFreq) : 0.0f;

	qsort(AllRec,SeqNum,sizeof(SeqRec),CompSeq);
	for (int i=0;i<SeqNum;i++)
	{
		SeqStrs[i]=AllRec[i].seq;
		SeqLens[i]=AllRec[i].len;
		Freq[i]=AllRec[i].frq;
		SeqID[i]=AllRec[i].id;
	}
	free(AllRec);
}


/**
 * Reorders SeqStrs according to SeqID (new position i takes the sequence that
 * was at index SeqID[i]) and recomputes SeqLens, SumFreq and AveSeqLen.
 *
 * Freq is used as is; main fills it from MPI_RECV_SORTSEQ before calling this.
 *
 * The weighted length is summed in double so the result does not depend on
 * summation order, which differs from the order SortSeqs uses on rank 0.
 */
void ArrangeSeqs()
{
	SeqRec *AllRec=(SeqRec *)Malloc(SeqNum*sizeof(SeqRec));
	
	// Collect the pointers first: SeqStrs is both the source and the destination.
	for (int i=0;i<SeqNum;i++)
	{
		AllRec[i].seq=SeqStrs[SeqID[i]];
	}

	double weightedLen=0.0;
	SumFreq=0;
	for (int i=0;i<SeqNum;i++)
	{
		SeqStrs[i]=AllRec[i].seq;
		SeqLens[i]=strlen(SeqStrs[i]);
		SumFreq+=Freq[i];
		// Multiply in double so length*frequency cannot overflow int.
		weightedLen+=(double)SeqLens[i]*(double)Freq[i];
	}

	// With no sequences (SumFreq == 0) the average is defined as 0 instead of NaN.
	AveSeqLen=(SumFreq!=0) ? (float)(weightedLen/SumFreq) : 0.0f;
	free(AllRec);
}

/**
 * Allocates KmerTabs and KmerSeqs and fills one entry of each per sequence.
 *
 * For sequence s, KmerSeqs[s] gets room for (length - Kmer_Len + 1) ints,
 * KmerTabs[s] comes from Global::kmer->AllocCodeTable(), and both are passed
 * to Global::kmer->KmerCount together with the sequence string. The loop is
 * an OpenMP parallel for with dynamic scheduling.
 *
 * KmerTabs has 2*SeqNum slots, of which only the first SeqNum are set here.
 * NOTE(review): the upper half is presumably used by the clustering code -
 * confirm.
 */
void BuildKmer()
{
	KmerTabs=(int**)Malloc(2*SeqNum*sizeof(int *));
	KmerSeqs=(int**)Malloc(SeqNum*sizeof(int *));

#pragma omp parallel for schedule(dynamic)
	for (int s=0;s<SeqNum;s++)
	{
		const size_t seqBytes=(SeqLens[s]-Global::Kmer_Len+1)*sizeof(int);
		KmerSeqs[s]=(int *)Malloc(seqBytes);
		KmerTabs[s]=Global::kmer->AllocCodeTable();
		Global::kmer->KmerCount(SeqStrs[s],KmerTabs[s],KmerSeqs[s]);
	}
}


/**
 * Releases the per-sequence data: every SeqStrs[i], KmerTabs[i] and
 * KmerSeqs[i] for i < SeqNum, then the SeqStrs, SeqLens, KmerTabs, KmerSeqs
 * and Freq arrays themselves.
 *
 * Safe to call before BuildKmer has run (KmerTabs/KmerSeqs still NULL) and
 * safe to call twice: each global pointer is reset to NULL after it is freed.
 * LabelStrs and SeqID are not touched here.
 */
void Clear()
{
	for (int i=0;i<SeqNum;i++)
	{
		// free(NULL) is a no-op, so only the arrays themselves need a check.
		if (SeqStrs) free(SeqStrs[i]);
		if (KmerTabs) free(KmerTabs[i]);
		if (KmerSeqs) free(KmerSeqs[i]);
	}
	free(SeqStrs);
	free(SeqLens);
	free(KmerTabs);
	free(KmerSeqs);
	free(Freq);

	SeqStrs=NULL;
	SeqLens=NULL;
	KmerTabs=NULL;
	KmerSeqs=NULL;
	Freq=NULL;
}

/**
 * Prints the command-line synopsis to stdout.
 *
 * The option list mirrors the getopt string used in main
 * ("hvl:s:u:g:e:k:f:o:a:q:p:"); keep the two in sync.
 */
void Usage()
{
	printf("ESTclust [-h] [-v] [-l level_min] [-s level_step] [-u level_max] [-g gap_o] [-e gap_e] [-k kmer_len] [-p clusterlevel] [-a <alignfile>] [-q <probseqout>] [-o <outfile>] [-f <freqfile>] <seqfile> \n");
}

// Forward declaration of the clustering entry point; its definition is not in
// the visible part of this file. main calls it after BuildKmer, passing the
// rank, OpenMP thread count, process count, cluster level and the four output
// streams it opened.
void MPI_Cluster(int myrank,int numthreads, int numproc, float clusterlevel,fstream &flist,fstream &fstat,fstream &fgroup,fstream &ftree);

/**
 * Program entry point: parses options, reads the FASTA input on every rank,
 * sorts the sequences on rank 0 and broadcasts the order, builds the k-mer
 * tables and runs MPI_Cluster.
 *
 * Options (see the getopt string below):
 *   -h            print usage and exit
 *   -v            set verbose
 *   -l/-s/-u x    Global::level_min / level_step / level_max
 *   -g/-e x       Global::gap_o / gap_e
 *   -k n          Global::Kmer_Len
 *   -f file       frequency file passed to LoadFreq (rank 0 only)
 *   -a file       sets Global::ShowAlign (the file name itself is not used here)
 *   -q file       Global::ProbSeqOut
 *   -o prefix     base name for the output files (default: the input name)
 *   -p x          clusterlevel (recomputed before clustering, see below)
 *
 * Output files are derived from the base name with RepExt:
 * OTU, Cluster_List, Clusters and tree.
 *
 * @return 0 on success or after printing usage, 1 on error.
 */
int main (int argc, char **argv)
{
	// Headroom kept free in the fixed-size path buffers: LoadKmerPar and RepExt
	// rewrite the tail of the stored path in place.
	// NOTE(review): 64 covers the names generated in this file; confirm against RepExt.
	const size_t pathRoom=64;

	double starttime, endtime;
	fstream flist,fstat,fgroup,ftree;
	char buf[2000];
	int c;			// getopt returns int; a char never equals -1 where char is unsigned
	char *frfile = NULL;
	char *prefix = NULL;
	char mypath[2000];
	char *alfile = NULL;
	
	if (strlen(argv[0])+pathRoom >= sizeof(mypath))
	{
		fprintf(stderr,"Executable path too long\n");
		return 1;
	}
	strcpy(mypath,argv[0]);
	
	printf("ESPRIT-Forest: Parallel Clustering of Massive Amplicon Sequence Data in Subquadratic Time\n");
	printf("by: Yunpeng Cai, Yijun Sun, Wei Zheng, Jin Yao and Yujie Yang  (C) 2016\n");
	printf("Please kindly cite [Y.Cai et.al PLOS Comp. Biol. 2016]\n");
	printf("ESPRIT-Forest execution path %s\n",mypath);
	
	MPI::Init(argc,argv);
	starttime = MPI::Wtime();

	numproc=MPI::COMM_WORLD.Get_size();
	myrank=MPI::COMM_WORLD.Get_rank();
	
	while ((c = getopt(argc, argv, "hvl:s:u:g:e:k:f:o:a:q:p:")) != -1)
	{
		switch (c)
		{
		case 'h':
			Usage();
			// Every rank parses the same arguments, so all of them finalize here.
			MPI::Finalize();
			return 0;
		case 'v':
			verbose=true;
			break;
		case 'l':
			Global::level_min = atof(optarg);
			break;
		case 's':
			Global::level_step = atof(optarg);
			break;	
		case 'u':
			Global::level_max = atof(optarg);
			break;
		case 'g':
			Global::gap_o = atof(optarg);
			break;	
		case 'e':
			Global::gap_e = atof(optarg);
			break;	
		case 'k':
			Global::Kmer_Len = atoi(optarg);
			break;
		case 'f':
			frfile = optarg;	
			break;
		case 'a':
			Global::ShowAlign=true;
			alfile = optarg;
			break;
		case 'q':
			Global::ProbSeqOut=optarg;
			break;
		case 'o':
			prefix = optarg;
			break;
		case 'p':
			clusterlevel=atof(optarg);
			break;
		default:
			// getopt returns '?' here; the offending option character is in optopt.
			fprintf(stderr,"Unknown option -%c\n",optopt);
			break;
		}
	}
	argc -= optind;
	argv += optind;

	if (argc==0)
	{
		Usage();
		MPI::Finalize();
		return 0;
	}
	
	char *ifile=argv[0];
	const char *outbase=(prefix==NULL) ? ifile : prefix;
			
	// Passing NULL to %s is undefined behaviour, so substitute a printable string.
	printf("Input %s Freq %s from %.3f to %.3f\n",ifile,frfile ? frfile : "(null)",Global::level_min,Global::level_max);
	fflush(stdout);

	char filename[2000];
	if (strlen(ifile) >= sizeof(filename) || strlen(outbase)+pathRoom >= sizeof(buf))
	{
		fprintf(stderr,"Input or output file name too long\n");
		MPI::Finalize();
		return 1;
	}
	strcpy(filename,ifile);
	
	SeqStrs=(char **)Malloc(SeqCount*sizeof(char *));
	LabelStrs=(char **)Malloc(SeqCount*sizeof(char *));
	printf("Opening %s\n",filename);
	SeqNum=MPI_READ_FASTA(filename, LabelStrs,SeqStrs);
	endtime = MPI::Wtime();
	
	printf("Node %d read %d seqs in %f secs\n",myrank,SeqNum,endtime - starttime);

	// The read is collective over the same file, so every rank takes this branch together.
	if (SeqNum<=0)
	{
		fprintf(stderr,"No sequences read from %s\n",filename);
		MPI::Finalize();
		return 1;
	}
	
	SeqID=(int *)Malloc(SeqNum*sizeof(int));
	Freq=(int *) Malloc(SeqNum*sizeof(int));
	SeqLens=(int *)Malloc(SeqNum*sizeof(int));

	for (int i=0;i<SeqNum;i++) Freq[i]=1;
	
	// Rank 0 decides the order; the other ranks apply it to their own copy.
	if (myrank==0)
	{
		LoadFreq(frfile);
		SortSeqs();
		MPI_BCAST_SORTSEQ(SeqNum,SeqID,Freq);
	}
	else
	{
		MPI_RECV_SORTSEQ(SeqNum,SeqID,Freq);
		ArrangeSeqs();
	}
	
	MPI::COMM_WORLD.Barrier();
	printf("Total %d Reads Unique %d Average Len %.1f\n",SumFreq,SeqNum,AveSeqLen);
	
	strcpy(buf,outbase);
	
	// A failure here may hit a single rank only, so abort the whole job instead
	// of leaving the other ranks waiting in a collective call.
	RepExt(buf,"OTU");
	flist.open(buf,ios_base::out | ios_base::trunc);	
	if (flist.fail())
	{
		fprintf(stderr,"Cannot Create Output\n");
		MPI::COMM_WORLD.Abort(1);
		exit(1);
	}
	RepExt(buf,"Cluster_List");
	fstat.open(buf,ios_base::out | ios_base::trunc);	
	if (fstat.fail())
	{
		fprintf(stderr,"Cannot Create Output\n");
		MPI::COMM_WORLD.Abort(1);
		exit(1);
	}
	RepExt(buf,"Clusters");
	fgroup.open(buf,ios_base::out | ios_base::trunc);	
	if (fgroup.fail())
	{
		fprintf(stderr,"Cannot Create Output\n");
		MPI::COMM_WORLD.Abort(1);
		exit(1);
	}
	RepExt(buf,"tree");
	ftree.open(buf,ios_base::out | ios_base::trunc);	
	if (ftree.fail())
	{
		fprintf(stderr,"Cannot Create Output\n");
		MPI::COMM_WORLD.Abort(1);
		exit(1);
	}

	Global::Init(SeqNum);

	LoadKmerPar(mypath);	
	int numthreads=1;
	#pragma omp parallel
	{
		// One thread records the team size; letting every thread write it is a data race.
		#pragma omp single
		numthreads = omp_get_num_threads();
	}

	printf("Node %d # %d threads running\n",myrank,numthreads);
	double w1=omp_get_wtime();
	BuildKmer();
	double w2=omp_get_wtime();
	printf("\r%.6lf secs in Building Kmer. \n",w2-w1);
	fflush(stdout);
	
	// NOTE(review): this replaces any value given with -p - confirm that is intended.
	clusterlevel=min(3.0f/AveSeqLen,Global::level_min);
	
	w1=omp_get_wtime();
	MPI_Cluster(myrank,numthreads, numproc, clusterlevel,flist,fstat,fgroup,ftree);
	w2=omp_get_wtime();
	printf("%.6lf secs total in clustering. \n",w2-w1); 
	fflush(stdout);
	flist.close();
	fstat.close();
	fgroup.close();
	ftree.close();
	MPI::Finalize();

	free(SeqID);
	return 0;
}

