/***********************************************************************
ESPRIT-Forest: Parallel Clustering of Massive Amplicon Sequence Data in Subquadratic Time 
by: Yunpeng Cai, Yijun Sun, Wei Zheng, Jin Yao and Yujie Yang  (C) 2016
Please kindly cite [Y.Cai et.al PLOS Comp. Biol. 2016]

THE LICENSED WORK IS PROVIDED UNDER THE TERMS OF THE ADAPTIVE PUBLIC LICENSE ("LICENSE") AS FIRST COMPLETED BY: _Yunpeng Cai, Yijun Sun, Wei Zheng, Jin Yao, Yujie Yang_ [Insert the name of the Initial Contributor here]. ANY USE, PUBLIC DISPLAY, PUBLIC PERFORMANCE, REPRODUCTION OR DISTRIBUTION OF, OR PREPARATION OF DERIVATIVE WORKS BASED ON, THE LICENSED WORK CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS LICENSE AND ITS TERMS, WHETHER OR NOT SUCH RECIPIENT READS THE TERMS OF THE LICENSE. "LICENSED WORK" AND "RECIPIENT" ARE DEFINED IN THE LICENSE. A COPY OF THE LICENSE IS LOCATED IN THE TEXT FILE ENTITLED "LICENSE.TXT" ACCOMPANYING THE CONTENTS OF THIS FILE. IF A COPY OF THE LICENSE DOES NOT ACCOMPANY THIS FILE, A COPY OF THE LICENSE MAY ALSO BE OBTAINED AT THE FOLLOWING WEB SITE: http://www.acsu.buffalo.edu/~yijunsun/lab/ESPRIT-Forest.html [Insert Initial Contributor's Designated Web Site here]

Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific language governing rights and limitations under the License.
*/

#include <string.h>
#include <stdlib.h>
#include "util.h"
#include "ProbModel.h"
#include <iostream>
#include "global.h"

using namespace std;

// One-hot probability vectors, one per symbol. The slot order is the same as
// the character order of RNAList below: '-', 'A', 'G', 'C', 'T', 'N', '_'.
float gaparr[]={1.0,0.0,0.0,0.0,0.0,0.0,0.0};
float Aarr[]={0.0,1.0,0.0,0.0,0.0,0.0,0.0};
float Garr[]={0.0,0.0,1.0,0.0,0.0,0.0,0.0};
float Carr[]={0.0,0.0,0.0,1.0,0.0,0.0,0.0};
float Tarr[]={0.0,0.0,0.0,0.0,1.0,0.0,0.0};
float Narr[]={0.0,0.0,0.0,0.0,0.0,1.0,0.0};
float endarr[]={0.0,0.0,0.0,0.0,0.0,0.0,1.0};

// Symbol for each probability slot; used to map characters to slot indices
// (via strchr) and slot indices back to characters.
static char RNAList[]="-AGCTN_";

// Global ProbChar constants built from the gap and end-gap one-hot vectors.
ProbChar gapChar(gaparr);
ProbChar EndgapChar(endarr);

// Default constructor. The body is empty: it does not write to probvect, so
// the vector is expected to be filled afterwards (e.g. through Assign()).
ProbChar::ProbChar()
{
}

// Constructs a ProbChar whose probability vector is a copy of the first
// Num_RNAType+3 entries of arr.
ProbChar::ProbChar(float arr[])
{
	for (int slot=0; slot<Num_RNAType+3; ++slot)
	{
		probvect[slot]=arr[slot];
	}
}

// Distance between this character and pc.
// For slot 0 and for every slot 1..Num_RNAType+1, adds this character's
// probability for the slot multiplied by
// (1 - pc's probability for the same slot - pc's probability in the last slot).
// This character's own last slot (index Num_RNAType+2) contributes nothing.
float ProbChar::Dist(ProbChar &pc){
	const int lastSlot=Num_RNAType+2;
	float total=probvect[0]*(1-pc[0]-pc[lastSlot]);
	int slot=1;
	while (slot<lastSlot)
	{
		total+=probvect[slot]*(1-pc[slot]-pc[lastSlot]);
		++slot;
	}
	return total;
}

// Overwrites the whole probability vector with the first Num_RNAType+3
// entries of arr.
void ProbChar::Assign(float arr[])
{
	int slot=0;
	while (slot<Num_RNAType+3)
	{
		probvect[slot]=arr[slot];
		slot++;
	}
}


// Returns the RNAList symbol of the slot holding the largest probability.
// On ties the lowest slot index wins, because only a strictly larger value
// replaces the current best.
char ProbChar::ToChar()
{
	int bestSlot=0;
	float bestProb=probvect[0];
	for (int slot=1; slot<Num_RNAType+3; ++slot)
	{
		if (probvect[slot]>bestProb)
		{
			bestSlot=slot;
			bestProb=probvect[slot];
		}
	}
	return RNAList[bestSlot];
}

char ProbChar::ToWildcard()
{
	if (probvect[Num_RNAType+2] > 1.0-EPS) return '_';
	if (probvect[0] > 1.0-EPS) return '-';
	if (isGap()) return '=';
	if (probvect[Num_RNAType+2] >0.5) return ',';
	if (probvect[0]+probvect[Num_RNAType+2] >0.5) return '+';
	if (probvect[1]/(1-probvect[0]-probvect[Num_RNAType+2]) >0.5) return 'A';
	if (probvect[2]/(1-probvect[0]-probvect[Num_RNAType+2]) >0.5) return 'G';
	if (probvect[3]/(1-probvect[0]-probvect[Num_RNAType+2]) >0.5) return 'C';
	if (probvect[4]/(1-probvect[0]-probvect[Num_RNAType+2]) >0.5) return 'T';
	if (probvect[5]/(1-probvect[0]-probvect[Num_RNAType+2]) >0.5) return 'N';
	float max1=0;
	float max2=0;
	float max3=0;
	int idx1;
	int idx2;
	int idx3;
	int i;
	
	for (i=1;i<5;i++)
		if (probvect[i] >max1)
		{
			max1=probvect[i];
			idx1=i;
		}
	for (i=1;i<5;i++)
		if (i !=idx1 && probvect[i] >max2)
		{
			max2=probvect[i];
			idx2=i;
		}
	if ((max1+max2)/(1-probvect[0]-probvect[Num_RNAType+2]) >0.5)
	{
		if (idx1==1 && idx2==2 || idx1==2 && idx2==1) return 'R';
		if (idx1==1 && idx2==3 || idx1==3 && idx2==1) return 'M';
		if (idx1==1 && idx2==4 || idx1==4 && idx2==1) return 'W';
		if (idx1==2 && idx2==3 || idx1==3 && idx2==2) return 'S';
		if (idx1==2 && idx2==4 || idx1==4 && idx2==2) return 'K';
		if (idx1==3 && idx2==4 || idx1==4 && idx2==3) return 'Y';
	}
	
	for (i=1;i<5;i++)
		if (i !=idx1 && i!=idx2 && probvect[i] >max3)
		{
			max3=probvect[i];
			idx3=i;
		}
	if ((max1+max2+max3)/(1-probvect[0]-probvect[Num_RNAType+2]) >0.75)
	{
		if (10-idx1-idx2-idx3 ==1) return 'B';
		if (10-idx1-idx2-idx3 ==2) return 'D';
		if (10-idx1-idx2-idx3 ==3) return 'H';
		if (10-idx1-idx2-idx3 ==4) return 'V';
	}
	return 'N';
}

// Releases the character array (if any) and leaves seq as NULL.
void ProbString::Clear()
{
	// delete[] on a null pointer does nothing, so no explicit test is needed.
	delete [] seq;
	seq=NULL;
}

// Discards the current contents and allocates a fresh array of len
// default-constructed ProbChar entries; the stored length becomes len.
void ProbString::Alloc(int len)
{
	// Drop the old array first and null the pointer before allocating again.
	delete [] this->seq;
	this->seq=NULL;

	this->len=len;
	this->seq=new ProbChar[len];
}

// Destructor: frees the character array owned by this string.
ProbString::~ProbString()
{
	delete [] seq;
	seq=NULL;
}

// Copy assignment: resizes this string to ps.Len() and copies every character.
// Returns a reference to this object (the assignment target).
ProbString & ProbString::operator =(ProbString &ps)
{
	// Self-assignment must be a no-op: Alloc() frees the array first, so
	// copying afterwards would read from the freshly allocated, unfilled array.
	if (this != &ps)
	{
		Alloc(ps.Len());

		for (int i=0;i<len;i++)
			seq[i]=ps[i];
	}
	return *this;
}

// Constructs a ProbString from a plain aligned sequence string.
// The conversion itself (symbol mapping and relabelling of leading/trailing
// gaps as end gaps) is done by FromString(), so both entry points always
// agree.
ProbString::ProbString(char *strseq)
{
	// Start from an empty state so the Alloc() inside FromString() has
	// nothing stale to free.
	seq=NULL;
	len=0;
	FromString(strseq);
}

// Replaces the contents of this string with the one-hot encoding of strseq.
//
// Each character becomes the matching one-hot vector: '-', 'A', 'G', 'C',
// 'T', 'N', and '_' (the end-gap symbol of RNAList). Any other character is
// stored as 'N' so that no position is left with an unset probability vector.
// Afterwards every leading and every trailing position for which isGap() is
// true is relabelled as an end gap.
void ProbString::FromString(char *strseq)
{
	Alloc(strlen(strseq));
	for (int i=0;i<len;i++)
	{
		switch (strseq[i]){
			case '-': seq[i].Assign(gaparr);
				break;
			case '_': seq[i].Assign(endarr);
				break;
			case 'A': seq[i].Assign(Aarr);
				break;
			case 'G': seq[i].Assign(Garr);
				break;
			case 'C': seq[i].Assign(Carr);
				break;
			case 'T': seq[i].Assign(Tarr);
				break;
			case 'N': seq[i].Assign(Narr);
				break;
			default:
				// Symbol outside the alphabet: record it as unknown rather
				// than leaving the vector unset.
				seq[i].Assign(Narr);
				break;
		}
	}

	// Leading gaps become end gaps.
	int ptr=0;
	while (ptr <len && seq[ptr].isGap())
	{
		seq[ptr++].Assign(endarr);
	}
	// Trailing gaps become end gaps.
	ptr=len-1;
	while (ptr >=0 && seq[ptr].isGap())
	{
		seq[ptr--].Assign(endarr);
	}
}

// Average per-column distance between this string and str.
//
// Only columns inside the overlap of the two non-gap regions are visited
// (from the later of the two first non-gap columns up to the earlier of the
// two last non-gap columns). Within that range:
//   * columns where both sides are gaps are skipped;
//   * columns where both sides are unknown count as one residue, distance 0;
//   * a run of consecutive gap columns on one side is treated as one event:
//     its column distances are averaged and it is counted with weight
//     prob_res (the largest product of non-end-gap probabilities in the run);
//   * any other column adds Dist() to the distance and the product of the
//     non-end-gap probabilities to the residue count.
// A gap run that is still open when the loop ends is not added.
// Returns distance/residuecount, or 1.0 when no residue was counted (this
// includes empty or all-gap operands).
float ProbString::AveDist(ProbString &str)
{
	int gap_alert1 = 0;	// length of the open gap run in this string
	int gap_alert2 = 0;	// length of the open gap run in str

	const int len1 = this->len;
	const int len2 = str.Len();
	int k;
	float residuecount = 0.0;
	float distance = 0.0;
	float accgap=0.0;	// summed column distances of the open gap run
	float prob_res=0.0;	// weight of the open gap run

	// Locate the non-gap region of each string. Every scan is bounded so an
	// all-gap or empty string yields an empty region instead of running off
	// the array.
	k = 0;
	while(k <len1 && seq[k].isGap()) k++;
	const int start_seq1 = k;
	k = 0;
	while(k <len2 && str[k].isGap()) k++;
	const int start_seq2 = k;

	k = len1-1;
	while(k >=0 && seq[k].isGap()) k--;
	const int end_seq1 = k+1;

	k = len2-1;
	while(k >=0 && str[k].isGap()) k--;
	const int end_seq2 = k+1;

	for( k=max(start_seq1, start_seq2); k <min(end_seq1, end_seq2); k++)
	{
		// Gap against gap carries no information.
		if (seq[k].isGap() && str[k].isGap())
		{
			continue;
		}

		// Unknown against unknown: close any open gap run, count one residue.
		if(seq[k].isUnknown() && str[k].isUnknown())
		{
			if (gap_alert1>0)
			{
				residuecount+=prob_res;
				//residuecount+=1;
				distance+=accgap/gap_alert1;
				accgap=0.0;
				prob_res=1.0;
			}
			if (gap_alert2>0)
			{
				residuecount+=prob_res;
				//residuecount+=1;
				distance+=accgap/gap_alert2;
				accgap=0.0;
				prob_res=1.0;
			}
			residuecount+=1.0;
			gap_alert1 = 0;
			gap_alert2 = 0;
			continue;
		}

		// Gap in this string only.
		if (seq[k].isGap())
		{
			if(gap_alert1 == 0)
			{
				// A new run starts here; first close a run open on the other side.
				if (gap_alert2>0)
				{
					residuecount+=prob_res;
					//residuecount+=1;
					distance+=accgap/gap_alert2;
				}
				accgap=0.0;
				gap_alert2 = 0;
				prob_res=0.0;
			}
			accgap+=seq[k].Dist(str[k]);
			prob_res=max(prob_res,(1-str[k].ProbEnd())*(1-seq[k].ProbEnd()));
			gap_alert1++;
			continue;
		}

		// Gap in str only.
		if ( str[k].isGap())
		{
			if(gap_alert2 == 0)
			{
				// A new run starts here; first close a run open on the other side.
				if (gap_alert1>0)
				{
					residuecount+=prob_res;
					//residuecount+=1;
					distance+=accgap/gap_alert1;
				}
				prob_res=0.0;
				accgap=0.0;
				gap_alert1 = 0;
			}
			accgap+=seq[k].Dist(str[k]);
			prob_res=max(prob_res,(1-str[k].ProbEnd())*(1-seq[k].ProbEnd()));
			gap_alert2++;
			continue;
		}

		// Residue against residue; also closes any open gap run.
		distance += seq[k].Dist(str[k]);
		residuecount+=(1.0-seq[k].ProbEnd())*(1.0-str[k].ProbEnd());
		if (gap_alert1>0)
		{
			distance+=accgap/gap_alert1;
			residuecount+=prob_res;
			//residuecount+=1;
			accgap=0.0;
			prob_res=1.0;
		}
		if (gap_alert2>0)
		{
			distance+=accgap/gap_alert2;
			residuecount+=prob_res;
			//residuecount+=1;
			accgap=0.0;
			prob_res=1.0;
		}
		gap_alert1 = 0;
		gap_alert2 = 0;
	}

  if (residuecount > 0) {
		distance = distance / residuecount;
  }
  else {
		distance = 1.0;
  }

  return distance;
}

// Average per-column distance between this string and a plain aligned
// sequence str ('-' marks a gap, 'N' an unknown residue).
//
// Only columns inside the overlap of the two non-gap regions are visited.
// Within that range:
//   * columns where both sides are gaps are skipped;
//   * columns where both sides are unknown count as one residue, distance 0;
//   * a run of consecutive gap columns on one side is treated as one event
//     whose accumulated distance is averaged over the run length;
//   * any other column adds 1 - P(symbol of str) - P(end gap) to the distance
//     and 1 - P(end gap) to the residue count.
// A character of str that is not in RNAList is scored as 'N'.
// A gap run that is still open when the loop ends is not added.
// Returns distance/residuecount, or 1.0 when no residue was counted (this
// includes empty or all-gap operands).
float ProbString::AveDist(char *str)
{
	int gap_alert1 = 0;	// length of the open gap run in this string
	int gap_alert2 = 0;	// length of the open gap run in str

	const int len1 = this->len;
	const int len2 = strlen(str);
	// Slot used for characters of str that RNAList does not contain.
	const int unknown_idx = (int) (strchr(RNAList,'N')-RNAList);
	int k;
	float residuecount = 0.0;
	float distance = 0.0;
	float accgap=0.0;	// accumulated distance of the open gap run
	float prob_res=0.0;	// weight of the open gap run

	// Locate the non-gap region of each sequence. Every scan is bounded so an
	// all-gap or empty sequence yields an empty region instead of reading
	// outside the buffers.
	k = 0;
	while(k <len1 && seq[k].isGap()) k++;
	const int start_seq1 = k;
	k = 0;
	while(k <len2 && str[k]=='-') k++;
	const int start_seq2 = k;
	k = len1-1;
	while(k >=0 && seq[k].isGap()) k--;
	const int end_seq1 = k+1;
	k = len2-1;
	while(k >=0 && str[k]=='-') k--;
	const int end_seq2 = k+1;

	for( k=max(start_seq1, start_seq2); k <min(end_seq1, end_seq2); k++) {

		// Gap against gap carries no information.
		if (seq[k].isGap() && str[k]=='-')
		{
			continue;
		}

		// Unknown against unknown: close any open gap run, count one residue.
	  	if(seq[k].isUnknown() && str[k]=='N')
		{
			if (gap_alert1>0)
			{
				distance+=accgap/gap_alert1;
				residuecount+=prob_res;
				//residuecount+=1;
				accgap=0.0;
				prob_res=1.0;
			}
			if (gap_alert2>0)
			{
				distance+=accgap/gap_alert2;
				// NOTE(review): every other run-closing site in this function
				// adds prob_res here; this one adds 1. Kept as found - confirm
				// which weight is intended.
				//residuecount+=prob_res;
				residuecount+=1;
				accgap=0.0;
				prob_res=1.0;
			}
			residuecount+=1;
			gap_alert1 = 0;
			gap_alert2 = 0;
			continue;
		}

		// Gap in this string only.
		if ( seq[k].isGap())
		{
			if(gap_alert1 == 0)
			{
				// A new run starts here; first close a run open on the other side.
				if (gap_alert2>0)
				{
					distance+=accgap/gap_alert2;
					residuecount+=prob_res;
					//residuecount+=1;
				}
				accgap=0.0;
				prob_res=0;
				gap_alert2 = 0;
			}
			accgap+=1-seq[k].ProbEnd();
			prob_res=max(prob_res,1-seq[k].ProbEnd());
			gap_alert1++;
			continue;
		}

		// Gap in str only.
    	if ( str[k]=='-')
		{
			if(gap_alert2 == 0)
			{
				// A new run starts here; first close a run open on the other side.
				if (gap_alert1 >0)
				{
					distance+=accgap/gap_alert1;
					residuecount+=prob_res;
					//residuecount+=1;
				}
				accgap=0.0;
				prob_res=0.0;
				gap_alert1 = 0;
			}
           	accgap+=1-seq[k].ProbGap();
			prob_res=max(prob_res,1-seq[k].ProbEnd());
			gap_alert2++;
			continue;
		}

		// Residue against residue; also closes any open gap run.
		// strchr() returns NULL for a character outside RNAList, so the
		// index must not be computed from it directly.
		const char *hit=strchr(RNAList,str[k]);
   	 	const int idx=(hit !=NULL) ? (int) (hit-RNAList) : unknown_idx;
     	distance+=1-seq[k][idx]-seq[k].ProbEnd();
		residuecount+=1.0-seq[k].ProbEnd();
		if (gap_alert1>0)
		{
			distance+=accgap/gap_alert1;
			residuecount+=prob_res;
			//residuecount+=1;
			accgap=0.0;
			prob_res=1.0;
		}
		if (gap_alert2>0)
		{
			distance+=accgap/gap_alert2;
			residuecount+=prob_res;
			//residuecount+=1;
			accgap=0.0;
			prob_res=1.0;
		}
		gap_alert1 = 0;
		gap_alert2 = 0;
	}

  if (residuecount > 0) {
		distance = distance / residuecount;
  }
  else {
		distance = 1.0;
  }

  return distance;
}

// Adds ps to this string position by position.
// Both strings must have the same length; otherwise a message is written to
// cerr and this string is left unchanged.
void ProbString::operator +=(ProbString &ps)
{
	if (len ==ps.Len())
	{
		int pos=0;
		while (pos<len)
		{
			seq[pos]+=ps[pos];
			++pos;
		}
		return;
	}
	cerr << "Cannot combine ProbString of different length" <<endl;
}
	
// Applies ProbChar's *= operator with factor w to every position.
void ProbString::operator *=(float w)
{
	int pos=0;
	while (pos<len)
	{
		seq[pos]*=w;
		++pos;
	}
}

// Adds weight wt to the slot selected by each character of str: position i
// of this string receives wt in the slot whose RNAList symbol is str[i].
//
// A character that RNAList does not contain is credited to the 'N' slot.
// If str is shorter than this string, the positions beyond its end are left
// untouched.
void ProbString::Add(char *str, float wt)
{
	const int unknownSlot=(int) (strchr(RNAList,'N')-RNAList);
	for (int i=0;i<len;i++)
	{
		// strchr() also matches the terminating NUL, which would give an
		// index past the last slot, so stop at the end of str explicitly.
		if (str[i]=='\0') break;

		const char *hit=strchr(RNAList,str[i]);
		const int idx=(hit !=NULL) ? (int) (hit-RNAList) : unknownSlot;
		seq[i][idx]+=wt;
	}
}

bool ProbString::Rectify(float thres)
{
	int numreserve=0;
	int i,j,ptr;
	for (i=0;i<len;i++)
	{
		int cntlive=0;
		for (j=1; j<=Num_RNAType+1;j++)
		{
			if (seq[i][j] > thres) cntlive++;
		}
		if (cntlive>0) numreserve++;
	}
	ProbChar *newseq = new ProbChar[numreserve];

	ptr=0;
	for (i=0;i<len;i++)
	{
		float denorm=0;
		int cntlive=0;
		for (j=1; j<=Num_RNAType+1;j++)
		{
			if (seq[i][j] > thres) cntlive++;
		}
		
		for (j=0; j<=Num_RNAType+2;j++)
		{
			if (seq[i][j] > thres) // || (j>0 && cntlive>1 &&seq[i][j] > thres/2))
			{
				newseq[ptr][j]=seq[i][j];
				denorm+=seq[i][j];
			}
			else
			{
				newseq[ptr][j]=0.0;
			}
		}
		if (cntlive>0)
		{
			newseq[ptr] *=1.0/denorm;
			ptr++;
			if (ptr >=numreserve) break;
		}
	}
	Clear();
	seq=newseq;
	bool dorec=(numreserve <len);
	len=numreserve;
	return dorec;
}

// Renders this string as a NUL-terminated C string, one ToChar() symbol per
// position. The buffer is obtained from Malloc(); presumably the caller is
// responsible for releasing it - confirm against util.h.
char* ProbString::ToString()
{
	char *text=(char *)Malloc((len+1)*sizeof(char));
	int pos=0;
	while (pos<len)
	{
		text[pos]=seq[pos].ToChar();
		++pos;
	}
	text[len]='\0';
	return text;
}

// Renders this string as a NUL-terminated C string, one ProbChar::ToWildcard()
// symbol per position. The buffer is obtained from Malloc(); presumably the
// caller is responsible for releasing it - confirm against util.h.
char* ProbString::ToWildcard()
{
	char *text=(char *)Malloc((len+1)*sizeof(char));
	int pos=0;
	while (pos<len)
	{
		text[pos]=seq[pos].ToWildcard();
		++pos;
	}
	text[len]='\0';
	return text;
}

// Writes chr as Num_RNAType+1 space-separated integers in 0..255.
// The first value is the combined probability of slot 0 and the last slot
// (gap + end gap); the following ones are slots 1..Num_RNAType. Slot
// Num_RNAType+1 is not written.
// Each probability is scaled by 255 and rounded to the nearest integer.
fstream &operator <<(fstream &out, ProbChar &chr)
{
	// Adding 1.0/510 before scaling is the same as adding 0.5 afterwards, so
	// the int cast rounds instead of truncating. The literal has to be
	// floating point: the integer expression 1/510 evaluates to 0.
	const double half_step=1.0/510;

	out << (int) ((chr[0]+chr[Num_RNAType+2]+half_step)*255) << " ";

	for (int i=1;i<=Num_RNAType;i++)
	{
		out << (int) ((chr[i]+half_step)*255) << " ";
	}
	return out;
}

// Writes str as one line: its length, a tab, then every character wrapped in
// square brackets and followed by a tab; the line ends with endl.
fstream &operator <<(fstream &out, ProbString &str)
{
	out << str.Len() << "\t";
	int pos=0;
	while (pos<str.Len())
	{
		// Kept as three statements: out<<"[" yields a plain ostream&, which
		// could not be passed on to the fstream& overload for ProbChar.
		out<<"[";
		out<<str[pos];
		out<<"]\t";
		++pos;
	}
	out << endl;
	return out;
}

// Reads Num_RNAType+1 integers from in and stores each, divided by 255, in
// slots 0..Num_RNAType of chr. The two remaining slots (Num_RNAType+1 and
// Num_RNAType+2) are set to 0.
fstream &operator >>(fstream &in,ProbChar &chr)
{
	int quantized;
	int slot=0;
	while (slot<Num_RNAType+1)
	{
		in >> quantized;
		chr[slot]=static_cast<float>(quantized)/255;
		++slot;
	}
	chr[Num_RNAType+2]=0;
	chr[Num_RNAType+1]=0;
	return in;
}

// Reads a ProbString from in: first the length, then that many characters in
// the format consumed by the ProbChar extraction operator.
// If the length cannot be read or is negative, str becomes empty and the
// stream's failbit is set.
fstream &operator >>(fstream &in,ProbString &str)
{
	int len=0;
	int i;
	in >> len;
	if (in.fail() || len <0)
	{
		// Never pass an unread or negative length on to the allocation.
		in.setstate(ios::failbit);
		str.Alloc(0);
		return in;
	}
	str.Alloc(len);
	for (i=0;i<len;i++)
	{
		in >>str[i];
	}
	return in;
}

