/***********************************************************************
ESPRIT-Forest: Parallel Clustering of Massive Amplicon Sequence Data in Subquadratic Time 
by: Yunpeng Cai, Yijun Sun, Wei Zheng, Jin Yao and Yujie Yang  (C) 2016
Please kindly cite [Y.Cai et.al PLOS Comp. Biol. 2016]

THE LICENSED WORK IS PROVIDED UNDER THE TERMS OF THE ADAPTIVE PUBLIC LICENSE ("LICENSE") AS FIRST COMPLETED BY: _Yunpeng Cai, Yijun Sun, Wei Zheng, Jin Yao, Yujie Yang_ [Insert the name of the Initial Contributor here]. ANY USE, PUBLIC DISPLAY, PUBLIC PERFORMANCE, REPRODUCTION OR DISTRIBUTION OF, OR PREPARATION OF DERIVATIVE WORKS BASED ON, THE LICENSED WORK CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS LICENSE AND ITS TERMS, WHETHER OR NOT SUCH RECIPIENT READS THE TERMS OF THE LICENSE. "LICENSED WORK" AND "RECIPIENT" ARE DEFINED IN THE LICENSE. A COPY OF THE LICENSE IS LOCATED IN THE TEXT FILE ENTITLED "LICENSE.TXT" ACCOMPANYING THE CONTENTS OF THIS FILE. IF A COPY OF THE LICENSE DOES NOT ACCOMPANY THIS FILE, A COPY OF THE LICENSE MAY ALSO BE OBTAINED AT THE FOLLOWING WEB SITE: http://www.acsu.buffalo.edu/~yijunsun/lab/ESPRIT-Forest.html [Insert Initial Contributor's Designated Web Site here]

Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific language governing rights and limitations under the License.
*/

#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include <math.h>
#include "util.h"
#include "kmer.h"
#include <algorithm>
#include <iostream>
#include <fstream>
#include <bitset>

using namespace std;
#define MINPROB 1e-10

// Sets up a k-mer coder for words of klen symbols over an alphabet of clen codes.
//   klen  stored in KmerLen
//   clen  stored in CodeLen
// TableLen is set to clen^klen, the number of distinct k-mer indices.
Kmer::Kmer(int klen, int clen)
	{
		CodeLen=clen;
		KmerLen=klen;
		// Exact integer power. pow() works in floating point, and truncating its
		// result to int can come out one short of the true value on some math
		// libraries. A negative klen gives 0, which is what truncating the
		// fractional pow() result produced for clen > 1.
		TableLen=(klen<0)?0:1;
		for (int i=0;i<klen;i++)
			TableLen*=clen;
		debug=0;
	}

// Accessor. Despite its name it returns TableLen (the number of entries in a
// code table), not KmerLen.
int Kmer::GetKmerLen()
{
	return TableLen;
}

// Destructor: intentionally empty, this file releases nothing here.
Kmer::~Kmer()
{
}
// Maps one sequence character (case-insensitive) to its numeric code:
//   A -> 0, G -> 1, C -> 2, T -> 3; any other character -> -1.
// Needs to be overridden if the sequence is not nucleotide.
int Kmer::KmerCode(char c) 
{
	// toupper() is undefined for negative arguments other than EOF, and plain
	// char may be signed, so convert through unsigned char first.
	switch (toupper(static_cast<unsigned char>(c)))
	{
		case 'A':
			return 0;
		case 'G':
			return 1;
		case 'C':
			return 2;
		case 'T':
			return 3;
		default:
			return -1;
	}
}

// Computes the index of the k-mer made of the first KmerLen characters of s,
// reading the character codes as digits of a base-CodeLen number.
// Returns:
//   >= 0           the k-mer index
//   -(i+1)         s[i] is the first character KmerCode() rejects (returns -1 for)
//   -(KmerLen+1)   s holds fewer than KmerLen characters
int Kmer::KmerIndex(char *s)
{
	// Only the first KmerLen characters matter, so look for an early terminator
	// there instead of measuring the whole remaining string with strlen().
	for (int i=0;i<KmerLen;i++)
	{
		if (s[i]=='\0')
			return -(KmerLen+1);
	}

	int idx=0;
	for (int i=0;i<KmerLen;i++)
	{
		const int code=KmerCode(s[i]);
		if (code==-1)
			return -(i+1);
		idx=idx*CodeLen+code;
	}
	return idx;
}

// Rolling update of a k-mer index: derives the index of the k-mer starting at s
// from lastidx by shifting in the code of s[KmerLen-1]; the modulo by TableLen
// drops the leading symbol.
// Returns:
//   >= 0           the new k-mer index
//   -KmerLen       s[KmerLen-1] is rejected by KmerCode()
//   -(KmerLen+1)   s holds fewer than KmerLen characters
int Kmer::KmerIndexNext(int lastidx, char *s)
{
	// Bounded terminator check: this runs once per sequence position from
	// KmerCount(), so a full strlen() here makes counting quadratic.
	for (int i=0;i<KmerLen;i++)
	{
		if (s[i]=='\0')
			return -(KmerLen+1);
	}

	const int code=KmerCode(s[KmerLen-1]);
	if (code==-1)
		return -KmerLen;
	return (lastidx*CodeLen % TableLen) + code;
}

// Allocates a table of TableLen ints through Malloc() and clears it to zero.
// The pointer is handed to the caller; it is not stored in the object.
int* Kmer::AllocCodeTable()
{
	const size_t nbytes=TableLen*sizeof(int);
	int *table=(int *)Malloc(nbytes);
	memset(table,0,nbytes);
	return table;
}
	
// Counts the k-mers of seq.
//   seq   NUL-terminated sequence
//   tab   count table; tab[idx] is incremented for every valid k-mer index idx
//   kseq  optional (may be NULL). When given, the valid k-mer indices are written
//         in order of occurrence and the remainder, up to
//         strlen(seq)-KmerLen+1 entries, is filled with -1.
// Windows containing a character that KmerCode() rejects are skipped.
void Kmer::KmerCount(char *seq, int *tab, int *kseq)
{
	// Number of k-mer start positions; <= 0 when seq is shorter than KmerLen.
	const int slen=static_cast<int>(strlen(seq))-KmerLen+1;
	if (slen<=0)
		return;		// nothing to count; also avoids forming seq+slen before the buffer start

	int *const kseq_begin=kseq;
	char *const seq_end=seq+slen;
	int idx=-1;		// negative: no valid previous k-mer, so the next index is computed from scratch

	while (seq<seq_end)
	{
		idx=(idx<0) ? KmerIndex(seq) : KmerIndexNext(idx,seq);
		if (idx<0)
		{
			// -idx is the distance that moves seq just past the rejected character.
			seq-=idx;
		}
		else
		{
			tab[idx]++;
			if (kseq!=NULL) *(kseq++)=idx;
			seq++;
		}
	}

	if (kseq!=NULL)
	{
		// Pad the unused tail so readers can tell skipped slots from real indices.
		int *const kseq_end=kseq_begin+slen;
		while (kseq<kseq_end)
			*(kseq++)=-1;
	}
}

float Kmer::KmerComp(int *tab1, int *tab2, int *kseq1,int seqlen)
/* assume that seq1 is shorter than seq2 */
{
	int ucomm=0;
	int i;
	int idx;
	bitset<65536> tag;
	
	
	if (kseq1==NULL)
	{
		for (i=0;i<TableLen;i++)
			ucomm+=min(tab1[i],tab2[i]);
	}	
	else
	{
		for (i=0;i<seqlen-KmerLen+1;i++)
		{
			idx=kseq1[i];
			if (idx>=0)
			if (!tag.test(idx))
				{
					ucomm+=min(tab1[idx],tab2[idx]);
					tag.set(idx);
				}		
		}
	}
	return (float) ucomm/(seqlen-KmerLen+1.0);
}

float Kmer::KmerComp(int *tab1, int *tab2, int *kseq1,int seqlen1,int seqlen2,int mul2)
/* seq1 is a single seq and seq2 is a compound seq */
{
	float ucomm=0;
	int i;
	int idx;
	bitset<65536> tag;
	
	
	if (kseq1==NULL)
	{
		for (i=0;i<TableLen;i++)
			ucomm+=min(tab1[i]+0.0,(tab2[i]+0.0)/mul2);
	}	
	else
	{
		for (i=0;i<seqlen1-KmerLen+1;i++)
		{
			idx=kseq1[i];
			if (idx>=0)
			if (!tag.test(idx))
				{
					ucomm+=min(tab1[idx]+0.0,(tab2[idx]+0.0)/mul2);
					tag.set(idx);
				}		
		}
	}
	return ucomm/(min(seqlen1,seqlen2)-KmerLen+1.0);
}

// Similarity between two count tables whose entries are scaled down by mul1 and
// mul2 respectively.
// Returns sum(min(tab1[i]/mul1, tab2[i]/mul2)) divided by the smaller of the two
// scaled totals, or 0 when either total is at most MINPROB.
float Kmer::KmerComp(int *tab1, int *tab2, int mul1, int mul2)
{
	float shared=0;
	float mass1=0;
	float mass2=0;

	for (int slot=0;slot<TableLen;++slot)
	{
		const float w1=(tab1[slot]+0.0)/mul1;
		const float w2=(tab2[slot]+0.0)/mul2;
		shared+=min(w1,w2);
		mass1+=w1;
		mass2+=w2;
	}

	// An (almost) empty table cannot be normalised.
	if (mass1 <=MINPROB || mass2 <=MINPROB)
		return 0;
	return shared/min(mass1,mass2);
}

// Builds a new table holding the weighted sum tab1[i]*mul1 + tab2[i]*mul2 for
// every one of the TableLen slots. The table comes from AllocCodeTable() and is
// returned to the caller.
int *Kmer::KmerAdd(int *tab1,int *tab2,int mul1,int mul2)
{
	int *sum=AllocCodeTable();
	int slot=0;
	while (slot<TableLen)
	{
		sum[slot]=tab1[slot]*mul1+tab2[slot]*mul2;
		++slot;
	}
	return sum;
}

// Builds a new table whose slots are tab1[i]*mul1 for every one of the TableLen
// slots. The table comes from AllocCodeTable() and is returned to the caller.
int *Kmer::KmerCopy(int *tab1,int mul1)
{
	int *scaled=AllocCodeTable();
	int slot=0;
	while (slot<TableLen)
	{
		scaled[slot]=tab1[slot]*mul1;
		++slot;
	}
	return scaled;
}

// Accumulates tab1 scaled by mul1 into dest, in place, over all TableLen slots.
void Kmer::KmerAdd(int *dest,int *tab1,int mul1)
{
	int slot=0;
	while (slot<TableLen)
	{
		dest[slot]=dest[slot]+tab1[slot]*mul1;
		++slot;
	}
}


