/*
 * thermodynamic.c
 *
 *  Created on: 06.02.2011
 *      Author: Stefan Seemann
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include "thermodynamic.h"
#include "utils.h"
#include "fold_vars.h"
#include "fold.h"
#include "part_func.h"
#include "petfoldlibs.h"

/*
 * Extracts base pair probabilities from the thermodynamic partition function
 * of RNAfold.
 *
 * Every sequence of the alignment is folded without its gaps by pf_fold(),
 * using the partial structure 'evocon' as folding constraint. The resulting
 * pair probabilities pr[] are added into a matrix indexed by alignment
 * columns (via the origid mapping from gap-free position to alignment
 * column). Finally each row d is divided by the number of sequences that
 * have no gap in column d, i.e. gaps take the average of the probabilities
 * in the column.
 *
 * align_gf  alignment whose sequences are folded
 * evocon    partial structure turned into a constraint string per sequence
 *
 * Returns a malloc'ed (align_gf->len x align_gf->len) matrix that is not
 * freed here. Only entries [d][k] with d < k are filled; all other entries
 * are .0. If setthermo_flag is not set the whole matrix is .0.
 *
 * Side effects: sets the folding globals dangles, fold_constrained and
 * noLonelyPairs, and pf_scale for sequences longer than SCALELENGTH.
 */
double **get_prob_paired_seq(Aln *align_gf, PartStruc *evocon)
{
	double **paired;
	SeqList *seqlist_gf;
	char *evocon_db;
	char *mfe_struc;
	float e;
	double kT, sfact=1.07, min_en;
	int *gappos;
	int i, d, k;
	int nongap;

	static int SCALELENGTH = 200;

	/* list of sequences without gaps */
	seqlist_gf = get_align_without_gaps(align_gf);

	/* initialize base paired probability matrix of thermodynamic model */
	paired = (double **)malloc((align_gf->len) * sizeof(double *));
	for (i=0; i<align_gf->len; i++)
		paired[i] = (double *)malloc((align_gf->len) * sizeof(double));

	for (d=0; d<align_gf->len; d++)
		for (k=0; k<align_gf->len; k++)
			paired[d][k] = .0;

	/* thermodynamic model is switched on */
	if( setthermo_flag )
	{
		/* folding parameters */
		dangles = 2;
		kT = (temperature+273.15)*1.98717/1000.;  /* kT in kcal/mol */
		fold_constrained = 1;	/* the structure string is interpreted on 'pf_fold' input as a list of constraints for the folding */
		noLonelyPairs = 1; /* disallow all pairs which can only occur as lonely pairs */

		for (i=0; i<align_gf->nr; i++)
		{
			/* write partial structure as constraint string usable with 'RNAfold -C' */
			evocon_db = get_constraint_string(evocon, seqlist_gf->origid[i], seqlist_gf->len[i]);
			if (verbose_flag)
				printf(" Constrained structure = %s\n", evocon_db);

			/* for longer sequences one should also set a scaling factor for
			   partition function folding */
			if( seqlist_gf->len[i]>SCALELENGTH )
			{
				/* fold() returns the MFE structure in its structure argument;
				   run it on a scratch copy so that the constraint string
				   handed to pf_fold() below is still the original constraint */
				mfe_struc = (char *)malloc(strlen(evocon_db) + 1);
				if( mfe_struc != NULL )
				{
					strcpy(mfe_struc, evocon_db);
					min_en = fold(seqlist_gf->sequence[i], mfe_struc);
					pf_scale = exp(-(sfact*min_en)/kT/seqlist_gf->len[i]);
					free(mfe_struc);
				}
			}
			/* NOTE(review): pf_scale is left at its previous value for
			   sequences up to SCALELENGTH - confirm this is intended */
			init_pf_fold(seqlist_gf->len[i]);

			/* calculate partition function and base pair probabilities by constrained folding */
			e = pf_fold(seqlist_gf->sequence[i], evocon_db);

			/* write probabilities in gap-including matrix;
			   d and k are 1-based positions in the gap-free sequence */
			for (d=1; d<=seqlist_gf->len[i]; d++)
				for(k=d+1; k<=seqlist_gf->len[i]; k++)
					paired[ seqlist_gf->origid[i][d-1] ][ seqlist_gf->origid[i][k-1] ] += pr[iindx[d]-k];

			if (verbose_flag)
				printf(" Sequence %2i structure = %s\n free energy of ensemble=%5.2f kcal/mol\n", i+1, evocon_db, e);

			free_pf_arrays();  /* free space allocated for pf_fold() */
			free(evocon_db);
		}

		/*
		 * arithmetic mean of base pair probabilities
		 */

		/* count for each column of alignment the number of gaps */
		gappos = (int *)malloc(align_gf->len * sizeof(int));
		for (d=0; d<align_gf->len; d++)
		{
			gappos[d] = 0;
			for (k=0; k<align_gf->nr; k++)
				if( align_gf->sequence[k][d] == '-')
					gappos[d]++;
		}

		/* divide the sum of base pair probabilities by the number of gap-free sequences */
		for (d=0; d<align_gf->len; d++)
		{
			nongap = align_gf->nr - gappos[d];

			/* a column consisting only of gaps received no probability;
			   keep its row at .0 instead of computing 0/0 */
			if( nongap <= 0 )
				continue;

			for (k=d+1; k<align_gf->len; k++)
				paired[d][k] /= nongap;
		}

		free(gappos);
	}

	FreeSeqList(seqlist_gf);

	return paired;
}


/*
 * Derives the single stranded probability of every alignment column from
 * the base pair probability matrix:
 * Prob_unpaired(i) = 1 - SUM_j{Prob_paired(i,j)}
 *
 * Only the entries of 'paired_seq' with row index < column index are read.
 * Returns a malloc'ed array of align_gf->len values; every value is .0 if
 * setthermo_flag is not set.
 */
double *get_prob_unpaired_seq(Aln *align_gf, double **paired_seq)
{
	double *unpaired;
	double total;
	int col, partner;

	/* single stranded probability array of thermodynamic model, zeroed */
	unpaired = (double *)malloc((align_gf->len) * sizeof(double));
	for (col=0; col<align_gf->len; col++)
		unpaired[col] = .0;

	/* thermodynamic model is switched off: nothing more to compute */
	if( !setthermo_flag )
		return unpaired;

	for (col=0; col<align_gf->len; col++)
	{
		total = 0;

		/* pairs with a partner to the right of the column */
		partner = col+1;
		while (partner < align_gf->len)
		{
			total += paired_seq[col][partner];
			partner++;
		}

		/* pairs with a partner to the left of the column */
		partner = 0;
		while (partner < col)
		{
			total += paired_seq[partner][col];
			partner++;
		}

		unpaired[col] = 1 - total;
	}

	return unpaired;
}


/*
 * Extracts gap-free sequences from an alignment
 * and keeps the index relation between original and gap free sequences.
 *
 * For each sequence i of the alignment the returned list holds
 *   sequence[i]  the NUL-terminated sequence with all '-' removed
 *   origid[i][k] the alignment column of the k-th gap-free character
 *   len[i]       the number of gap-free characters
 * and nr is the number of sequences of the alignment.
 *
 * All members are allocated with malloc and are not freed here.
 */
SeqList *get_align_without_gaps(Aln *align_gf)
{
	SeqList *seqlist_gf;
	int i, j, k;

	/* Initialize SeqList; buffers are sized for a sequence without any gap */
	seqlist_gf = malloc(sizeof *seqlist_gf);
	seqlist_gf->nr = align_gf->nr;
	seqlist_gf->sequence = malloc(align_gf->nr * sizeof *seqlist_gf->sequence);
	seqlist_gf->origid = malloc(align_gf->nr * sizeof *seqlist_gf->origid);
	seqlist_gf->len = malloc(align_gf->nr * sizeof *seqlist_gf->len);
	for (i=0; i<align_gf->nr; i++)
	{
		seqlist_gf->sequence[i] = malloc((align_gf->len+1) * sizeof *seqlist_gf->sequence[i]);
		seqlist_gf->origid[i] = malloc(align_gf->len * sizeof *seqlist_gf->origid[i]);
	}

	for (i=0; i<align_gf->nr; i++)
	{
		/* k is the next free position in the gap-free sequence */
		k = 0;
		for (j=0; j<align_gf->len; j++)
		{
			if( align_gf->sequence[i][j] != '-' )
			{
				seqlist_gf->origid[i][k] = j;
				seqlist_gf->sequence[i][k] = align_gf->sequence[i][j];
				k++;
			}
		}

		/* terminate once after the scan so that the string is valid
		   even if the alignment has no columns */
		seqlist_gf->sequence[i][k] = '\0';
		seqlist_gf->len[i] = k;
	}

	return seqlist_gf;
}
