/* cands.c
 * Zizhen Yao
 *
 * CVS $Id: cands.c,v 3.1 2006/03/07 19:38:35 yzizhen Exp $.
 * 
 * Pairwise tree-edit comparison between candidates. 
 * Select seed candidates to construct initial alignment
 */
 

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cand.h"
#include "match_constr.h"
#include "treedist.h"
#include "edit_cost.h"
#include "squid.h"
#include "utils.h"


/* Two candidates are not compared (their cost is set to INF) when their
 * lengths differ by more than this fraction of either candidate's length. */
#define MAX_LENGTH_DIFF  0.3
#define DIST_THRESH      1.5  /* A threshold to determine whether the distance between two candidates is too large. */
#define SIM_THRESH      0.3   /* A threshold to determine whether two candidates are very similar. */

/* getopt() option string: -n, -f and -m each take an argument. */
#define OPTIONS "n:f:m:"
/* Usage message printed when fewer than two positional arguments are given. */
char usage[] = "\
Usage:\n\
cands [-n Seed] [-f fraction] [-m match_constrain_file] <seqfile> <candfile> \n";

int nseed = 0;               /* Number of seeds accepted so far by SelectCand(). */
int nseq = 0;                /* Number of sequences, filled in by ReadMultipleRseqs() in main(). */
Cand**  seeds = NULL;        /* seeds[s]: the candidate accepted as seed s. */
Cand**  cand = NULL;         /* cand[i][j]: candidate j of sequence i, as returned by Read2DCand(). */
int*    ncand = NULL;        /* ncand[i]: number of candidates of sequence i. */
Cand*** chosen = NULL;       /* chosen[s][k]: candidate picked from sequence k for seed s, or NULL. */
int*    chosen_size = NULL;  /* chosen_size[s]: number of candidates picked for seed s. */
float*   edit_cost;          /* Pairwise candidate distances, addressed through my_index(). */


/* qsort() comparison callback that orders float values ascending.
 *
 * a, b: pointers to the two float elements being compared.
 * Returns -1 when *a is smaller, 1 when *a is larger, 0 otherwise.
 */
int compFloat(const void * a, const void* b)
{
  const float lhs = *(const float *)a;
  const float rhs = *(const float *)b;

  /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
  return (lhs > rhs) - (lhs < rhs);
}

/* Map a pair of candidates to a position in the flat edit_cost table.
 *
 * (i, j) and (k, l) are (sequence index, candidate index) pairs.  Each pair
 * is first flattened to seq * MAXCAND + cand; the pair belonging to the
 * lower sequence index becomes the "low" key and the other the "high" key,
 * so my_index(i, j, k, l) == my_index(k, l, i, j) whenever i != k.
 * When i == k both keys are taken from (k, l) and j is ignored.
 *
 * Returns high * (high + 1) / 2 + low.
 */
int my_index(int i, int j, int k, int l)
{
  int low, high;

  if (i == k) {
    low = high = k * MAXCAND + l;
  } else {
    const int first  = i * MAXCAND + j;
    const int second = k * MAXCAND + l;

    if (i < k) {
      low  = first;
      high = second;
    } else {
      low  = second;
      high = first;
    }
  }

  return high * (high + 1) / 2 + low;
}


/* choose
 *
 * Greedily pick at most one candidate per sequence around a seed.
 *
 * Starting from `seed`, each round adds to every candidate of every sequence
 * that has no pick yet the edit_cost between that candidate and the most
 * recently picked one (accumulated in cand[i][j].weight), then picks the
 * candidate with the smallest accumulated weight.  The search stops when
 *   - every sequence has a pick,
 *   - no candidate has an accumulated weight below INF, or
 *   - the best candidate's weight divided by the number of picks so far
 *     exceeds DIST_THRESH and more than nseq * fraction picks were made.
 *
 * seed:     starting candidate; stored in select[seed->seq_id].
 * select:   output array of nseq entries indexed by sequence id; only the
 *           entries of picked sequences are written.
 * fraction: fraction of nseq that must be picked before the distance
 *           threshold is allowed to stop the search.
 *
 * Side effect: resets and then overwrites the weight of every candidate in
 * the global cand table.
 *
 * Returns the number of candidates picked, including the seed.
 */
int choose(Cand* seed, Cand** select, double fraction)
{ 
  Cand* curr = 0;  
  Cand* best_match; 
  int   count;  
  int   i, j;  
  /* seq_flags[i] != 0 once sequence i has contributed a candidate. */
  short* seq_flags = space(sizeof(short) * nseq);
  memset(seq_flags, 0, sizeof(short) * nseq);
  for( i = 0; i < nseq; i++) 
    for(j = 0; j < ncand[i]; j++) 
      cand[i][j].weight = 0;  

  curr = seed;
  count = 1;
  select[curr->seq_id] = curr;  
  while(count < nseq) {
    double min_cost = INF;    
    best_match=NULL;
    seq_flags[ curr-> seq_id] = 1;
    for( i = 0; i < nseq; i++) {
      if (seq_flags[i] ) continue;
      for(j = 0; j < ncand[i]; j++) {
        /* Accumulate the distance to the candidate picked last round. */
        cand[i][j].weight += edit_cost[ my_index(i, j, curr->seq_id, curr->cand_id)];
        if (cand[i][j].weight < min_cost ){
          min_cost = cand[i][j].weight;
          best_match = &cand[i][j];
        }
      }
    }
    if (best_match==NULL) 
      break;
    curr = best_match;
    /* Average distance to the picks so far is too large, and enough
       sequences are already covered: stop without taking this one. */
    if (curr->weight / count > DIST_THRESH && count > nseq * fraction) 
      break;    
    select[curr->seq_id] = curr;    
    count++;
  } 

  /* The flag array is only needed during the search; release it so repeated
     calls do not leak. */
  free(seq_flags);
  return count;  
}



/* SelectCand
 *
 * Compare all candidates pairwise and select up to max_seed seed candidates,
 * each with a set of matching candidates from the other sequences.
 *
 * Steps:
 *   1. Build a tree for every candidate (ExpandFull + make_tree).
 *   2. For every pair of candidates from two different sequences, store a
 *      distance in edit_cost: INF when the lengths differ by more than
 *      MAX_LENGTH_DIFF of either length or when CheckMatch() rejects the
 *      pair, otherwise tree_edit_distance() divided by sqrt(len1 * len2).
 *      The smallest distance from each candidate to each other sequence is
 *      kept in best_match_cost.
 *   3. Score each candidate as the negated mean of its smallest per-sequence
 *      distances, taken over (nseq - 1) * fraction sequences.
 *   4. Sort all candidates with CompCandByScore and walk the sorted list,
 *      running choose() for each candidate that is not already used by, or
 *      too similar (< SIM_THRESH) to, a previous seed.  A trial is accepted
 *      when it picked at least 4 candidates and does not overlap a previous
 *      selection too much.
 *
 * fraction:     fraction of the sequences used for scoring; also passed on
 *               to choose().
 * max_seed:     maximum number of seeds to accept; the global arrays chosen
 *               and chosen_size must have at least this many entries.
 * match_constr: optional nseq x nseq table of match-constraint lists, or
 *               NULL.  When non-NULL it is freed by this function.
 *
 * Side effects: allocates the global seeds array, sets nseed, fills
 * chosen[0..nseed-1] and chosen_size[0..nseed-1], overwrites the score and
 * weight fields of the candidates, and prints every accepted seed to stdout.
 * edit_cost is allocated and released inside this function.
 */
void SelectCand(double fraction, int max_seed, MatchPtr** match_constr)
{
  Cand**   all_cand;  
  int      total_cand;
  float    cost;  
  float    dist;  
  float*** best_match_cost;
  int***   best_match;  
  int      sflag;  
  int      i, j, k, l, m, n;
  char     *struc;  
  Tree***   trees;
  MatchPtr  curr, prev;

  /* NOTE(review): the element size is sizeof(float*) although the table
     holds floats.  It is left unchanged on purpose: my_index() called with
     two candidates of the same sequence can return an index larger than the
     one used for sizing here, so shrinking the table may expose an
     out-of-bounds read -- confirm before changing. */
  edit_cost = (float*) space( sizeof(float* ) * ( my_index(nseq-2,  MAXCAND -1, nseq-1, MAXCAND -1 ) + 1));
  best_match_cost = (float***) space(sizeof(float**) * nseq);
  best_match = (int***) space(sizeof(int**) * nseq);
  trees = (Tree***) space(sizeof(Tree**) * nseq);
  seeds = (Cand**) space(sizeof(Cand*) * max_seed);  

  /* Per-candidate tables: best cost/partner for every other sequence, and
     the tree used by the tree-edit comparison. */
  for(i=0; i< nseq; i++) {
    best_match_cost[i] = (float**) space(sizeof(float*) * ncand[i]);    
    best_match[i] = (int**) space(sizeof(int*) * ncand[i]);    
    trees[i] = (Tree**) space(sizeof(Tree*) * ncand[i]);    
    for(j =0 ; j < ncand[i]; j++) {
      best_match_cost[i][j] = (float*) space(sizeof(float) * nseq);
      best_match[i][j] = (int*) space(sizeof(int) * nseq);      
      struc = ExpandFull(cand[i][j].ss, cand[i][j].seq);      
      trees[i][j] = make_tree(struc);      
      free(struc);
      
      for(k=0; k < nseq; k++) {
        best_match_cost[i][j][k] = INF;
        best_match[i][j][k] = -1;
      }
    }    
  }
      
  /* Pairwise comparison; each unordered pair of sequences is visited once
     (k > i) and both directions of the best-match tables are updated. */
  for( i=0; i < nseq; i++) { 
    for( j=0; j < ncand[i]; j++) {
      for(k = i+1; k < nseq; k++) {
        for( l = 0; l < ncand[k]; l++) {
          /* If length of the two candidates differ significantly, no need to compare them */
          int l1 = abs(cand[i][j].stop - cand[i][j].start)+1 ;
          int l2 = abs(cand[k][l].stop - cand[k][l].start)+1;
          int diff = abs(l1 - l2);
          if ( diff > MAX_LENGTH_DIFF * l1 || diff > MAX_LENGTH_DIFF  * l2) {
            edit_cost[my_index(i,j,k,l)] = INF;
            continue;
          }

          /* Check whether they violate anchor constraint */
          if(match_constr) {
            if(!CheckMatch(cand[i][j].start, cand[i][j].stop, cand[k][l].start, cand[k][l].stop, match_constr[i][k])){
              edit_cost[my_index(i,j,k,l)] = INF;
              continue;
            }
          }

          /* The distance of two candidates is the tree-edit distance normalized by length */
          dist = tree_edit_distance(trees[i][j], trees[k][l]);      
          dist /= sqrt(l1 * l2);

          edit_cost[ my_index(i, j, k, l) ] = dist;
          if (dist < best_match_cost[i][j][k]){
            best_match_cost[i][j][k] = dist;
            best_match[i][j][k] = l;
          }
          if (dist < best_match_cost[k][l][i]) {
            best_match_cost[k][l][i] = dist;
            best_match[k][l][i] = j;
          }
        }
      }
      
    }
  }  
  
  
  /* Score every candidate by the mean of its smallest per-sequence costs. */
  for(i = 0; i < nseq; i++) {  
    for(j = 0; j < ncand[i]; j++) {      
      qsort(best_match_cost[i][j], nseq, sizeof(float), compFloat);      
      cost = 0;
      /* The array holds nseq entries; the extra bound keeps a large
         fraction from reading past its end. */
      for(k = 0; k < nseq && k < (nseq -1 )* fraction; k++) {
        cost += best_match_cost[i][j][k];
      }      
      /* k == 0 happens when nseq <= 1 or fraction <= 0; avoid dividing by
         zero in that case. */
      cand[i][j].score = (k > 0) ? - cost / k : 0;            
    }
  }

  /* Flatten the candidate table into one array so it can be sorted. */
  for(i=0, total_cand = 0; i < nseq; i++) total_cand += ncand[i];  
  all_cand = (Cand **) malloc( sizeof(Cand *) * total_cand);  
  if (all_cand == NULL && total_cand > 0)
    Die("Memory allocation failed for %d candidates", total_cand);
  for(i=0, k=0; i < nseq; i++)
    for(j = 0; j < ncand[i]; j++) 
      all_cand[k++] = &cand[i][j];          
  
  qsort(all_cand, total_cand, sizeof(Cand*), CompCandByScore); 

  /* Choose seed candidate */
  for( m = 0, nseed = 0; nseed < max_seed  && m < total_cand; m++) {
    sflag = 0;    
    i = all_cand[m]->seq_id;
    j = all_cand[m]->cand_id;
    
    for (n=0; n < nseed; n++) {
      /* this candidate is exactly the same as a previous chosen candidates, 
         or similar to a previous seed */   
      if (chosen[n][i] == all_cand[m] || 
          (i != (seeds[n]->seq_id) && 
           (edit_cost[my_index(i, j, seeds[n]->seq_id, seeds[n]->cand_id)]  < SIM_THRESH))){
        sflag = 1;
        break;
      }      
    }    
    if (sflag) continue;
    
    /* Trial selection for this candidate; kept only if accepted below. */
    chosen[nseed] = (Cand**)space(sizeof(Cand*) * nseq);
    memset(chosen[nseed], 0, sizeof(Cand*) * nseq);
          
    chosen_size[nseed] = choose(all_cand[m], chosen[nseed], fraction);

    /* Compare the trial with every accepted selection; sflag counts the
       sequences whose pick is shared with, or overlaps, selection n. */
    for (n=0; n < nseed; n++) {    
      sflag = 0;      
      for(k=0; k<nseq; k++) {  
        if (chosen[n][k]== NULL ||chosen[nseed][k]== NULL ) continue;
        /* If this candidate appear in a previous alignment */
        if (chosen[n][k] ==  chosen[nseed][k]){
          /* this candidate match a previous seed better, remove it */
          if (edit_cost[my_index(chosen[n][k]->seq_id, chosen[n][k]->cand_id, seeds[n]->seq_id, seeds[n]->cand_id)] <=
              edit_cost[my_index(chosen[n][k]->seq_id, chosen[n][k]->cand_id, i,j)]) {
            chosen[nseed][k] = NULL;
            chosen_size[nseed] -- ;
          }
          sflag++;
        }
        /* If this candidate overlap significantly with a candiate in a previous alignment */
        else if (abs(chosen[n][k]->start - chosen[nseed][k]->start) + abs(chosen[n][k]->stop - chosen[nseed][k]->stop) <
                 0.2 * chosen[n][k]->len)
          sflag++;
      }
      /* Too much overlap with a previous alignment */
      /* NOTE(review): the limit is always derived from chosen_size[0], not
         from chosen_size[n] -- confirm this is intended. */
      if (sflag > 0.5 * chosen_size[0]){
        break;
      }            
    }
    
    
    if( chosen_size[nseed] >= 4 && (nseed == 0 ||  sflag <= 0.5 * chosen_size[0])){      
      seeds[nseed] = all_cand[m];      
      nseed++;    
      printf("Seq_%d_Cand%d_%d_%d  %f\n", i, j, cand[i][j].start, cand[i][j].stop, cand[i][j].score);
      printf("%s\n", cand[i][j].seq);
      printf("%s\n", cand[i][j].ss);    
    }    
    else {
      /* Rejected trial: release its selection array, otherwise it is
         overwritten by the next trial (or never freed) and leaks. */
      free(chosen[nseed]);
      chosen[nseed] = NULL;
    }
  }
  
  
  free(all_cand);  
  /* The constraint table is consumed here: free every list and the table. */
  if (match_constr) {
    for(i=0; i< nseq; i++) {    
      for(j=0; j < nseq; j++) {
        curr = match_constr[i][j];
        while(curr) {
          prev = curr;
          curr = curr->next;
          free(prev);
        }      
      }
      free(match_constr[i]);    
    }
    free(match_constr);  
  }
  
  for(i=0; i < nseq; i++) {
    
    for(j=0; j < ncand[i]; j++) {
      free(best_match_cost[i][j]);
      free(best_match[i][j]);      
      free_tree(trees[i][j]);      
    }
    free(best_match_cost[i]);
    free(best_match[i]);
    free(trees[i]);    
  }
  
  free(edit_cost);  
  edit_cost = NULL;   /* global: do not leave a dangling pointer behind */
  free(best_match_cost);
  free(best_match);  
  free(trees);  
}


/* main
 *
 * Usage: cands [-n Seed] [-f fraction] [-m match_constrain_file] <seqfile> <candfile>
 *
 * Reads the sequences (FASTA) and their candidates, optionally reads match
 * constraints, runs SelectCand(), and writes the selection of seed number s
 * (1-based) to "<candfile>.<s>" with the seed itself moved to the first
 * slot.
 *
 * Returns 0 on success.  Usage and read errors terminate through Die().
 */
int main(int argc, char* argv[])
{
  int    max_seed = 1;  
  char*  seqfile; 
  char**  rseqs;
  SQINFO* sqinfo;
  int     format; 
  char*   matchfile=NULL;
  MatchPtr** match_constr = NULL;
  char*  candfile;  
  char*  outfile;
  size_t outfile_size;
  int   i;  
  int   max_cand;  
  double blast_threshold = 0.001;  

  int          optc;
  float        fraction = 1;  
  extern char *optarg;          /* for getopt() */
  extern int   optind;          /* for getopt() */  

  while ((optc = getopt(argc, argv, OPTIONS)) != -1)
    switch (optc) {
    case 'm': matchfile     = optarg;      break;      
    case 'n': max_seed         = atoi(optarg);       break;
    case 'f': fraction      = atof(optarg);       break;
      
    }
  if (argc - optind < 2)
    Die("%s\n", usage);
  /* The seed arrays below are sized by max_seed, so it must be positive. */
  if (max_seed < 1)
    Die("Number of seeds (-n) must be a positive integer\n%s\n", usage);
    
  /* Use the argv strings directly: copying them into fixed-size buffers
     overflowed on long paths. */
  seqfile  = argv[argc-2];  
  candfile = argv[argc-1];     
  
   /* Read Sequence file */
  format = SQFILE_FASTA;   
  
  if(! ReadMultipleRseqs(seqfile, format, &rseqs, &sqinfo, &nseq))
    Die("Failed to read squences from file %s", seqfile);


  /* Read match constraint */
  if (matchfile) {    
    match_constr = ReadMatchConstr(matchfile, nseq, sqinfo,blast_threshold);
  }
  
  /* Read Candidates */
  cand = Read2DCand(candfile,  nseq,  &ncand, &max_cand);

  /* Shrink the per-sequence candidate bound used by my_index(). */
  if (max_cand < MAXCAND)
    MAXCAND = max_cand;
  
  chosen = (Cand***) malloc( sizeof(Cand**) * max_seed );  
  chosen_size = (int*) malloc( sizeof(int) * max_seed );  
  if (chosen == NULL || chosen_size == NULL)
    Die("Memory allocation failed for %d seeds", max_seed);

  /* SelectCand() also frees match_constr. */
  SelectCand(fraction, max_seed, match_constr);

  /* Output name is "<candfile>.<n>"; 32 extra bytes cover the dot, any
     decimal int and the terminating NUL. */
  outfile_size = strlen(candfile) + 32;
  outfile = (char*) malloc(outfile_size);
  if (outfile == NULL)
    Die("Memory allocation failed for output file name");

  for(i = 0; i < nseed; i++) {
    snprintf(outfile, outfile_size, "%s.%d", candfile, i+1);    
    /* Swap the seed into slot 0 so it is written first. */
    chosen[i][seeds[i]->seq_id] = chosen[i][0];
    chosen[i][0] = seeds[i];    
    Write1DCand(outfile, chosen[i], nseq);    
    free(chosen[i]);        
  }
  
  free(outfile);
  free(chosen);  
  free(seeds);  
  free(chosen_size);
  for(i=0; i <nseq; i++) {    
    free(cand[i]);
  }
  
  free(cand);  
  free(ncand);  
  /* Exit status 0 signals success to the calling shell or script. */
  return 0;
}
