/* candf.c
 * Zizhen Yao
 *
 * CVS $Id: candf.c,v 3.1 2006/03/07 19:38:35 yzizhen Exp $.
 * 
 * Search motif candidates using secondary structure
 * folding algorithm
 */



#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>
#include <unistd.h>
#include <string.h>
#include "cand.h"
#include "energy_const.h"
#include "fold.h"
#include "part_func.h"
#include "fold_vars.h"
#include "utils.h"
#include "funcs.h"

/* Stability cutoff: CompCand() skips a span when the tabulated energy it
 * tests is greater than this value. */
#define MIN_ENERGY   0
/* Neighbourhood radius passed to isMaximal() as its `range` argument. */
#define LOCAL        3


/* Switches handed to Getopt() in main().  Each one takes a single argument.
 * NOTE(review): the middle TRUE field is presumably the "single-letter
 * option" flag of struct opt_s -- confirm against the Getopt() header. */
static struct opt_s OPTIONS[] = {
  { "-c", TRUE, sqdARG_INT    },   /* stored in MAXCAND                */
  { "-m", TRUE, sqdARG_INT    },   /* stored in MINSPAN                */
  { "-M", TRUE, sqdARG_INT    },   /* stored in MAXSPAN                */
  { "-s", TRUE, sqdARG_INT    },   /* minimum hairpin count            */
  { "-S", TRUE, sqdARG_INT    },   /* maximum hairpin count            */
  { "-o", TRUE, sqdARG_STRING },   /* output file for the candidates   */
};


/* Number of entries in OPTIONS. */
#define NOPTIONS (sizeof(OPTIONS) / sizeof(OPTIONS[0]))

/* Usage text printed by Getopt()/Die() when the command line is rejected. */
char usage[] =
  "usage:\n"
  "candf [-c max_cand] [-o output_file] [-m min_span] [-M max_span] [-s min_hairpin] [-S max_hairpin]<seqfile>\n";

/* substr
 *
 * Copy the inclusive character range seq[start..end] into sub and
 * NUL-terminate the result.
 *
 *   sub    destination holding at least (end - start + 2) bytes, or NULL
 *          to have a buffer of that size obtained from space()
 *   seq    source string
 *   start  0-based index of the first character to copy
 *   end    0-based index of the last character to copy (inclusive)
 *
 * Returns sub, or the newly obtained buffer when sub was NULL.
 * An inverted range (end < start) yields an empty string.
 */
char* substr(char* sub, char* seq, int start, int end)
{
  int len = end - start + 1;

  /* A negative length would be converted to a huge size_t by strncpy()
   * and would index before the buffer on termination; treat it as empty. */
  if (len < 0) {
    len = 0;
  }
  if (sub == NULL) {
    sub = (char*) space( len + 1);
  }
  strncpy(sub, seq + start, (size_t) len);
  sub[len] = '\0';
  return sub;
}

/* isMaximal
 *
 * Decide whether cell (i,j) of a triangular energy table is a local optimum.
 * Cell (k,l) is stored at energy[tri_indx[l] + k].
 *
 * Two neighbourhoods around (i,j) are inspected:
 *   - outward: k from i down to max(i-range, 0), l from j up to
 *     min(j+range, total_len-1); a strictly lower energy there rejects (i,j).
 *   - inward:  k from i up to min(i+range, j)-1, l from j down to
 *     max(j-range, k)+1; a lower OR equal energy there rejects (i,j).
 *
 * Returns 1 when no neighbour rejects (i,j), 0 otherwise.
 */
int isMaximal(int i, int j, int* tri_indx, int* energy, int range, int total_len)
{
  const int self      = tri_indx[j] + i;
  const int row_floor = (i - range > 0) ? i - range : 0;
  const int col_ceil  = j + range;
  const int row_stop  = (i + range < j) ? i + range : j;
  int row, col, col_floor;

  /* Outward: spans that contain (i,j). Ties are tolerated. */
  for (row = i; row >= row_floor; row--) {
    for (col = j; col <= col_ceil && col < total_len; col++) {
      if (row == i && col == j)
        continue;
      if (energy[tri_indx[col] + row] < energy[self])
        return 0;
    }
  }

  /* Inward: spans contained in (i,j). Ties are NOT tolerated. */
  for (row = i; row < row_stop; row++) {
    col_floor = (j - range > row) ? j - range : row;
    for (col = j; col > col_floor; col--) {
      if (row == i && col == j)
        continue;
      if (energy[tri_indx[col] + row] <= energy[self])
        return 0;
    }
  }

  return 1;
}




/* CompCand
 *
 * Build the list of candidate regions for one sequence.
 *
 *   1. Fill the energy tables for seq (initialize_fold / comp_energy).
 *   2. Collect every span (i,j) with MINSPAN <= j-i+1 < MAXSPAN that passes
 *      the MIN_ENERGY stability test and is a local optimum according to
 *      isMaximal().  Positions are 1-based; cell (i,j) of a table lives at
 *      index tri_indx[j] + i.
 *   3. Order the spans with CompCandByEnergy and walk them in that order,
 *      skipping spans that nearly coincide with a higher-ranked span and
 *      spans whose structure has a hairpin count outside
 *      [min_hairpin, max_hairpin].  At most MAXCAND spans are kept.
 *   4. For each kept span, strip unpaired ('.') positions from both ends of
 *      its structure string and store the structure and matching
 *      subsequence in the candidate.
 *
 *   seq          sequence to scan (passed through PrepareSequence first)
 *   min_hairpin  smallest accepted hairpin count
 *   max_hairpin  largest accepted hairpin count
 *   ret_cand     receives a malloc'ed array of MAXCAND Cand slots; the
 *                caller releases it with free()
 *   ret_ncand    receives the number of slots actually filled
 *
 * Exits the program if the result array cannot be allocated.
 */
void CompCand(char* seq, int min_hairpin, int max_hairpin, Cand** ret_cand, int*  ret_ncand)
{
  int    i, j, length, idx, diag;
  int*   tri_indx;
  Cand*  cand;
  int    ncand;
  Cand** sort_cand;
  int    cand_idx;
  char*  structure=NULL;
  char   backtrack_type;
  int    hairpin_num = 0;
  char*  sp;
  char*  tail;

  PrepareSequence(seq);
  length = (int) strlen(seq);
  structure = (char *) space((unsigned) length+1);
  tri_indx = (int*) space( sizeof(int) * (length+1));
  for(j=1; j <= length; j++)
    tri_indx[j] = j * (j-1)>>1;

  initialize_fold(length);
  comp_energy(seq);
  /* Scratch array with one slot per (i,j) pair, i < j. */
  cand = (Cand*) space(sizeof(Cand) * length * (length - 1) / 2);
  memset(cand, 0, sizeof(Cand) * length * (length - 1) / 2);

  /* Walk the table along anti-diagonals: diag == i + j. */
  ncand = 0;
  for( diag = 2; diag <= 2 * length - 1; diag++) {
    for(j = diag / 2 + 1; j < MIN(length, diag); j++) {
      i = diag - j;
      if ( j - i + 1 <  MINSPAN)
        continue;
      if ( j - i +1 >= MAXSPAN)
        continue;
      idx = tri_indx[j]  + i;
      /* Not stable */
      if ( (bp_energy[idx] > MIN_ENERGY && min_hairpin == 1) ||
           (max_hairpin > 1 && ml_energy[idx] > MIN_ENERGY) )
        continue;

      if( max_hairpin ==1  || bp_energy[idx] <= ml_energy[idx]) {   /* i, j are basepaired */
        /* not maximal stack */
        if (!isMaximal(i, j, tri_indx, bp_energy, LOCAL, length))
          continue;

        if ( j > i + 2  && bp_energy[tri_indx[j-1] + i+1] >= INF)   /* stack of only 1 bp */
          continue;

        cand[ncand].start = i;
        cand[ncand].stop = j;
        cand[ncand++].energy =  bp_energy[idx];
      }
      else if (max_hairpin > 1 ){
        /* not maximal stack */
        if (!isMaximal(i, j, tri_indx, ml_energy, LOCAL, length)){
          continue;
        }
        cand[ncand].start = i;
        cand[ncand].stop = j;
        cand[ncand++].energy =  ml_energy[idx];
      }
    }
  }

  cand_idx = 0;
  *ret_cand = malloc(sizeof(Cand) * MAXCAND);
  if (*ret_cand == NULL && MAXCAND > 0) {
    fprintf(stderr, "CompCand: out of memory allocating the candidate array\n");
    exit(EXIT_FAILURE);
  }
  sort_cand = SortCand(cand, ncand, CompCandByEnergy);
  for(i=0; i<ncand && cand_idx < MAXCAND; i++) {
    int contained = 0;
    int l1 = sort_cand[i]->stop - sort_cand[i]->start + 1;

    /* Compared against every higher-ranked span, whether or not it was kept. */
    for(j=0; j < i; j++) {
      int l2 = sort_cand[j]->stop - sort_cand[j]->start + 1;
      int diff = MAX(abs(sort_cand[i]->stop - sort_cand[j]->stop), abs(sort_cand[i]->start -  sort_cand[j]->start));
      if ( diff <= 0.15 * MIN(l1, l2)  && diff <= 10){
        contained = 1;
        break;
      }
    }

    if (contained) continue;  /* Too much overlap with an existing candidate */
    sort_cand[i]->cand_id = cand_idx;
    /* Choose the backtrack mode from the table that supplied the energy. */
    if (sort_cand[i]->energy == bp_energy[ tri_indx[  sort_cand[i]->stop ] + sort_cand[i]->start ])
      backtrack_type = 'C';
    else
      backtrack_type = 'M';

    get_structure( sort_cand[i]->start, sort_cand[i]->stop, seq, structure, backtrack_type);
    hairpin_num = countHairpin( structure);
    if (hairpin_num < min_hairpin || hairpin_num > max_hairpin)
      continue;

    /* remove the dangling bases for multiloop */
    /* Find the last non-'.' position without walking off the front of the
     * buffer, and without touching the candidate until it is accepted. */
    sp = structure + sort_cand[i]->stop - sort_cand[i]->start;
    tail = sp;
    while(tail > structure && *tail == '.') {
      tail--;
    }
    if (*tail == '.')
      continue;               /* structure has no paired position at all */
    sort_cand[i]->stop -= (int) (sp - tail);
    *(tail+1) = '\0';

    /* Leading dots: the scan stops at the paired position found above. */
    sp = structure;
    while(*sp =='.') {
      sp++;
      sort_cand[i]->start++;
    }

    strcpy( sort_cand[i]->ss, sp);
    /* start/stop are 1-based, substr() takes 0-based inclusive indices. */
    substr(sort_cand[i]->seq, seq, sort_cand[i]->start - 1, sort_cand[i]->stop - 1);

    memcpy( &(*ret_cand)[cand_idx],sort_cand[i], sizeof(Cand));
    cand_idx++;
  }

  *ret_ncand = cand_idx;
  /* NOTE(review): assumes SortCand() returns a heap-allocated pointer array
   * owned by the caller -- confirm in its definition. */
  free(sort_cand);
  free(structure);
  free(tri_indx);
  free_arrays();
  free(cand);

}


/* main
 *
 * candf entry point: parse the command line, read the sequences from a FASTA
 * file, compute the candidates of every sequence with CompCand() and write
 * them all with Write2DCand().
 *
 * Options (see `usage`): -c MAXCAND, -o output file, -m MINSPAN, -M MAXSPAN,
 * -s minimum hairpin count, -S maximum hairpin count.  The sequence file is
 * the last command-line argument.
 *
 * Returns 0 on success; fatal problems are reported through Die().
 */
int main(int argc, char* argv[])
{
  char*   seqfile=NULL;
  int     nseq;
  char**  rseqs;
  SQINFO* sqinfo;
  int     format;

  Cand**  cand;                  /* cand[i]: candidates of sequence i       */
  int*    ncand;                 /* ncand[i]: number of entries in cand[i]  */
  char*    cand_file=NULL;
  int     i,j;

  int     min_hairpin = 1;
  int     max_hairpin = 1;

  char  *optname;                /* name of option found by Getopt()        */
  char  *optarg;                 /* argument found by Getopt()              */
  int    optind;                 /* index in argv[]                         */

  /*Parse command line */

#ifdef MEMDEBUG
  unsigned long histid1, histid2, orig_size, current_size;
#endif

  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
                &optind, &optname, &optarg))  {

    if      (strcmp(optname, "-c") == 0)    MAXCAND       = atoi(optarg);
    else if (strcmp(optname, "-o") == 0)    cand_file     = optarg;
    else if (strcmp(optname, "-m") == 0)    MINSPAN       = atoi(optarg);
    else if (strcmp(optname, "-M") == 0)    MAXSPAN       = atoi(optarg);
    else if (strcmp(optname, "-s") == 0)    min_hairpin   = atoi(optarg);
    else if (strcmp(optname, "-S") == 0)    max_hairpin   = atoi(optarg);
    else{
      Die("Invalid Option!\n %s", usage);
    }
  }

  if (max_hairpin < min_hairpin) {
    Die("Max hairpin should be bigger than Min hairpin \n %s\n", usage);
  }

  if (argc - optind < 1)
    Die("%s\n", usage);

  seqfile = argv[argc - 1];

  /* RNAfold parameter */
  do_backtrack = 1;
  noLonelyPairs=1;


  /* Read sequence file */
  format = SQFILE_FASTA;

  if(! ReadMultipleRseqs(seqfile, format, &rseqs, &sqinfo, &nseq))
    Die("Failed to read squences from file %s", seqfile);


  cand = malloc(sizeof(Cand*) * nseq);
  ncand = malloc(sizeof(int) * nseq);
  /* malloc(0) may legitimately return NULL, so only complain when nseq > 0. */
  if (nseq > 0 && (cand == NULL || ncand == NULL))
    Die("Out of memory while allocating candidate tables for %d sequences", nseq);

  for(i=0; i < nseq;  i++) {
    CompCand(rseqs[i], min_hairpin, max_hairpin, &cand[i], &ncand[i] );
    /* Tag each candidate with the index of the sequence it came from. */
    for(j =0; j < ncand[i]; j++)
      cand[i][j].seq_id = i;
  }

  Write2DCand(cand_file, nseq, cand, ncand);

  for(i = 0; i < nseq; i++){
    free(cand[i]);
  }
  free(cand);
  free(ncand);
  /* Exit status 0 signals success to the calling shell or script. */
  return 0;

}

  
