/* Last changed Time-stamp: <2013-04-15 18:40:00 nikolai> */
/*
  For a given sequence x and substructure S, find the subsequence x[i..j]
  such that the probability of forming S is maximal. We compute the probability
  that x[i..j] forms S as Q(x[i..j]|S)/Q(x[i..j]), where Q(x[i..j]|S) can
  be computed via constrained folding. Since a single call to pf_fold(x)
  computes all Q(x[i..j]), only two foldings are needed.

                  c Ivo L Hofacker
                  Vienna RNA package

		  modified by Nikolai Hecker
		  (command parameters, sanity checks, ...)
*/


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>
#include <unistd.h>
#include <string.h>
#include <limits.h>
#include <float.h>
#include "fold.h"
#include "part_func.h"
#include "fold_vars.h"
#include "PS_dot.h"
#include "utils.h"
#include "read_epars.h"
#include "RNAcop_cmdl.h"

/* -------------------------- check_canonical ------------------------------
 * Tells whether two bases can form a canonical pair, i.e. one of
 * A-U, U-A, G-C, C-G, G-U or U-G.
 * Both characters must be upper case; anything else is rejected.
 *
 * returns 1 if (a, b) is a canonical pair
 * returns 0 otherwise
 */
static int check_canonical(char a, char b)
{
  /* dispatch on the first base, then test its admissible partners */
  switch (a) {
  case 'A':
    return b == 'U';
  case 'C':
    return b == 'G';
  case 'G':
    return b == 'C' || b == 'U';
  case 'U':
    return b == 'A' || b == 'G';
  default:
    return 0;
  }
}

/* -------------------------- check_standard_base ------------------------------
 * Tells whether a character is one of the four standard RNA bases
 * 'A', 'C', 'G' or 'U'. Only upper case letters are accepted.
 *
 * returns 1 if a is a standard base
 * returns 0 otherwise
 */
static int check_standard_base(char a)
{
  return a == 'A' || a == 'C' || a == 'G' || a == 'U';
}

/*--------------------------------------------------------------------------*/

extern double get_subseq_F(int i, int j);
/* -------------------------------- main -----------------------------------
 * Reads records (sequence followed by a structure constraint) until
 * read_record() reports an error or quit. For every record the sequence is
 * folded twice with the partition function, once unconstrained and once
 * with the constraint, and the window i..j with the largest
 *   ddG = F_unconstrained(i..j) - F_constrained(i..j)
 * is printed together with exp(ddG/kT).
 *
 * The window must contain the constrained motif and respect the requested
 * minimum/maximum flank sizes and the minimum length.
 *
 * Entries that cannot be processed (missing constraint, non-standard bases,
 * constraint without any constrained position) are reported via warn_user()
 * and skipped; all per-record buffers are released on every path.
 *
 * returns EXIT_SUCCESS; exits with status 1 if the command line is invalid
 */
int main(int argc, char *argv[]){
  struct        RNAcop_args_info args_info;
  char          *rec_sequence, *rec_id, **rec_rest, *structure, *cstruc;
  char          *ParamFile=NULL;
  char          *ns_bases=NULL, *c;
  char          errstr[160];          /* scratch buffer for warning messages */
  int           i, j, length, cl, sym, istty, noconv;
  unsigned int  rec_type, read_opt;
  double        energy, min_en, kT, sfact;
  int           circular;
  int           minleft, minright, maxleft, maxright;
  int           cur_minleft, cur_minright, cur_maxleft, cur_maxright, cur_minlen;
  int           max_len;

  char          *pf_struc;
  float         **Fsubseq;
  int           minI, maxI, bestI, bestJ;
  int           minlen;
  double        opt, ddG;
  unsigned int  coptions, full;
  short         *pt;
  int           minsep, skip;

  rec_type      = read_opt = 0;
  rec_id        = rec_sequence = structure = cstruc = NULL;
  rec_rest      = NULL;
  pf_struc      = NULL;
  Fsubseq       = NULL;
  do_backtrack  = 0;
  sfact         = 1.07;
  noconv        = 0;
  circular      = 0;
  cl            = length = 0;
  dangles       = 2;
  full          = 0;
  minleft       = 0;
  minright      = 0;
  maxleft       = INT_MAX;
  maxright      = INT_MAX;
  minlen        = 0;
  minsep        = 3;
  /*
  #############################################
  # check the command line parameters
  #############################################
  */
  if(RNAcop_cmdline_parser (argc, argv, &args_info) != 0) exit(1);
  /* temperature */
  if(args_info.temp_given)        temperature = args_info.temp_arg;
  /* noTetra option: switch tetra_loop off */
  if(args_info.noTetra_given)     tetra_loop=0;
  /* set dangle model */
  if(args_info.dangles_given)     dangles = args_info.dangles_arg;
  /* noLP option: sets noLonelyPairs */
  if(args_info.noLP_given)        noLonelyPairs = 1;
  /* do not allow wobble pairs (GU) */
  if(args_info.noGU_given)        noGU = 1;
  /* noClosingGU option: sets no_closingGU */
  if(args_info.noClosingGU_given) no_closingGU = 1;
  /* do not convert DNA nucleotide "T" to appropriate RNA "U" */
  if(args_info.noconv_given)      noconv = 1;
  /* set energy model */
  if(args_info.energyModel_given) energy_set = args_info.energyModel_arg;
  /* take another energy parameter set */
  if(args_info.paramFile_given)   ParamFile = strdup(args_info.paramFile_arg);
  /* Allow other pairs in addition to the usual AU,GC,and GU pairs */
  if(args_info.nsp_given)         ns_bases = strdup(args_info.nsp_arg);
  /* set pf scaling factor */
  if(args_info.pfScale_given)     sfact = args_info.pfScale_arg;
  /* assume RNA sequence to be circular */
  if(args_info.circ_given)        circular=1;

  /* flanks */
  if(args_info.minleft_given)
    minleft = args_info.minleft_arg;
  if(args_info.minright_given)
    minright = args_info.minright_arg;
  if(args_info.maxleft_given)
    maxleft = args_info.maxleft_arg;
  if(args_info.maxright_given)
    maxright = args_info.maxright_arg;

  /* minimum length */
  if(args_info.minlen_given)
    minlen = args_info.minlen_arg;

  /* print ddG of every admissible window, not only the best one */
  if(args_info.full_given)        full = 1;

  /* free allocated memory of command line data structure */
  RNAcop_cmdline_parser_free (&args_info);

  /*
  #############################################
  # begin initializing
  #############################################
  */
  if (ParamFile != NULL)
    read_parameter_file(ParamFile);

  if (circular && noLonelyPairs)
    warn_user("depending on the origin of the circular sequence, some structures may be missed when using -noLP\nTry rotating your sequence a few times");

  if (ns_bases != NULL) {
    /* NOTE(review): the index i is not checked against the 33 bytes
       requested here, and an odd number of characters makes the scan step
       past the terminator - confirm that the option parser restricts the
       argument accordingly */
    nonstandards = space(33);
    c=ns_bases;
    i=sym=0;
    if (*c=='-') {
      sym=1; c++;
    }
    while (*c!='\0') {
      if (*c!=',') {
        nonstandards[i++]=*c++;
        nonstandards[i++]=*c;
        if ((sym)&&(*c!=*(c-1))) {
          nonstandards[i++]=*c;
          nonstandards[i++]=*(c-1);
        }
      }
      c++;
    }
  }

  istty = isatty(fileno(stdout))&&isatty(fileno(stdin));
  fold_constrained=1;
  /* print user help if we get input from tty */
  if(istty){
    if(fold_constrained){
      print_tty_constraint_full();
      print_tty_input_seq_str("Input sequence (upper or lower case) followed by structure constraint");
    }
    else print_tty_input_seq();
  }

  /* set options we wanna pass to read_record */
  if(istty)             read_opt |= VRNA_INPUT_NOSKIP_BLANK_LINES;
  if(!fold_constrained) read_opt |= VRNA_INPUT_NO_REST;


  /* test values for flanking regions; negative minima would move the
     window borders into the motif and index outside the energy table */
  if(minleft < 0)
    {
      warn_user("Desired minimum for flanking region to the left is negative. Setting minimum to 0.");
      minleft = 0;
    }
  if(minright < 0)
    {
      warn_user("Desired minimum for flanking region to the right is negative. Setting minimum to 0.");
      minright = 0;
    }
  if(maxleft < minleft)
    {
      warn_user("Desired maximum for flanking region to the left smaller than desired minimum. Setting maximum to minimum.");
      maxleft = minleft;
    }
  if(maxright < minright)
    {
      warn_user("Desired maximum for flanking region to the right smaller than desired minimum. Setting maximum to minimum.");
      maxright = minright;
    }


  /*
  #############################################
  # main loop: continue until end of file
  #############################################
  */
  while (!((rec_type = read_record(&rec_id, &rec_sequence, &rec_rest, read_opt))
          & (VRNA_INPUT_ERROR | VRNA_INPUT_QUIT))) {

    /*
      ########################################################
      # init everything according to the data we've read
      ########################################################
    */
    pf_struc = NULL;
    Fsubseq  = NULL;
    cstruc   = NULL;
    bestI    = bestJ = 0;   /* 0 means "no window found" for this record */

    if(rec_id && !istty) printf("%s\n", rec_id);

    length    = (int)strlen(rec_sequence);
    structure = (char *)space(sizeof(char) *(length+1));

    /* parse the rest of the current dataset to obtain a structure constraint */
    coptions  = (rec_id) ? VRNA_CONSTRAINT_MULTILINE : 0;
    coptions |= VRNA_CONSTRAINT_ALL;
    getConstraint(&cstruc, (const char **)rec_rest, coptions);
    cl = (cstruc) ? (int)strlen(cstruc) : 0;

    if(cl == 0){
      /* nothing to search for; everything below needs a constraint */
      warn_user("structure constraint is missing. Skipped entry.");
      goto cleanup;
    }
    if(cl > length) nrerror("structure constraint is too long");

    if(cl < length){
      /* pad a short constraint with '.' so that the pair table, the motif
         scan and the final truncation all work on 'length' characters */
      char *padded = (char *) space((unsigned) length+1);
      memcpy(padded, cstruc, (size_t) cl);
      memset(padded+cl, '.', (size_t) (length-cl));
      padded[length] = '\0';
      free(cstruc);
      cstruc = padded;
    }
    memcpy(structure, cstruc, (size_t) length+1);

    /* convert DNA alphabet to RNA if not explicitely switched off */
    if(!noconv) str_DNA2RNA(rec_sequence);
    /* convert sequence to uppercase letters only */
    str_uppercase(rec_sequence);

    if(istty) printf("length = %d\n", length);

    /*
      ########################################################
      # sanity checks (done before any folding)
      ########################################################
    */
    skip = 0;
    for(i=0; i < length; i++)
      {
        if(!check_standard_base(rec_sequence[i]))
          {
            snprintf(errstr, sizeof errstr,
                     "Base %d (%c) is not canonical (A, C, G, U). Skipped entry.\n",
                     i+1, rec_sequence[i]);
            warn_user(errstr);
            skip = 1;
            break;
          }
      }

    if(skip)
      goto cleanup;

    /* remove non-canonical base pairs if present; check for min separation */
    pt = make_pair_table(cstruc);
    for(i=1; i<=length; i++)
      {
        /* visit every pair once, from its 5' partner */
        if(pt[i] > 0 && i < pt[i])
          {
            /* check for min separation */
            if( pt[i]-i-1 < minsep)
              {
                snprintf(errstr, sizeof errstr,
                         "Replaced base pair '%d-%d' (%c%c) by '.' since separation < %d.\n",
                         i, pt[i], rec_sequence[i-1], rec_sequence[pt[i]-1], minsep);
                warn_user(errstr);
                cstruc[i-1] = '.';
                cstruc[pt[i]-1] = '.';
              }
            else if(!check_canonical(rec_sequence[i-1], rec_sequence[pt[i]-1]))
              {
                snprintf(errstr, sizeof errstr,
                         "Replaced base pair '%d-%d' (%c%c) by '.' since not canonical.\n",
                         i, pt[i], rec_sequence[i-1], rec_sequence[pt[i]-1]);
                warn_user(errstr);
                cstruc[i-1] = '.';
                cstruc[pt[i]-1] = '.';
              }
          }
      }
    free(pt);

    /* locate constrained structure motif (0-based, inclusive) */
    for (minI=0; cstruc[minI] == '.'; minI++);
    if(minI >= length){
      /* only dots left: the backwards scan below would run off the front */
      warn_user("structure constraint contains no constrained positions. Skipped entry.");
      goto cleanup;
    }
    for (maxI=length-1; cstruc[maxI] == '.'; maxI--);
    printf("desired structure located %d..%d\n", minI+1, maxI+1);

    /* check/correct flanking region requirements etc */
    cur_minleft  = minleft;
    cur_minright = minright;
    cur_minlen   = minlen;

    /* clamp the maximum flanks to what the sequence offers; clamping to
       length first avoids arithmetic on INT_MAX */
    cur_maxleft  = maxleft > length ? length : maxleft;
    cur_maxright = maxright > length ? length : maxright;
    if(cur_maxright > length-maxI-1)
      cur_maxright = length-maxI-1;
    if(cur_maxleft > minI)
      cur_maxleft = minI;

    if(cur_minleft > minI)
      {
        cur_minleft = minI;
        warn_user("Desired minimum flanking region to the left too large. Using maximum allowed value.");
      }

    if(cur_minright > length-maxI-1)
      {
        cur_minright = length-maxI-1;
        warn_user("Desired minimum flanking region to the right too large. Using maximum allowed value.");
      }

    /* number of bases in the largest admissible window (both ends inclusive) */
    max_len = (maxI+cur_maxright+1) - (minI-cur_maxleft+1) + 1;
    if(max_len < cur_minlen)
      {
        warn_user("Desired minimum length larger than any possible sequence given maximum flanking boundaries. Using maximimum allowed value.");
        cur_minlen = max_len;
      }

    /*
      ########################################################
      # begin actual computations
      ########################################################
    */

    /* the minimum free energy is only needed for the pf scaling factor */
    fold_constrained=0;
    min_en = (circular) ? circfold(rec_sequence, structure) : fold(rec_sequence, structure);

    pf_struc = (char *) space((unsigned) length+1);
    Fsubseq = (float **) space((unsigned) sizeof(float *)*(length+1));
    for (i=1; i<=length; i++)
      Fsubseq[i] = (float *) space((unsigned) sizeof(float)*(length+1));

    kT = (temperature+273.15)*1.98717/1000.; /* in Kcal */
    pf_scale = exp(-(sfact*min_en)/kT/length);
    if (length>2000) fprintf(stderr, "scaling factor %f\n", pf_scale);

    memcpy(pf_struc, cstruc, (size_t) length+1);

    /* unconstrained fold: values go into the upper triangle Fsubseq[i][j] */
    energy = (circular) ? pf_circ_fold(rec_sequence, pf_struc) : pf_fold(rec_sequence, pf_struc);
    printf("unconstrained %6.2f kcal/mol, ",energy );
    for (i=1; i<=length; i++)
      for (j=i; j<=length; j++)
        Fsubseq[i][j] = get_subseq_F(i,j);

    /* constrained fold: values go into the lower triangle Fsubseq[j][i].
       NOTE(review): the diagonal i==j is shared by both passes and ends up
       holding the constrained value */
    fold_constrained =1;
    memcpy(pf_struc, cstruc, (size_t) length+1);

    energy = (circular) ? pf_circ_fold(rec_sequence, pf_struc) : pf_fold(rec_sequence, pf_struc);
    printf("constrained %6.2f kcal/mol\n",energy );

    for (i=1; i<=length; i++)
      for (j=i; j<=length; j++)
        Fsubseq[j][i] = get_subseq_F(i,j);

    free_pf_arrays();

    /* scan all windows i..j (1-based) that contain the motif */
    opt= -FLT_MAX;
    for (i=1; i<=minI-cur_minleft+1; i++)
      for (j=maxI+cur_minright+1; j<=length; j++) {
        if (j-i+1 < cur_minlen) continue;

        /* right flank too long */
        if(j-maxI-1 > cur_maxright)
          continue;

        /* left flank too long */
        if(minI-i+1 > cur_maxleft)
          continue;

        ddG = Fsubseq[i][j]-Fsubseq[j][i];

        if(full)
          printf("i:%2d j:%2d  ddG:%6.4f\n", i,j, ddG);

        if (ddG > opt) {
          opt = ddG;
          bestI=i; bestJ=j;
        }
      }

    if(bestI == 0){
      /* never print from rec_sequence-1 when no window qualified */
      warn_user("No subsequence satisfies the requested boundaries. Skipped entry.");
      goto cleanup;
    }

    printf("Best subsequence %d..%d, ddG and probability of substructure %6.2f  %6.4E\n",
           bestI, bestJ, opt, exp(opt/kT));
    /* cut both strings after bestJ and print them starting at bestI */
    rec_sequence[bestJ] = '\0';
    cstruc[bestJ] = '\0';
    printf("%s\n%s\n", rec_sequence+bestI-1, cstruc+bestI-1);

    (void) fflush(stdout);

  cleanup:
    /* release everything that belongs to the current record */
    if(Fsubseq){
      for (i=1; i<=length; i++)
        free(Fsubseq[i]);
      free(Fsubseq);
    }
    free(pf_struc);
    free(cstruc);
    free(rec_id);
    free(rec_sequence);
    free(structure);
    /* free the rest of current dataset */
    if(rec_rest){
      for(i=0;rec_rest[i];i++) free(rec_rest[i]);
      free(rec_rest);
    }
    rec_id = rec_sequence = structure = cstruc = pf_struc = NULL;
    Fsubseq = NULL;
    rec_rest = NULL;
    /* every record starts in constrained mode, also after a skipped entry */
    fold_constrained = 1;

    /* print user help for the next round if we get input from tty */
    if(istty){
      if(fold_constrained){
        print_tty_constraint_full();
        print_tty_input_seq_str("Input sequence (upper or lower case) followed by structure constraint");
      }
      else print_tty_input_seq();
    }
  }

  free(ParamFile);
  free(ns_bases);
  return EXIT_SUCCESS;
}
