#!/usr/bin/env perl

use Getopt::Long qw(:config no_ignore_case); 
# Installation directories taken from the environment: $path is prefixed to
# the CMfinder helper programs, $blast_path to the BLAST binaries.
$path       = $ENV{CMfinder};
$blast_path = $ENV{BLAST};

# Default parameters; the command-line options parsed below override them.

$CAND      = 40;     # -c:  maximum number of candidates in each sequence
$MAXSPAN1  = 100;    # -M1: maximum length of single stem-loop candidates
$MINSPAN1  = 30;     # -m1: minimum length of single stem-loop candidates
$MAXSPAN2  = 100;    # -M2: maximum length of double stem-loop candidates
$MINSPAN2  = 40;     # -m2: minimum length of double stem-loop candidates
$FRACTION  = 0.8;    # -f:  fraction of sequences expected to contain the motif
$SINGLE    = 5;      # -s1: max number of single stem-loop motifs (0 skips the step)
$DOUBLE    = 5;      # -s2: max number of double stem-loop motifs (0 skips the step)
$verbose   = 0;      # -v
$help      = 0;      # -h
$BLAST     = 1;      # cleared by -b, or later if the BLAST binaries are missing
$COMBINE   = 1;      # -combine
$RANK      = 0;      # -rank
$RANK_W    = 0;      # -rank_w
$SELECT    = 0;      # -select
$FILTER    = 0;      # -filter
$FILTER_W  = 1;      # -filter_w
$FILTER_S  = 0;      # -filter_s
$FILTER_TH = 0.5;    # -filter_th

# Every destination must be a reference (or a callback). Passing a plain
# scalar such as $help by value leaves the flag permanently unset.
if (!GetOptions(
        "h"           => \$help,
        "v"           => \$verbose,
        # -b is advertised by print_help: turn the BLAST anchor search off.
        "b"           => sub { $BLAST = 0 },
        "c=i"         => \$CAND,
        "m1=i"        => \$MINSPAN1,
        "M1=i"        => \$MAXSPAN1,
        "m2=i"        => \$MINSPAN2,
        "M2=i"        => \$MAXSPAN2,
        "f=f"         => \$FRACTION,
        "s1=i"        => \$SINGLE,
        "s2=i"        => \$DOUBLE,
        "combine=i"   => \$COMBINE,
        "rank"        => \$RANK,
        "rank_w"      => \$RANK_W,
        "select"      => \$SELECT,
        "filter"      => \$FILTER,
        "filter_w"    => \$FILTER_W,
        "filter_s"    => \$FILTER_S,
        "filter_th=f" => \$FILTER_TH,
    )) {
    print STDERR "Invalid options\n";
    print_help();
    exit(1);
}

if ($help) {
    print_help();
    exit(0);
}

# Exactly one positional argument is consumed: the sequence file.
if (scalar @ARGV == 0) {
    print STDERR "No sequence file is specified\n";
    print_help();
    exit(1);
}

# Every later step runs a program under $path, so without it nothing can work;
# fail here with a clear message instead of launching "/candf" and friends.
if (!defined $path) {
    print STDERR "Environment variable CMfinder is not set\n";
    exit(1);
}

$SEQ = shift @ARGV;


# Echo the effective settings to STDOUT, one "NAME=value;" line per setting,
# in a fixed order.
for my $setting (
    [ CAND     => $CAND ],
    [ MAXSPAN1 => $MAXSPAN1 ],
    [ MINSPAN1 => $MINSPAN1 ],
    [ MAXSPAN2 => $MAXSPAN2 ],
    [ MINSPAN2 => $MINSPAN2 ],
    [ FRACTION => $FRACTION ],
    [ SINGLE   => $SINGLE ],
    [ DOUBLE   => $DOUBLE ],
    [ BLAST    => $BLAST ],
    [ COMBINE  => $COMBINE ],
    [ FILTER   => $FILTER ],
    [ RANK     => $RANK ],
    [ SELECT   => $SELECT ],
) {
    my ($label, $value) = @$setting;
    print "$label=$value;\n";
}


# Extra argument handed to cands; stays empty unless a match file is produced.
$match_option = "";

# BLAST is usable only when the directory is known and holds both binaries.
my $blast_available = defined $blast_path
    && -e "$blast_path/blastn"
    && -e "$blast_path/xdformat";

unless ($blast_available) {
    print STDERR "Can not find BLAST. Search without BLAST\n"; 
    $BLAST = 0;
}

if ($BLAST == 1) {
    # Build the blast database, search the sequences against themselves and
    # convert the report into the match file, in that order.
    my @blast_steps = (
        "$blast_path/xdformat -n $SEQ 2>/dev/null",
        "$blast_path/blastn $SEQ $SEQ -notes -top -W 8 -noseqs> $SEQ.blast ",
        "$path/parse_blast.pl $SEQ.blast > $SEQ.match",
    );
    system($_) for @blast_steps;
    $match_option = "-m $SEQ.match";
}

# Candidate search, once for single stem-loops (h1) and once for double
# stem-loops (h2). Each row: stem count, motif limit, max span, min span.
# A limit of 0 skips that pass.
for my $pass (
    [ 1, $SINGLE, $MAXSPAN1, $MINSPAN1 ],
    [ 2, $DOUBLE, $MAXSPAN2, $MINSPAN2 ],
) {
    my ($stems, $limit, $max_span, $min_span) = @$pass;
    next unless $limit;

    $cand = "$SEQ.cand.h$stems";
    # candf writes the candidates to $cand; cands is then run on that file.
    $cmd1 = "$path/candf -c $CAND -o $cand -M $max_span -m $min_span -s $stems -S $stems $SEQ";
    $cmd2 = "$path/cands -n $limit -f $FRACTION $match_option $SEQ $cand";

    print "$cmd1\n$cmd2\n" if $verbose;

    system($cmd1);
    system($cmd2);
}

# Derive a sibling file name from a candidate file name by replacing the
# first "cand" that follows the "$SEQ." prefix with $tag ("align", "motif",
# "cm"). Substituting on the whole path would hit the sequence file name
# itself whenever that name contains "cand" (e.g. "candidates.fa").
# Falls back to whole-string substitution if the prefix is not found.
sub _sibling_of_cand {
    my ($cand_file, $tag) = @_;
    my $prefix = "$SEQ.";
    if (index($cand_file, $prefix) == 0) {
        my $tail = substr($cand_file, length $prefix);
        $tail =~ s/cand/$tag/;
        return $prefix . $tail;
    }
    $cand_file =~ s/cand/$tag/;
    return $cand_file;
}

# For every candidate file: run canda to produce the alignment file, then
# cmfinder with that alignment, naming the motif (-o) and cm files after
# the candidate file.
@cands = glob("$SEQ.*cand.h*.*");
foreach $cand (@cands) {
    $align = _sibling_of_cand($cand, "align");
    $cmd   = "$path/canda $cand  $SEQ $align";
    if ($verbose) {
        print "$cmd\n";
    }
    system($cmd);

    $motif = _sibling_of_cand($cand, "motif");
    $cm    = _sibling_of_cand($cand, "cm");
    $cmd   = "$path/cmfinder -o $motif -a $align $SEQ $cm";
    if ($verbose) {
        print "$cmd\n";
    }
    system($cmd);
}

# Optional step (on by default): run CombMotif.pl on the sequence file and
# the "$SEQ.motif" name prefix.
if ($COMBINE) {
    print "Combine Motif\n";
    $cmd = join(" ", "perl", "$path/CombMotif.pl", $SEQ, "$SEQ.motif");
    print "$cmd\n" if $verbose;
    system($cmd);
}

# Optional step: run filter.pl on every motif file, in place (the same name is
# passed twice, as in the original command line).
if ($FILTER) {
    print "Filter Motif\n";

    # $FILTER_W defaults to 1 and its flag can only set it to 1, so testing it
    # alone made the -s branch unreachable and -filter_s a no-op. An explicit
    # -filter_s now selects score-based filtering; weight-based stays default.
    my $mode = ($FILTER_W && !$FILTER_S) ? "-w" : "-s";
    $cmd = "perl $path/filter.pl $mode -t $FILTER_TH ";

    @motifs = glob("$SEQ.motif.*");
    foreach $m (@motifs) {
        if ($verbose) {
            print "$cmd $m $m\n";
        }
        system("$cmd $m $m");
    }
}

# Optional ranking step. The motif glob is passed inside double quotes, so the
# shell hands the pattern itself to rank_cmfinder.pl; the output goes to
# "$SEQ.summary". -rank_w adds the -w switch.
if ($RANK) {
    print "Rank Motif\n";
    my $weight_flag = $RANK_W ? "-w " : "";    # original note on the unweighted form: "bigscan"
    $cmd = "perl $path/rank_cmfinder.pl ${weight_flag}\"$SEQ.motif.*\" $SEQ.summary";
    print "$cmd\n" if $verbose;
    system($cmd);
}

# Optional selection step: run select_cmfinder.pl on the summary file.
if ($SELECT) {
    $cmd = join(" ", "perl", "$path/select_cmfinder.pl", "$SEQ.summary", "select");
    print "$cmd\n" if $verbose;
    system($cmd);
}

# Remove intermediate files unless running verbosely. unlink/glob replaces the
# former shell "rm" calls: no extra processes, and no "rm" error output when a
# pattern matches nothing (e.g. when both candidate passes were disabled).
if (!$verbose) {
    unlink glob("$SEQ.*cand*");
    unlink glob("$SEQ.*align*");

    # BLAST by-products exist only when the BLAST search was actually run.
    if ($BLAST) {
        unlink "$SEQ.blast", "$SEQ.match";
        unlink glob("$SEQ.xn*");
    }
}

# Print the usage summary to STDERR. Takes no arguments and returns nothing;
# callers decide whether to exit afterwards.
# The -v switch is accepted by the option parser and is now listed here too.
sub print_help {
    print STDERR <<HELP;
CMFINDER [options] SEQ
Options:
    -b               Do not use BLAST search to locate anchors
    -c <number>      The maximum number of candidates in each sequence. Default 40. No bigger than 100.
    -m1 <number>     The minimum length of single stemloop candidates. Default 30
    -M1 <number>     The maximum length of single stemloop candidates. Default 100
    -m2 <number>     The minimum length of double stemloop candidates. Default 40
    -M2 <number>     The maximum length of double stemloop candidates. Default 100
    -f <number>      The fraction of the sequences expected to contain the motif. Default 0.80
    -s1 <number>     The max number of output single stem-loop motifs. Default 5
    -s2 <number>     The max number of output double stem-loop motifs. Default 5    
    -combine <0|1>   Combine the output motifs. Default 1
    -filter          Filter the output motifs. 
    -filter_s        Filtering based on scores. Only valid with -filter.
    -filter_w        Filtering based on weights. Only valid with -filter. 
    -filter_th <number>  Filtering threshold. 
    -rank            Rank the output motifs. 
    -rank_w          Ranking based on weights. Only valid with -rank. 
    -select          Select the output motifs. 
    -v               Verbose. Print the commands being run and keep intermediate files
    -h               Show this list
HELP
    return;
}
