package EST2ncRNA::BlastInterface;

=head1 NAME

C<EST2ncRNA::BlastInterface> - Automates the execution of BLAST processes

=head1 SYNOPSIS

    use EST2ncRNA::BlastInterface;

    $server = eval { new EST2ncRNA::ServerInterface($SERVER, $WORKDIR, $QUEUE, $NODES, $WALLT); } or die ($@);
    $server->prepare_dir_tree;
    $blast = eval { new EST2ncRNA::BlastInterface($workdir, $server, $uid); } or die ($@);
    $blast->formatdb("PATH/TO/FORMATDB_DIRECTORY");
    $blast->blast("PATH/TO/BLASTALL_DIRECTORY");

    $blast->run_standard_blastn(["PATH/query.fa.gz"], \%chr);
    $blast->run_advanced_blastn($advquery_ref, \%chr, $advancedblastn);

=cut

use strict;
use warnings;

# Exporter is loaded so that the package can inherit from it via @ISA below.
require Exporter;

our @ISA = qw(Exporter);  # inherits from Exporter

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# Nothing is exportable on request: the module is used through its methods.
our @EXPORT_OK = qw(
		    		);

# Nothing is exported by default either.
our @EXPORT = qw();

# Module version.
our $VERSION = '0.01';


=head1 METHODS

=head2 B<new>

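Creates a new EST2ncRNA::BlastInterface class instance
    Input  - (1) working directory; the subdirectory C<est2ncrna.blast> is created in it if it does not exist yet
             (2) C<server instance>
             (3) id that is inserted into the names of all generated files and jobs
    Output - EST2ncRNA::BlastInterface class instance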

=cut

# Constructor.
#   $workdir - directory in which the working subdirectory
#              "est2ncrna.blast" is located (created here if missing)
#   $server  - server-interface object used by the run_* methods
#   $uid     - id interpolated into generated file, database and job names
# Returns the blessed instance.
# Dies if the working subdirectory neither exists nor can be created, so
# that a broken setup is reported here instead of failing later on.
sub new {
    my ($class, $workdir, $server, $uid) = @_;

    my $self = bless {
        _subdir   => $workdir."/est2ncrna.blast",
        _server   => $server,
        _uid      => $uid,
        _advblast => 0,        # file copied to the server for the vector filter
        _formatdb => undef,    # directory of the formatdb binary on the server
        _blast    => undef,    # directory of the blastall binary on the server
        _evalue   => 1e-20,
        _length   => 100,
        _identity => 75,
        _seqnr    => 10000,
    }, $class;

    # create subdir if not already exists; the second -d test tolerates a
    # directory that appeared between the first test and the mkdir call
    my $subdir = $self->{_subdir};
    unless ( -d $subdir ) {
        mkdir $subdir
            or -d $subdir
            or die "Can not create the directory '$subdir': $!\n";
    }

    return $self;
}


=head2 B<subdir>

Accessor method for BlastInterface subdir

=cut

# Read-only accessor: returns the working subdirectory stored in _subdir.
sub subdir {
    my $self = shift;
    return $self->{_subdir};
}


=head2 B<server>

Accessor method for BlastInterface server-interface

=cut

# Read-only accessor: returns the server-interface object stored in _server.
sub server {
    my $self = shift;
    return $self->{_server};
}


=head2 B<uid>

Accessor method for BlastInterface uid

=cut

# Read-only accessor: returns the id stored in _uid.
sub uid {
    my $self = shift;
    return $self->{_uid};
}


# Getter/setter for _advblast, the file run_advanced_blastn copies to the
# server when UniVec_Core is not formatted there yet.
# A defined argument replaces the stored value; the current value is returned.
sub advblast {
    my $self = shift;
    if ( @_ && defined $_[0] ) {
        $self->{_advblast} = $_[0];
    }
    return $self->{_advblast};
}


# Getter/setter for _formatdb, the directory prefix of the formatdb binary
# in the commands sent to the server ("$FORMATDB/formatdb").
# A defined argument replaces the stored value; the current value is returned.
sub formatdb {
    my $self = shift;
    if ( @_ && defined $_[0] ) {
        $self->{_formatdb} = $_[0];
    }
    return $self->{_formatdb};
}


# Getter/setter for _blast, the directory prefix of the blastall binary in
# the generated job scripts ("$BLAST/blastall").
# A defined argument replaces the stored value; the current value is returned.
sub blast {
    my $self = shift;
    if ( @_ && defined $_[0] ) {
        $self->{_blast} = $_[0];
    }
    return $self->{_blast};
}


# Getter/setter for _evalue, the value the run_* methods interpolate into
# the blastall command lines (constructor default 1e-20).
# A defined argument replaces the stored value; the current value is returned.
sub evalue {
    my $self = shift;
    if ( @_ && defined $_[0] ) {
        $self->{_evalue} = $_[0];
    }
    return $self->{_evalue};
}


# Getter/setter for _length (constructor default 100).
# A defined argument replaces the stored value; the current value is returned.
# NOTE: the name shadows the builtin, so call it as a method ($obj->length).
sub length {
    my $self = shift;
    if ( @_ && defined $_[0] ) {
        $self->{_length} = $_[0];
    }
    return $self->{_length};
}


# Getter/setter for _identity (constructor default 75).
# A defined argument replaces the stored value; the current value is returned.
sub identity {
    my $self = shift;
    if ( @_ && defined $_[0] ) {
        $self->{_identity} = $_[0];
    }
    return $self->{_identity};
}


# Getter/setter for _seqnr, the maximal number of sequences in query fasta
# files (constructor default 10000).
# A defined argument replaces the stored value; the current value is returned.
sub seqnr {
    my $self = shift;
    if ( @_ && defined $_[0] ) {
        $self->{_seqnr} = $_[0];
    }
    return $self->{_seqnr};
}


=head2 B<run_standard_blastn>

Runs blastn using standard parameter (e-value<1e-20) and creates a result file C<query.blast.gz> in $self->subdir
    Input  - (1) reference to an array of query files (gzipped fasta files)
             (2) reference to a hash with pairs of chromosome to gzipped fasta file of genome data of the subject organism
             (3) name of blast output file (gzipped file, optional)
    Output - name of the blast output file
    B<REQUISITE:> the tools C<formatdb> and C<blastall> (their directories are set with the methods C<formatdb> and C<blast>) and C<qsub> on $self->server

=cut

# Runs blastn with the standard parameters for every combination of
# chromosome and query file and concatenates all results into one file.
#
#   $rquery - reference to an array of query file paths (gzipped fasta)
#   $refchr - reference to a hash: chromosome name => gzipped fasta file
#   $output - name of the result file inside $self->subdir
#             (optional, default "query.blast.gz")
#
# For every chromosome the subject file is copied to the server and turned
# into a BLAST database with formatdb. The blastall calls are collected into
# PBS shell scripts holding at most $self->server->nodes jobs each; every
# script is copied to the server and submitted with qsub under the job name
# "blastn<uid>". After $self->server->wait returns, the result files are
# fetched, concatenated and the fetched single files are removed.
#
# Returns $output (the file name without directory).
# Dies if a PBS script can not be written.
sub run_standard_blastn {
    my ($self, $rquery, $refchr, $output) = @_;

    my $SERVER = $self->server->name;
    my $SERVER_HOME = $self->server->home;
    my $SUBDIR = $self->subdir;
    my $NODES = $self->server->nodes;
    my $QUEUE = $self->server->queue;
    my $WALLT = $self->server->wallt;
    my $BLAST = $self->blast;
    my $FORMATDB = $self->formatdb;
    my $UID = $self->uid;
    my $SCRATCH = "/scratch/est2ncrna$UID";

    $output = "query.blast.gz" if !defined $output;
    my $evalue = $self->evalue;

    my $chr_count = keys %$refchr;
    my $query_count = @$rquery;
    my $job_total = $chr_count * $query_count;

    my $index = 0;    # running number of (chromosome, query) jobs
    my $batch = 0;    # running number of generated PBS scripts
    my $node = 0;     # number of jobs collected for the current script

    # Files already listed in the cp command of the current script. Exact
    # name lookup is used because a substring search would skip a file whose
    # name is contained in the name of a file that is already listed.
    my (%query_in_batch, %db_in_batch);
    my ($querycmd, $SUBJECTDBcmd, $blastcmd, $mvcmd) = ("", "", "", "");
    my (@blastfile, @blastout);

    foreach my $chr ( sort keys %$refchr ) {

        # create file names
        my $SUBJECTFILE = $$refchr{$chr};
        my $SUBJECTDB = "stdblast$UID.".$chr;

        # secure copy of Subject genome data to the running machine
        $self->server->scpcall($SUBJECTFILE, "$SERVER:$SERVER_HOME");

        # use formatdb to format a nucleotid source database as blast input;
        # only the file name is needed on the server
        $SUBJECTFILE =~ s/.*\/(\w+.fa.gz)$/$1/;
        $self->server->sshcall("cd $SERVER_HOME; zcat $SUBJECTFILE | $FORMATDB/formatdb -i stdin -p F -o T -s T -n $SUBJECTDB -v 100000000");

        foreach my $queryfile ( @$rquery ) {
            $node++;
            $index++;

            # secure copy of query fasta file to the running machine
            $self->server->scpcall($queryfile, "$SERVER:$SERVER_HOME");

            my $BLASTFILE = "stdblastquery$UID.".$chr.".".$index.".blast.gz";
            push @blastfile, $BLASTFILE;

            # file name of the query without directory and without the
            # trailing ".gz" (the job script gunzips it in the scratch dir)
            my $query = ( split "\/", $queryfile )[-1];
            $query =~ s/\.gz$//;

            $querycmd .= qq{$query.gz } unless $query_in_batch{$query}++;
            $SUBJECTDBcmd .= qq{$SUBJECTDB* } unless $db_in_batch{$SUBJECTDB}++;
            $blastcmd .= qq{$BLAST/blastall -p blastn -d $SUBJECTDB -i $SCRATCH/$query -F F -e $evalue | gzip > $SCRATCH/$BLASTFILE &\n};
            $mvcmd .= qq{mv $SCRATCH/$BLASTFILE $SERVER_HOME &\n};

            # submit when the script is full or the last job was collected
            if( $node == $NODES || $index == $job_total ) {

                # create the shell script to start the blastn as pbs jobs
                my $script = "pbs_stdblastn$UID$batch.sh";
                open(my $out, '>', "$SUBDIR/$script")
                    or die "Can not open the file '$SUBDIR/$script': $!\n";
                print {$out} "#!/bin/sh\n";
                print {$out} "#PBS -l nodes=$NODES\n" if( defined $NODES && $NODES ne "" );
                print {$out} "#PBS -l walltime=$WALLT\n" if( defined $WALLT && $WALLT ne "" );
                print {$out} "#PBS -q $QUEUE\n" if( defined $QUEUE && $QUEUE ne "" );
                print {$out} "cd $SERVER_HOME\n";
                print {$out} "mkdir -p $SCRATCH\n";
                print {$out} "cp $querycmd $SUBJECTDBcmd $SCRATCH\n";
                print {$out} "cd $SCRATCH\n";
                print {$out} "gunzip -f $querycmd\n";
                # the blast runs are started in the background; "wait"
                # blocks until all of them have finished
                print {$out} $blastcmd . "wait\n";
                print {$out} $mvcmd . "wait\n";
                close($out)
                    or die "Can not write the file '$SUBDIR/$script': $!\n";

                # start the pbs jobs on the server
                print "Start standard BLAST search for $chr and query $query on $SERVER.\n";
                $self->server->scpcall("$SUBDIR/$script", "$SERVER:$SERVER_HOME");
                system(qq{ssh $SERVER 'chmod u+x $SERVER_HOME/$script; qsub -N blastn$UID $SERVER_HOME/$script'}) == 0
                    or warn "Submitting $script on $SERVER failed (exit status $?)\n";

                # start collecting the next script
                $node = 0;
                $querycmd = "";
                $blastcmd = "";
                $mvcmd = "";
                $SUBJECTDBcmd = "";
                %query_in_batch = ();
                %db_in_batch = ();
                $batch++;
            }
        }
    }

    # wait until all blastn processes of user are finished
    sleep 10;
    $self->server->wait("blastn$UID");

    # fetch results (blast output files) from the server in job order
    foreach my $BLASTFILE ( @blastfile ) {
        $self->server->scpcall("$SERVER:$SERVER_HOME/$BLASTFILE", $SUBDIR);
        push @blastout, $SUBDIR."/".$BLASTFILE;
    }

    # concatenate all blastn output; without any job /dev/null is used as
    # source because cat without arguments would read from STDIN
    my $sources = @blastout ? "@blastout" : "/dev/null";
    system("cat $sources > $SUBDIR/$output") == 0
        or warn "Concatenating the blast output into $SUBDIR/$output failed (exit status $?)\n";

    unlink @blastout;

    return $output;
}


=head2 B<run_advanced_blastn>

Runs blastn using advanced parameter (C<-r 1 -q -1 -G 1 -E 2 -W 9 -F "m D;V" -U -e 1e-20 -b 100 -v 1000>; recommended by oreilly blast p.137 for annotating genomic DNA with ESTs) and creates a result file C<query.advanced.blast.gz> in $self->subdir
    Input  - (1) reference to an array including for every chromosome in (2) a gzipped fasta file with all query sequences
             (2) reference to a hash with pairs of chromosome to location of closed organism chromosome file
             (3) name of generated output file (optional)
    Output - name of the generated output file

=cut

# Runs blastn with the advanced parameters
# (-r 1 -q -1 -G 1 -E 2 -W 9 -F "m D;V" -U -e <evalue> -b 100 -v 1000)
# with one query file per chromosome and concatenates all results.
#
#   $advquery - reference to an array of query file names relative to
#               $self->subdir, one per chromosome
#   $refchr   - reference to a hash: chromosome name => gzipped fasta file
#   $output   - name of the result file inside $self->subdir
#               (optional, default "query.advanced.blast.gz")
#
# NOTE(review): the n-th element of $advquery is paired with the n-th key
# returned by "keys %$refchr", i.e. in hash order - presumably the caller
# builds $advquery by iterating over the same hash; confirm against caller.
#
# Returns $output (the file name without directory).
# Dies if a PBS script can not be written.
sub run_advanced_blastn {
    my ($self, $advquery, $refchr, $output) = @_;

    my $SERVER = $self->server->name;
    my $SERVER_HOME = $self->server->home;
    my $SUBDIR = $self->subdir;
    my $NODES = $self->server->nodes;
    my $QUEUE = $self->server->queue;
    my $WALLT = $self->server->wallt;
    my $BLAST = $self->blast;
    my $FORMATDB = $self->formatdb;
    my $UID = $self->uid;
    my $SCRATCH = "/scratch/est2ncrna$UID";

    $output = "query.advanced.blast.gz" if !defined $output;
    my $evalue = $self->evalue;

    # The BLASTN vector filtering (-F "V") needs the UniVec_Core database
    # (ftp://ftp.ncbi.nih.gov/pub/UniVec/UniVec_Core). If its index is not
    # on the server yet, copy the file set with advblast() and format it.
    if( `ssh $SERVER "ls $SERVER_HOME/UniVec_Core.nsi" 2>&1`=~/^ls:/ ) {
        $self->server->scpcall($self->advblast, "$SERVER:$SERVER_HOME");
        $self->server->sshcall("cd $SERVER_HOME; $FORMATDB/formatdb -i UniVec_Core -p F -o T -s T -n UniVec_Core -v 100000000");
    }

    my $chr_count = keys %$refchr;
    my $index = 0;    # number of chromosomes handled so far
    my $node = 0;     # number of jobs collected for the current script
    my ($querycmd, $SUBJECTDBcmd, $gzipquerycmd, $blastcmd, $mvcmd) = ("", "", "", "", "");
    my (@blastfile, @blastout);

    foreach my $chr ( keys %$refchr ) {

        $node++;

        # create file names
        my $SUBJECTFILE = $$refchr{$chr};
        my $SUBJECTDB = "advblast$UID.".$chr;
        my $QUERYFILE = $$advquery[$index++];
        my $BLASTFILE = "advblastquery$UID.".$chr.".blast.gz";
        push @blastfile, $BLASTFILE;

        # secure copy of query fasta file and subject genome data to the running machine
        $self->server->scpcall("$SUBDIR/$QUERYFILE", "$SERVER:$SERVER_HOME");
        $self->server->scpcall("$SUBJECTFILE", "$SERVER:$SERVER_HOME");

        # use formatdb to format a nucleotid source database as blast input;
        # only the file name is needed on the server
        $SUBJECTFILE =~ s/.*\/(\w+.fa.gz)$/$1/;
        $self->server->sshcall("cd $SERVER_HOME; zcat $SUBJECTFILE | $FORMATDB/formatdb -i stdin -p F -o T -s T -n $SUBJECTDB -v 100000000");

        # file name of the query without directory and without the trailing
        # ".gz" (the job script gunzips it in the scratch directory)
        my $query = ( split "\/", $QUERYFILE )[-1];
        $query =~ s/\.gz$//;
        $querycmd .= qq{$query.gz };
        $SUBJECTDBcmd .= qq{$SUBJECTDB* };
        $gzipquerycmd .= qq{gunzip -f $query.gz\n};

        # "-E 2" is the gap extension cost, "-e" the expectation value
        $blastcmd .= qq{$BLAST/blastall -p blastn -d $SUBJECTDB -i $SCRATCH/$query -r 1 -q -1 -G 1 -E 2 -W 9 -F "m D;V" -U -e $evalue -b 100 -v 1000 | gzip > $SCRATCH/$BLASTFILE &\n};
        $mvcmd .= qq{mv $SCRATCH/$BLASTFILE $SERVER_HOME &\n};

        # submit when the script is full or the last chromosome was collected
        if( $node == $NODES || $index == $chr_count ) {

            # create the shell script to start the blastn as pbs jobs
            my $script = "pbs_advblastn$UID$index.sh";
            open(my $out, '>', "$SUBDIR/$script")
                or die "Can not open the file '$SUBDIR/$script': $!\n";
            print {$out} "#!/bin/sh\n";
            print {$out} "#PBS -l nodes=$NODES\n" if( defined $NODES && $NODES ne "" );
            print {$out} "#PBS -l walltime=$WALLT\n" if( defined $WALLT && $WALLT ne "" );
            print {$out} "#PBS -q $QUEUE\n" if( defined $QUEUE && $QUEUE ne "" );
            print {$out} "cd $SERVER_HOME\n";
            print {$out} "mkdir -p $SCRATCH\n";
            print {$out} "cp $querycmd $SUBJECTDBcmd UniVec_Core.* $SCRATCH\n";
            print {$out} "cd $SCRATCH\n";
            print {$out} $gzipquerycmd;
            # the blast runs are started in the background; "wait" blocks
            # until all of them have finished
            print {$out} $blastcmd . "wait\n";
            print {$out} $mvcmd . "wait\n";
            close($out)
                or die "Can not write the file '$SUBDIR/$script': $!\n";

            # start the pbs jobs on the server
            print "Start advanced BLAST search for $chr on $SERVER.\n";
            $self->server->scpcall("$SUBDIR/$script", "$SERVER:$SERVER_HOME");
            system(qq{ssh $SERVER 'chmod u+x $SERVER_HOME/$script; qsub -N blastn$UID $SERVER_HOME/$script'}) == 0
                or warn "Submitting $script on $SERVER failed (exit status $?)\n";

            # start collecting the next script
            $node = 0;
            $blastcmd = "";
            $mvcmd = "";
            $querycmd = "";
            $SUBJECTDBcmd = "";
            $gzipquerycmd = "";
        }
    }

    # wait until all blastn processes of user are finished
    sleep 10;
    $self->server->wait("blastn$UID");

    # fetch results (blast output files) from the server, one per chromosome
    foreach my $BLASTFILE ( @blastfile ) {
        $self->server->scpcall("$SERVER:$SERVER_HOME/$BLASTFILE", $SUBDIR);
        push @blastout, $SUBDIR."/".$BLASTFILE;
    }

    # concatenate all advanced blastn output; without any job /dev/null is
    # used as source because cat without arguments would read from STDIN
    my $sources = @blastout ? "@blastout" : "/dev/null";
    system("cat $sources > $SUBDIR/$output") == 0
        or warn "Concatenating the blast output into $SUBDIR/$output failed (exit status $?)\n";

    unlink @blastout;

    return $output;
}


=head2 B<filter_blast_output>

Takes a blast output file created by C<kvlblast2table>, filters hits through a specified identity and subject coverage, and writes hits together with their annotation in a hash. Only the first hit of every query that passes both filters is kept.
    Input  - (1) blast output file in table format (uncompressed, the first line is skipped as header)
             (2) subject database file (gzipped fasta file)
             (3) filter parameter identity of aligned query and subject subsequence (0..1)
             (4) filter parameter subject coverage ( alignment_length/subject_length ) (0..1)
    Output - reference to a hash with key=query_id and value='query_start:query_end:subject_id:evalue:identity:subject_coverage:subject_description'

=cut

# Filters the hits of a blast table by identity and subject coverage and
# annotates them with the description found in the subject database.
#
#   $blasttable  - blast table file (plain text); the first line is skipped
#   $db          - gzipped fasta file of the subjects, read through zcat
#   $identity    - minimal identity rate  (column 8 / column 7 of the table)
#   $subcoverage - minimal subject coverage (column 7 / column 4)
#
# Returns a reference to a hash: query id =>
#   'query_start:query_end:subject_id:evalue:identity:subject_coverage'
# followed by ':description' for every fasta header line that contains the
# subject id. Only the first hit of a query passing both filters is kept.
#
# Dies if the blast table can not be opened or zcat fails on $db.
sub filter_blast_output {
    my ($self, $blasttable, $db, $identity, $subcoverage) = @_;
    my (%hit, %sub);

    open(my $table, '<', $blasttable)
        or die "Can not open the file '$blasttable': $!\n";
    my $header = <$table>;    # column names, not a hit
    while( my $row = <$table> ) {
        my @par = split " ", $row;

        # skip rows for which the rates below can not be computed
        # (blank or truncated rows, zero alignment or subject length)
        next unless @par > 10;
        next unless $par[6] > 0 && $par[3] > 0;

        my $ident_rate = $par[7]/$par[6];
        my $coverage = $par[6]/$par[3];

        # identity
        next if $ident_rate < $identity;
        # subject coverage
        next if $coverage < $subcoverage;

        # NOTE: a filter on the rate of A or T (>50%) in the query
        # subsequence was planned at this point but is not implemented.

        # add to hash if not already key exists
        next if defined $hit{$par[0]};
        $hit{$par[0]} = join ":", @par[9, 10, 1, 8], $ident_rate, $coverage;
        $sub{$par[0]} = $par[1];
    }
    close $table;

    # list form of open: the file name is handed to zcat without a shell
    open(my $fasta, '-|', 'zcat', $db)
        or die "Can not open the file '$db': $!\n";
    while( my $line = <$fasta> ) {
        next unless $line =~ /^>/;

        # second whitespace separated word of the header line
        my $desc = ( split " ", $line )[1];
        $desc = "" unless defined $desc;

        foreach my $query ( keys %sub ) {
            next unless index($line, $sub{$query}) > -1;
            $hit{$query} .= ":".$desc;
        }
    }
    close($fasta)
        or die "Can not read the file '$db' (zcat exit status $?)\n";

    return \%hit;
}


=head2 B<get_blast_output_bed>

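Creates a BED-file from the C<kvlblast2table> output. A hit is written if its alignment length is larger than 100, its identity larger than 75 and its query range does not overlap with a hit of the same query read before.
    Input  - (1) the blasttable file (gzipped, the first line is skipped as header)
             (2) name of the output file
    Output - number of BED-file items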
=cut

# Writes the hits of a gzipped blast table as BED lines.
#
#   $blasttable - gzipped blast table, read through zcat; first line skipped
#   $outfile    - name of the BED file to write
#
# Table columns used (0-based): 0 QUERY_ID, 1 SUBJECT_ID, 6 ALIGN_LENGTH,
# 7 IDENTITY, 8 EVALUE, 9 QUERY_START, 10 QUERY_STOP, 11 SUBJECT_START,
# 12 SUBJECT_STOP.
#
# A hit is kept if ALIGN_LENGTH > 100, IDENTITY > 75 and its query range does
# not overlap with a hit of the same query that was kept before.
# Every BED line holds: chromosome, start (0-based), end, 'query|qstart|qstop',
# the constant score 1 and the strand. The lines are sorted by query id.
#
# Returns the number of BED lines written.
# Dies if zcat fails on $blasttable or $outfile can not be written.
sub get_blast_output_bed {
    my ($self, $blasttable, $outfile) = @_;
    my %hit;
    my $nr = 0;

    # read kvlblast2table results; list form of open: the file name is
    # handed to zcat without a shell
    open(my $in, '-|', 'zcat', $blasttable)
        or die "Can not open the file '$blasttable': $!\n";
    my $header = <$in>;    # column names, not a hit
    while( my $row = <$in> ) {
        my @par = split " ", $row;
        next if @par < 13;    # blank or truncated row

        my ($qstart, $qstop) = @par[9, 10];
        my ($sstart, $sstop) = @par[11, 12];

        # test type of strand; on "-" the subject coordinates are swapped
        # so that start <= end holds in the BED file
        my $strand = "+";
        if( $sstart > $sstop ) {
            $strand = "-";
            ($sstart, $sstop) = ($sstop, $sstart);
        }
        # index conversion from blastn-convention to over.chain-convention (UCSC) on strand "+"
        $sstart = $sstart - 1;

        # only hits with align_length>100nts and identity>75nts
        next unless $par[6] > 100 && $par[7] > 75;

        # Earlier rows win (presumably the table lists the best hit of a
        # query first - verify against kvlblast2table): skip the hit if its
        # start or end lies inside a kept hit or if it encloses a kept hit.
        my $overlaps = 0;
        foreach my $item ( @{ $hit{$par[0]} || [] } ) {
            my ($start, $stop) = @$item[0, 1];
            if(    ( $qstart >= $start && $qstart <= $stop )
                || ( $qstop  >= $start && $qstop  <= $stop )
                || ( $qstart <= $start && $qstop  >= $stop ) ) {
                $overlaps = 1;
                last;
            }
        }
        next if $overlaps;

        push @{ $hit{$par[0]} }, [$qstart, $qstop, $par[1], $sstart, $sstop, $strand];
    }
    close($in)
        or die "Can not read the file '$blasttable' (zcat exit status $?)\n";

    # write BED-file $outfile
    open(my $out, '>', $outfile)
        or die "Can not open the file '$outfile': $!\n";
    foreach my $key ( sort keys %hit ) {
        foreach my $item ( @{ $hit{$key} } ) {
            # chromosome: the part after the last '|' of the subject id,
            # the whole subject id if it does not have that form
            my $chrom = $$item[2] =~ /.*\|(\w+)$/ ? $1 : $$item[2];
            print {$out} "$chrom\t$$item[3]\t$$item[4]\t$key|$$item[0]|$$item[1]\t1\t$$item[5]\n";
            $nr++;
        }
    }
    close($out)
        or die "Can not write the file '$outfile': $!\n";

    return $nr;
}


1;

=head1 AUTHOR

Stefan Seemann, E<lt>seemann@bioinf.uni-leipzig.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Stefan Seemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.

=cut
