package EST2ncRNA::RavennaInterface;

use strict;
use warnings;

use EST2ncRNA::ServerInterface;
use EST2ncRNA::MysqlInterface;

require Exporter;

our @ISA = qw(Exporter);  # inherits from Exporter

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

our @EXPORT_OK = qw(
		    );

our @EXPORT = qw();

our $VERSION = '0.01';


# constructor
sub new {
    my $class = shift @_;
    my ($workdir,$ravennahome, $rfammodel, $rfamseed, $server) = @_;

    my $self = {
	_subdir       => $workdir."/est2ncrna.ravenna",
	_home         => $ravennahome,
	_rfammodel    => $rfammodel,
	_rfamseed     => $rfamseed,
	_length       => 50,
	_server       => $server
    };
    bless $self, $class;

    # create subdir if not already exists
    mkdir $self->{_subdir} unless -d $self->{_subdir};

    return $self;
}


# accessor method for RavennaInterface subdir
sub subdir {
    my($self) = @_;
    return $self->{_subdir};
}


# accessor method for RavennaInterface ravennahome
sub home {
    my($self) = @_;
    return $self->{_home};
}


# accessor method for RavennaInterface rfammodel
sub rfammodel {
    my($self) = @_;
    return $self->{_rfammodel};
}


# accessor method for RavennaInterface rfamseed
sub rfamseed {
    my($self) = @_;
    return $self->{_rfamseed};
}


# accessor method for RavennaInterface length (minimal length of known ncRNAs)
sub length {
    my($self, $length) = @_;
    $self->{_length} = $length if defined($length);
    return $self->{_length};
}


# accessor method for RavennaInterface server-interface
sub server {
    my($self) = @_;
    return $self->{_server};
}


# catch in the Rfam.seed file the RaveNnA input parameters and the modelled basepair rate of the asked families
#
# input are
#   (1) location of the annotated seed alignments (Rfam.seed also from 'http://www.sanger.ac.uk/Software/Rfam/ftp.shtml')
#   (2) list of all Rfam model names should be examined (f.e. 'RF00001') or
#       'ALL' for all Rfam models
#
# returns references to
#   (1) the parameter array and
#   (2) the modelled basepair rate array

sub catch_rfam_seed {
    my $self = shift;
    my $rfamseed = shift;
    my @families = @_;
    my ($family, $line, $status, @sscons, $sscons, @window);
    my (@params, @bp);

    open IN, $rfamseed || die("Can not open the file!\n");

    # search for all families
    if( $families[0] eq "ALL" ) {
        while ( <IN> ) {
            if( /^\#=GF AC\s*(RF\d{5})/ ) {
                $line = "$1\t";
            }
            elsif( /^\#=GF GA\s*(\d*.\d*)/ ) {
                $line = $line."$1\t";
            }
            elsif( /^\#=GF BM\s*cmsearch --local -W (\d*)/ ) {
                $line = $line."$1\tlocal";
                push @params, $line;
		push @window, $1;
            }
            elsif( /^\#=GF BM\s*cmsearch -W (\d*)/ ) {
                $line = $line."$1\tglobal";
                push @params, $line;
		push @window, $1;
            }
            elsif( /^\#=GC SS_cons\s*([\<\.\>]*)/ ) {
                $sscons = $sscons."$1";
            }
            elsif( /^\/\// ) {
                push @sscons, $sscons;
                $sscons = "";
            }
        }
    }
    # search for asked families
    else {
        foreach $family ( @families ) {
            $status = 0;
            while ( <IN> ) {
                if( /^\#=GF AC\s*(RF\d{5})/ ) {
                    if( $family eq $1 ) {
                        $status = 1;
                        $line = "$1\t";
                    }
                }
                elsif( $status==1 ) {
                    if( /^\#=GF GA\s*(\d*.\d*)/ ) {
                        $line = $line."$1\t";
                    }
                    elsif( /^\#=GF BM\s*cmsearch --local -W (\d*)/ ) {
                        $line = $line."$1\tlocal";
                        push @params, $line;
			push @window, $1;
                    }
                    elsif( /^\#=GF BM\s*cmsearch -W (\d*)/ ) {
                        $line = $line."$1\tglobal";
                        push @params, $line;
			push @window, $1;
                    }
                    elsif( /^\#=GC SS_cons\s*([\<\.\>]*)/ ) {
                        $sscons = $sscons."$1";
                    }
                    elsif( /^\/\// ) {
                        push @sscons, $sscons;
                        $sscons = "";
                        last;
                    }
                }
            }
        }
    }

    close IN;

    # count the number of basepairs in the consensus secondary structures
    foreach( @sscons ) {
        push @bp, tr/<//;
    }

    return \@params, \@bp, \@window;
}


# runs the software RaveNnA: assigning sequences to covariance models of known ncRNAs using a statistical context-free grammar
#
# input are
#   (1) location of a fasta file with all sequences should be examined
#   (2) location of the Rfam models (Rfam.tar.gz from 'http://www.sanger.ac.uk/Software/Rfam/ftp.shtml')
#   (3) reference to the parameter array of scanning families
#
#	Rfam: annotating non-coding RNAs in complete genomes
#	Sam Griffiths-Jones, Simon Moxon, Mhairi Marshall, Ajay Khanna, 
#	Sean R. Eddy and Alex Bateman
#	Nucleic Acids Res. 2005 33:D121-D124
#
#	Rfam: an RNA family database.  
#	Sam Griffiths-Jones, Alex Bateman, Mhairi Marshall, Ajay Khanna 
#	and Sean R. Eddy.  
#	Nucleic Acids Res. 2003 31:439-441

sub run_ravenna {
    my $self = shift;
    my $fastafile = shift;
    my $RFAMMODELS = shift;
    my $params = shift;

    my $ravennahome = $self->home;
    my $subdir = $self->subdir;
    my $server = $self->server->name;
    my $serverhome = $self->server->home;
    my $userid = $self->server->userid;
    my $nodes = $self->server->nodes;
    my $wallt = $self->server->wallt;
    my $queue = $self->server->queue;
    my $PBSUBMIT = $self->server->pbsubmit;
    my $SCRATCH = $self->server->scratch;

    my ($cmd, $hours, $min, $index, @qstat, @par, @proc, $proc, @actproc, @splice, $status);

    # publish the db in the RaveNnA config file
    open OUT, ">$subdir/ravenna.config.tab" || die("Can not open the file!\n");
    print OUT "cmzashaExe\t$ravennahome/src/release/cmzasha\nInfernalBinDir\t$ravennahome/NotByZasha/infernal-0.7/src\nRfamDir\t$ravennahome/data\nRsearchMatrixDir\t$ravennahome/NotByZasha/rsearch-1.1-zasha/matrices\nPerlDir\t$ravennahome/src\ndb\tESTs\t$ravennahome/partFASTAs/partition.gz.list\t$ravennahome/data/EmblIdAndOrganismCompact_Barrick3.tab\t$ravennahome/data/default.heurcreationspec\n";
    close OUT;

    # create the heuristic HMM training file
    open OUT, ">$subdir/default.heurcreationspec" || die("Can not open the file!\n");
    print OUT "genomes\t$ravennahome/data/E_coli_NC_000913.fasta\t1\t$ravennahome/data/Bordetella_bronchiseptica.fasta\t1\t$ravennahome/data/S_aureus.fasta\t1\n";
    close OUT;

    # secure copy of all input data to ravenna home directory
    system(qq{scp $subdir/db.fa.gz $subdir/ravenna.config.tab $subdir/default.heurcreationspec $RFAMMODELS $server:$ravennahome/data/});

    # uncompress the Rfam models
    system(qq{ssh $server 'cd $ravennahome/data; gunzip -c Rfam.tar.gz | tar xvf - >>$serverhome/stdout_ravenna.log; rm Rfam.tar.gz'}) && die("Can't create covariance models!\n");

    # create a list of fasta files (named 'partition.gz.list') as db-input for RaveNnA whereas each file includes maximal 5.000.000 nucleotids
    system(qq{ssh $server "mkdir -p $ravennahome/partFASTAs; rm $ravennahome/partFASTAs/*; $ravennahome/src/release/cmzasha --partition-fasta $ravennahome/data/db.fa.gz $ravennahome/partFASTAs 5000000 >>$serverhome/stdout_ravenna.log; gzip $ravennahome/partFASTAs/*.fasta"}) && die("Can't create the RaveNna database!\n");


    # run RaveNnA for each covariance model in $params (known ncRNA-families)
    #my $dummy = 1;
    print "Start RaveNnA on $server.\n";

    # loop until the data is tested against all covariance models
    while( @$params > 0 ) {

        foreach( @$params ) {

            @par = split " ", $_;
            $cmd = qq{perl $ravennahome/src/ravenna.pl -configFile $ravennahome/data/ravenna.config.tab -scoreThreshold $par[1] -database ESTs -$par[3] -cmFileName $ravennahome/data/$par[0].cm -workDir $SCRATCH $par[2]};
            print "$cmd\n";

            # create the shell script 'pbs_ravenna.sh' to start the pbs job for one Rfam family
            open OUT, ">$subdir/pbs_ravenna.sh" || die("Can not open the file!\n");
            print OUT "#!/bin/tcsh\n set path = ( $PBSUBMIT \$path )\n";
	    print OUT "cd $ravennahome\npbsubmit.pl -o \"-N ravenna $nodes $wallt\" $queue -D -Q -B $ravennahome -c \"cp $ravennahome/src/ParallelBlock.pm $ravennahome/src/RavennaConfigFile.pm .; $cmd; cp $SCRATCH/*.cmzasha* $serverhome/ravenna; rm -f ParallelBlock.pm RavennaConfigFile.pm\"\n";
            close OUT;

            # start the pbs job on the server
            system(qq{scp $subdir/pbs_ravenna.sh $server:$serverhome/ravenna/});
            system(qq{ssh $server 'chmod u+x $serverhome/pbs_ravenna.sh; $serverhome/pbs_ravenna.sh'}) && die("RaveNnA finished with an error!\n");

            sleep 2;

            # collect the started ravenna process id in an array
            @qstat = $self->server->sshcall("qstat -u$userid");
            foreach( reverse @qstat ) {
                if( /ravenna/ ) {
                    push @proc, /(\d+)./;
                    last;
                };
            }

            #last unless --$dummy;
        }

        # wait until all started RaveNnA processes are finished
        while(1) {
            @qstat = $self->server->sshcall("qstat -u$userid");

            @actproc = ();
            @splice = ();
            foreach( reverse @qstat ) {
                if( /ravenna/ ) {
                    push @actproc, /(\d+)./;
                    last;
                };
            }

            $index=0;
            $status=0;
            foreach $proc ( @proc ) {
                foreach ( @actproc ) {
                    if( $_==$proc ) {
                        $status=1;
                        last;
                    }
                }
                push @splice, $index if $status==0;
                $index++;
                $status=0;
            }
            foreach( reverse @splice ) {
                splice @proc, $_, 1;
            }

            last if $#proc==-1;

            sleep 60;
        }

        # fetch results (cmzasha.* files) from the server
        system(qq{scp $server:$serverhome/ravenna/*.cmzasha* $subdir});

        # test if all covariance models are used because some can be lost through the walltime
        foreach( <$self-\>subdir/*.cmzasha.csv> ) {
            last if @$params==0;
            $index = 0;
            $_ =~ /(RF\d{5})/;
            while(1) {
                if( $params->[$index] =~ /$1/ ) {
                    splice @$params, $index, 1;
                    last;
                }
                else {
                    $index++;
                    last if $index==@$params;
                }
            }
        }
        # double the walltime
        if( @$params > 0 ) {
            ($hours,$min) = $self->server->wallt =~ /walltime=(\d+):(\d+)/;
            $min=2*($hours*60+$min);
            $hours=int($min/60);
            $min=$min % 60;
            $wallt =~ s/-l walltime=\d+:\d+/-l walltime=$hours:$min/;
            $index = @$params;
            print "$index left covariance models will be tested again with a increased walltime of \'$wallt\'.\n";
        }

    }

    # clean the server
    #system(qq{ssh $server "rm -rf $serverhome"}) && die("Can't clean the server from RaveNnA!\n");
}


sub filter_ravenna_hits {
	my $self = shift;
    	my $subdir = shift;
	my $refbp = shift;
    	my $length = shift;
	my $mysql = shift;

	my ($index, $cvsfiles, $family, $status, @par, $subseq, $sshit, $line, $id, $bp, $modbpr, @knownncrna);

	# analyse the results
    	$index = 0;
    	while( $cvsfiles = <$subdir/*.cmzasha.csv> ) {
        	( $family ) = $cvsfiles =~ /(RF\d{5})/;

        	open IN, "$cvsfiles" || die("Can not open the file!\n");
        	while( <IN> ) {
            	  if( $_ =~ /^Params:/ ) {
                	$status = "HIT";
                	@par = split ",", $_;

 	               # test the plausibility of the ravenna hits

                	# check if the modeled subsequence has gaps inside (bug of the assembly software of Dr. Mike Gilchrist)
                	# ravenna hits including gap regions causing modelling mistakes hence they should be ignored
                	$subseq = $mysql->get_assest_subsequence($par[6],$par[9],$par[10]);
                	$status = "GAPS" if $subseq=~/-/;

                	# check if the modeled subsequence are longer as $LENGTH nucleotides
                	$status = "SMALL" if( $status eq "HIT" && abs($par[10]-$par[9])<=0.6*$$length[$index] );

                	# check if the number of basepairs lies in a range of 20% around the basepair number of the model
                	if( $status eq "HIT" ) {

                    		# catch the dot-bracket notation of the predicted secondary structure from the .cmzasha-file
                    		$cvsfiles =~ /^(.*).csv$/;
                    		open IN2, "$1" || die("Can not open the file!\n");
                    		$sshit=0;
                    		foreach $line ( <IN2> ) {
                        		( $id ) = $line =~ /^----sequence:\s\#\d+,(\d+)/;
                        		$sshit=1 if defined $id && $id==$par[6];
                        		if( $sshit && $line=~/^----ssRNAplot:/ ) {
                            			( $subseq ) = $line =~ /^----ssRNAplot:\s+([\(\.\)]+)/;
                            			last;
                        		}
				}
				close IN2;

                    		#$bprate = get_basepair_rate($subseq);
                    		$bp = $subseq =~ tr/(//;
                    		$modbpr = $$refbp[$index];
                    		$status = "BP" if( $bp<=$modbpr-0.2*$modbpr || $bp>=$modbpr+0.2*$modbpr );
                	}

			# add every hits to the covariance models in the 'knownncrna'-array
			push @knownncrna, [$par[6], $par[9], $par[10], $par[11], $family, $bp, $status];
		  }
		}
		close IN;
		$index++;
	}

	return \@knownncrna;
}

1;

__END__
# Below is stub documentation for your module. You'd better edit it!

=head1 NAME

EST2ncRNA::RavennaInterface - Perl extension for blah blah blah

=head1 SYNOPSIS

  use EST2ncRNA::RavennaInterface;
  blah blah blah

=head1 DESCRIPTION

Stub documentation for EST2ncRNA::RavennaInterface, created by h2xs. It looks like the
author of the extension was negligent enough to leave the stub
unedited.

Blah blah blah.

=head2 EXPORT

None by default.



=head1 SEE ALSO

Mention other useful documentation such as the documentation of
related modules or operating system documentation (such as man pages
in UNIX), or any relevant external documentation such as RFCs or
standards.

If you have a mailing list set up for your module, mention it here.

If you have a web site set up for your module, mention it here.

=head1 AUTHOR

Stefan Seemann, E<lt>seemann@bioinf.uni-leipzig.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Stefan Seemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.


=cut
