package EST2ncRNA::ClustalwInterface;

use strict;
use warnings;

use EST2ncRNA::SequenceInterface qw(get_subject_subsequences_of_one_chromosome
				    get_subsequence);

require Exporter;

our @ISA = qw(Exporter);

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

our @EXPORT_OK = (
		  );

our @EXPORT = qw();

our $VERSION = '0.01';


# constructor
sub new {
    my $class = shift @_;
    my ($workdir,$server, $uid) = @_;

    my $self = {
	_subdir       => $workdir."/est2ncrna.rnaz",
	_fastadir     => "fasta",
	_pbsdir       => "pbsscript",
	_uid          => $uid,
	_server       => $server
    };
    bless $self, $class;
    
    # create subdirs if not already exists
    mkdir $self->{_subdir} unless -d $self->{_subdir};
    mkdir $self->{_subdir}."/".$self->{_fastadir} unless -d $self->{_subdir}."/".$self->{_fastadir};
    mkdir $self->{_subdir}."/".$self->{_pbsdir} unless -d $self->{_subdir}."/".$self->{_pbsdir};
    
    return $self;
}


# accessor method for ClustalwInterface subdir
sub subdir {
    my($self) = @_;
    return $self->{_subdir};
}


# accessor method for ClustalwInterface subdir fasta
sub fastadir {
    my($self) = @_;
    return $self->{_fastadir};
}


# accessor method for ClustalwInterface subdir pbsscript
sub pbsdir {
    my($self) = @_;
    return $self->{_pbsdir};
}


=head2 B<uid>

Accessor method for BlastInterface uid

=cut

sub uid {
    my($self) = @_;
    return $self->{_uid};
}


# accessor method for ClustalwInterface server-interface
sub server {
    my($self) = @_;
    return $self->{_server};
}


# run clustalw for all input fasta-files on a server in two parallel processes and save the aln-files in the subdirectory 'aln'
#
# input is a reference to an array including the name of all fasta-files (without file-ending)
sub calculate_multiple_alignments {
    my ($self) = @_;	
    my (%scriptNames, $fastafile);
    my (@qstat, $index);
    my ($i, @nr, $nr);
    #my $pid;
    
    my $SERVER = $self->server->name;
    my $SERVER_HOME = $self->server->home;
    my $QUEUE = $self->server->queue;
    my $USERID = $self->server->userid;
    my $SUBDIR = $self->subdir;
    my $FASTADIR = $self->fastadir;
    my $PBSDIR = $self->pbsdir;
    my $PBSUBMIT = $self->server->pbsubmit;
    my $UID = $self->uid;

    # create a tar-archive of all fasta files
    system(qq{cd $SUBDIR; tar -vzcf clustalw.input.fa.tar.gz $FASTADIR > /dev/null});
    $self->server->scpcall("$SUBDIR/clustalw.input.fa.tar.gz", "$SERVER:$SERVER_HOME");
    $self->server->sshcall("cd $SERVER_HOME; tar xzvf clustalw.input.fa.tar.gz > /dev/null");
    # wait until tar process of user 'whoami' is finished
    $self->waitTar($SERVER, $USERID);

    # fetch all fasta files
    my @fasta = map{my $tmp = (split /\//, $_)[-1]; $tmp =~ s/\.fa//; $tmp} `ls $SUBDIR/$FASTADIR`;
    chomp @fasta;

    # This section of the code was updated by Jakob
    # Now each job in the que runs several clustalw jobs
    my $masterScript = $self->buildScripts(\@fasta, \%scriptNames, $SUBDIR, $PBSDIR, $PBSUBMIT, $SERVER_HOME, $QUEUE, $UID);

    # transfer a tar-archive of all pbs scripts to the running machine
    $self->transferScripts($SUBDIR, $PBSDIR, $SERVER, $SERVER_HOME, $USERID);

    # delete all existed files in the temporary aln-folder
    `rm -f $SUBDIR/aln/* > /dev/null`;

    # start the clustalw pbs jobs on the server
    $nr = @fasta;
    print "Start calculating of $nr multiple alignments with clustalw on $SERVER.\n";

    while(1) {
	
        $nr = keys %scriptNames;
        print "$nr multiple alignment jobs left.\n";
	
	$self->server->sshcall("$SERVER_HOME/$PBSDIR/$masterScript");
#        $index = 0;
#        foreach my $scriptName ( keys %scriptNames ) {
#	    
#	    $self->server->sshcall("$SERVER_HOME/$PBSDIR/$scriptName");
#	    
#            # run maximal 200 clustalw processes parallel
#	    while(1) {
#            	@qstat = $self->server->sshcall("qstat -u$USERID");
#                last if @qstat==0;
#		$index = 0;
#		map { $index++ if /clustalw/ } @qstat;
#			last if $index<=2000;
#		sleep 60;
#	    }
#        }

        # wait until all clustalw processes of user 'whoami' are finished
        $self->server->wait("clustalw");

        # fetch results (clustalw aln-files) as tar archive from the server
        $self->server->sshcall("cd $SERVER_HOME; tar --remove-files -vzcf clustalw.aln.tar.gz aln > /dev/null");
        $self->waitTar($SERVER, $USERID);
        $self->server->scpcall("$SERVER:$SERVER_HOME/clustalw.aln.tar.gz", $SUBDIR);
        system(qq{cd $SUBDIR; tar xzvf clustalw.aln.tar.gz > /dev/null});

        # This section of the code was updated by Jakob
        # test which multiple alignments aren't calculated and leave if all alignments are finished
	my @missingFasta;
	$self->findMissingAlignments(\@fasta, \@missingFasta, $SUBDIR);
	if (scalar @missingFasta == 0) {last;}

	# Some alignments are missing, build new scripts and transfer them
    	$self->buildScripts(\@missingFasta, \%scriptNames, $SUBDIR, $PBSDIR, $PBSUBMIT, $SERVER_HOME, $QUEUE, $UID);
    	$self->transferScripts($SUBDIR, $PBSDIR, $SERVER, $SERVER_HOME, $USERID);
    }
}

sub findMissingAlignments {

    # This section of the code was updated by Jakob
    my ($self, $fasta, $missingFasta, $SUBDIR) = @_;
    
#    @$missingFasta = @$fasta;
    
    foreach( @$missingFasta ) {
	my @key = `ls $SUBDIR/aln/$_.aln 2> /dev/null`;
	if(not defined $key[0] ) {
	    chomp( $key[0] );
	    my ($key) = $key[0] =~ /([^\/]+)\.aln$/;
	    push @{$missingFasta}, $key;
#	    delete $$missingFasta{$key};
	}
    }
   
}

sub buildScripts {

    # This section of the code was updated by Jakob
    my ($self, $fasta, $scriptNames, $SUBDIR, $PBSDIR, $PBSUBMIT, $SERVER_HOME, $QUEUE, $UID) = @_;

    # This section of the code was updated by Jakob
    # Now each job in the que runs several clustalw jobs
    undef(%$scriptNames);

    my $maxJobsPrScript = 1000;
    my $scriptCount = -1;
    my $jobCount = $maxJobsPrScript;
    # create pbs script for each clustalw process
    my $filehandle;
    foreach my $fastafile ( @$fasta ) {
	if ($jobCount >= $maxJobsPrScript) {
		$scriptCount++;
		if ($scriptCount > 0) {
			close $filehandle;
		}
		my $scriptName = "pbs_clustalw.$scriptCount.sh";
	        open $filehandle, ">$SUBDIR/$PBSDIR/$scriptName" || die("Can not open the file!\n");
	        `chmod a+x $SUBDIR/$PBSDIR/$scriptName`;
	        $$scriptNames{$scriptName} = 1;
	        $jobCount = 0;
        	print $filehandle "#!/bin/tcsh\n";
		print $filehandle "cd $SERVER_HOME\n";
		print $filehandle "mkdir -p $SERVER_HOME/aln;\n";
		print $filehandle "mkdir -p /scratch/est2ncrna$UID;\n";
		print $filehandle "cd /scratch/est2ncrna$UID/;\n";
	}
	print $filehandle "cp $SERVER_HOME/fasta/$fastafile.fa /scratch/est2ncrna$UID;\n";
	print $filehandle "clustalw $fastafile.fa > /dev/null;\n";
	print $filehandle "mv /scratch/est2ncrna$UID/$fastafile.aln $SERVER_HOME/aln/;\n";
    }
    close $filehandle;
    
    my $masterScript = "clustalw-masterScript";
    my $maxJobs = 400;
    open MASTER, ">", "$SUBDIR/$PBSDIR/$masterScript" or die "Could not write the master script: $!\n";
    print MASTER "#!/usr/bin/perl -w\n";
    print MASTER "use strict;\n";
    print MASTER "my \@jobs = (\n";
    foreach my $job (sort keys %$scriptNames) {
    	print MASTER "\t\"$SERVER_HOME/$PBSDIR/$job\",\n";
    }
    print MASTER ");\n";
    print MASTER "my \$user = `whoami`;\n";
    print MASTER "chomp \$user;\n";
    print MASTER "`mkdir -p $SERVER_HOME/que`;\n";
    print MASTER "my \$jobCount = $maxJobs;\n";
    print MASTER "foreach my \$job (\@jobs) {\n";
    print MASTER "\tmy (\$name) = \$job =~ /([^\\/]+)\$/;\n";
    print MASTER "\t`qsub -e $SERVER_HOME/que/\$name.err -o $SERVER_HOME/que/\$name.out ";
    print MASTER "-N \"clustalw\" -q $QUEUE \$job`;\n";
    print MASTER "\twhile (\$jobCount >= $maxJobs) {\n";
    print MASTER "\t\tmy \@queSize = `qstat -u \$user | egrep ' Q | H ' | wc -l`;\n";
    print MASTER "\t\tmy \$queSize = \$queSize[0];\n";
    print MASTER "print \"\$queSize\\n\";\n";
    print MASTER "\t\tif (\$queSize eq \"\") {\n";
    print MASTER "\t\t\t\$jobCount = int(0.5*$maxJobs)+1;\n";
    print MASTER "\t\t}\n";
    print MASTER "\t\telsif (\$queSize >= $maxJobs) {sleep 60; next;}\n";
    print MASTER "\t\telse {\$jobCount = int(0.5*($maxJobs - \$queSize));}\n";
    print MASTER "\t}\n";
    print MASTER "\t\$jobCount++;\n";
    print MASTER "}\n";
    close MASTER;
    
    return $masterScript;
}


sub transferScripts {

    # This section of the code was updated by Jakob
    my ($self, $SUBDIR, $PBSDIR, $SERVER, $SERVER_HOME, $USERID) = @_;
    
    # transfer a tar-archive of all pbs scripts to the running machine
    system(qq{cd $SUBDIR; tar --remove-files -vzcf pbs_clustalw.tar.gz $PBSDIR > /dev/null});
    $self->server->scpcall("$SUBDIR/pbs_clustalw.tar.gz", "$SERVER:$SERVER_HOME");
    $self->server->sshcall("cd $SERVER_HOME; tar xzvf pbs_clustalw.tar.gz > /dev/null; chmod -R u+x $SERVER_HOME/$PBSDIR/");
    # wait until tar process of user 'whoami' is finished
    $self->waitTar($SERVER, $USERID);
}

sub waitTar {

    # This section of the code was updated by Jakob
    my ($self, $SERVER, $USERID) = @_;
    
    while(1) {
        my @tar = `ssh $SERVER "ps u -C tar | grep $USERID"`;
        last if @tar==0;
        sleep 20;
    }
}

# create maf-files using the clustalw output (aln-files)
#
# input are a reference to a hash including 
#             as key the aln-file name and 
#             as value a hash about the aligned sequences in the aln-file including 
#                 as key the identifier and
#                 as value an anonymous array with startposition, length, strand and sequence ('0'),
#           the maximal number of alignments in amaf-file
# returns a reference to an array including the name of all created maf-files
sub aln2maf {
    my ($self, $aln_refhash, $MAFLENGTH) = @_;	
    my ($aln, @maffiles, $index, $file, $newf, $src, $seq, @tmp, $nr, $cid);
    
    my $SUBDIR = $self->subdir;

    # remove old est.*.maf.gz files
    `rm -f $SUBDIR/est.*.maf.gz`;

    print "Create MAF format from clustalw output.\n";

    foreach $file ( keys %$aln_refhash ) {
        $newf = 0;
        $aln = $$aln_refhash{$file};
        $index++;
        
        $cid = ( split '\.', $file )[0];
	
	# read alignment files
	open IN, "$SUBDIR/aln/$file.aln" || die("Can not open the file!\n");
        <IN>; # jump header line
        while( <IN> ) {
            next if $_ =~ /^\s+/;
            chomp $_;
            ( $src, $seq ) = split " ", $_;
	     
            @tmp = split '\.', $src;
            $src="gnl|$tmp[0]|$tmp[1]" if @tmp>1;
	    
            if( $#{$$aln{$src}}==4 ) {
                $$aln{$src}->[4] = $$aln{$src}->[4].$seq;
            }
            else {
                push @{$$aln{$src}}, $seq;
            }
        }
        close IN;
	
	# change first base index from 1 to 0 (MAF format convention)
	foreach( keys %$aln) {
	    $$aln{$_}->[0]--;
	}

        # print maf files
        $nr = int( $index/$MAFLENGTH );
        unless( -f "$SUBDIR/est.$nr.maf.gz" ) {
            $newf = 1;
            push @maffiles, "est.$nr.maf";
        }
	open OUT, "| gzip >>$SUBDIR/est.$nr.maf.gz" || die("Can not open the file!\n");
        print OUT "##maf version=1\n" if $newf;
        print OUT "\n";
        print OUT "a score=0\n";
	
        print OUT "s $cid @{$$aln{$cid}}\n";
        foreach( keys %$aln) {
            next if $_ eq $cid;
            print OUT "s $_ @{$$aln{$_}}\n";
	}
	close OUT;
    }
    
    return \@maffiles;
}


sub get_homologous_fasta_from_pafout {
    my ($self, $queryfile, $closedorg, $bed_file_hashref, $outputfolder, $fileext) = @_;
    my ($bed, %conserved, $subject, @bed, $estid, $eststart, $estend, %estsub, $subseq, $file, $line, $chrom, %chr, %query, $c, $subseq_hashref, $tmp);  

    $fileext = (defined $fileext) ? $fileext : "paf.fa";

    foreach $bed ( keys %$bed_file_hashref ) {
        # get subject name
        $subject = ( split '\.', $bed )[0];
	next if $subject ne $closedorg;
	# get conserved ests
	open IN, $bed || die("Can not open the file!\n");
        while( <IN> ) {
                chomp( $_ );
                @bed = split " ";
                ( $estid, $eststart, $estend ) = split '\|', $bed[3];
		push @{$conserved{$estid}}, $eststart;
		$estsub{"$estid:$eststart"} = $estend;
	}
	close IN;
    }

    # create fasta-files including queries and bed-files
    open IN, "zcat $queryfile |" || die("Can not open the file!\n");
    while( <IN> ) {
        chomp( $_ );
        if( /^>/ ) {
                />(.*)/;
                $estid = $1;
                next unless defined $conserved{$estid};
                foreach $eststart ( @{$conserved{$estid}} ) {
                        $file = "$outputfolder/$estid.$eststart.$fileext";
                        open OUT, ">$file" || die("Can not open the file!\n");
                        print OUT "$_\n";
                        close OUT;
                }
        }
        else {
                foreach $eststart ( @{$conserved{$estid}} ) {
			# get est subsequence
			$subseq = get_subsequence($_, $eststart, $estsub{"$estid:$eststart"});        
                        $file = "$outputfolder/$estid.$eststart.$fileext";
                        open OUT, ">>$file" || die("Can not open the file!\n");
                        print OUT "$subseq\n";
                        close OUT;
                }
        }
    }
    close IN;

    foreach $bed ( keys %$bed_file_hashref ) {
        %chr = ();
        %query = ();

        # get subject name
        $subject = ( reverse( split '\/', $bed ) )[0];
        $subject = ( split '\.', $subject )[0];

        # catch the names of chromosome files in a hash
        open IN, $$bed_file_hashref{$bed} || die("Can not open the file!\n");
        foreach $line ( <IN> ) {
                chomp $line;
                ( $chrom ) = $line =~ /(chr\S+).fa.gz$/;
                $chr{$chrom} = $line;
        }
	close IN;

        # read bed-file
        open IN, "$bed" || die("Can not open the file!\n");
        while( <IN> ) {
                @bed = split " ";
                ( $estid, $eststart ) = split '\|', $bed[3];
		# switch start and end position if '-' strand
		if( $bed[5] eq "-" ) {
			$tmp = $bed[1];
			$bed[1] = $bed[2];
			$bed[2] = $tmp;
		}
		# switch from bed-indizes to blast-indizes
                $bed[1] = $bed[1]+1;

                push @{$query{$bed[0]}}, [$estid, $eststart, $bed[1], $bed[2]];
        }
        close IN; 

    	# get subsequences of subject organism for each chromosome and write in a fasta-file
	foreach $c ( keys %query ) {
        	#map{ print "$chr{$c}: @$_\n"} @{$query{$c}};
        	$subseq_hashref = get_subject_subsequences_of_one_chromosome($chr{$c}, \@{$query{$c}});
        	foreach( keys %$subseq_hashref ) {
                	( $estid, $eststart ) = split '\|';
                	open OUT, ">>$outputfolder/$estid.$eststart.$fileext" || die("Can not open the file!\n");
                	print OUT ">gnl|$subject|$c\n";
                	print OUT "$$subseq_hashref{$_}\n";
                	close OUT;
		}
        }
    }

}


sub get_homologous_fasta_from_mafout {
     my ($self, $queryfile, $closedorg, $bed_file_hashref, $mafout_file, $outputfolder, $fileext) = @_;
     my ($bed, %conserved, %tmp, $subject, @bed, $estid, $eststart, $estend, %estsub, $subseq, $file, $line, $chrom, %chr, %query, $c, $subseq_hashref, @mafout, $status, $tmp);

     $fileext = (defined $fileext) ? $fileext : "maf.fa";

     open IN, "zcat $mafout_file |" || die("Can not open the file!\n");
     while( <IN> ) {
    	chomp( $_ );
        @mafout = split ":";
	if( !defined $tmp{"$mafout[0]:$mafout[1]"} ) {
		$tmp{"$mafout[0]:$mafout[1]"} = 1;
        	push @{$conserved{$mafout[0]}}, $mafout[1];
	}
     }
     close IN;

     foreach $bed ( keys %$bed_file_hashref ) {
        # get subject name
        $subject = ( split '\.', $bed )[0];
        next if $subject ne $closedorg;
        # get conserved ests
        open IN, $bed || die("Can not open the file!\n");
        while( <IN> ) {
                chomp( $_ );
                @bed = split " ";
                ( $estid, $eststart, $estend ) = split '\|', $bed[3];
                $estsub{"$estid:$eststart"} = $estend;
        }
        close IN;
    }

     # create fasta-files including queries and closedorg
     open IN, "zcat $queryfile |" || die("Can not open the file!\n");
     while( <IN> ) {
     	chomp( $_ );
     	if( /^>/ ) {
		/>(.*)/;
        	$estid = $1;
	        next unless defined $conserved{$estid};
        	foreach $eststart ( @{$conserved{$estid}} ) {
			$file = "$outputfolder/$estid.$eststart.$fileext";
	                open OUT, ">$file" || die("Can not open the file!\n");
        	        print OUT "$_\n";
                	close OUT;
	      	}
     	}
     	else {
		next unless defined $conserved{$estid};
     		foreach $eststart ( @{$conserved{$estid}} ) {
			# get est subsequence
                        $subseq = get_subsequence($_, $eststart, $estsub{"$estid:$eststart"});
         		$file = "$outputfolder/$estid.$eststart.$fileext";
	                open OUT, ">>$file" || die("Can not open the file!\n");
        	        print OUT "$subseq\n";
                	close OUT;
	        }
    	}
     }
     close IN;

     foreach $bed ( keys %$bed_file_hashref ) {
     	%chr = ();
     	%query = ();

     	# get subject name
     	$subject = ( reverse( split '\/', $bed ) )[0];
     	$subject = ( split '\.', $subject )[0];

     	# catch the names of chromosome files in a hash
     	open IN, $$bed_file_hashref{$bed} || die("Can not open the file!\n");
     	foreach $line ( <IN> ) {
    		chomp $line;
        	( $chrom ) = $line =~ /(chr\S+).fa.gz$/;
        	$chr{$chrom} = $line;
     	}
 	close IN;

     	# read bed-file
     	open IN, "$bed" || die("Can not open the file!\n");
     	while( <IN> ) {
    		@bed = split " ";
        	( $estid, $eststart ) = split '\|', $bed[3];
		# switch start and end position if '-' strand
                if( $bed[5] eq "-" ) {
                        $tmp = $bed[1];
                        $bed[1] = $bed[2];
                        $bed[2] = $tmp;
                }
		# switch from bed-indizes to blast-indizes
                $bed[1] = $bed[1]+1;

        	push @{$query{$bed[0]}}, [$estid, $eststart, $bed[1], $bed[2]];
     	}
     	close IN;

     	# get subsequences of subject organism for each chromosome and write in a fasta-file
     	foreach $c ( keys %query ) {
     		#map{ print "$chr{$c}: @$_\n"} @{$query{$c}};
     		$subseq_hashref = get_subject_subsequences_of_one_chromosome($chr{$c}, \@{$query{$c}});
     		foreach( keys %$subseq_hashref ) {
    			( $estid, $eststart ) = split '\|';
			next unless defined $conserved{$estid};
			$status = 0;
			map{ $status=1 if $_ eq $eststart } @{$conserved{$estid}};
			next unless $status;
        		open OUT, ">>$outputfolder/$estid.$eststart.$fileext" || die("Can not open the file!\n");
	        	print OUT ">gnl|$subject|$c\n";
        		print OUT "$$subseq_hashref{$_}\n";
	        	close OUT;
    		}
     	}
     }

     # extend fasta-files with mafout_file
     open IN, "zcat $mafout_file |" || die("Can not open the file!\n");
     while( <IN> ) {
	@mafout = split ":";
	open OUT, ">>$outputfolder/$mafout[0].$mafout[1].$fileext" || die("Can not open the file!\n");
	print OUT ">$mafout[4]\n";
	print OUT "$mafout[7]\n";
	close OUT;
     }
     close IN;
}


1;

__END__
# Below is stub documentation for your module. You'd better edit it!

=head1 NAME

EST2ncRNA::ClustalwInterface - Perl extension for blah blah blah

=head1 SYNOPSIS

  use EST2ncRNA::ClustalwInterface;
  blah blah blah

=head1 DESCRIPTION

Stub documentation for EST2ncRNA::ClustalwInterface, created by h2xs. It looks like the
author of the extension was negligent enough to leave the stub
unedited.

Blah blah blah.

=head2 EXPORT

None by default.



=head1 SEE ALSO

Mention other useful documentation such as the documentation of
related modules or operating system documentation (such as man pages
in UNIX), or any relevant external documentation such as RFCs or
standards.

If you have a mailing list set up for your module, mention it here.

If you have a web site set up for your module, mention it here.

=head1 AUTHOR

Stefan Seemann, E<lt>seemann@bioinf.uni-leipzig.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Stefan Seemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.


=cut
