package EST2ncRNA::UCSCInterface;

use strict;
use warnings;

require Exporter;

our @ISA = qw(Exporter);

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

our @EXPORT_OK = qw(
		    );

our @EXPORT = qw();

our $VERSION = '0.01';


# Constructor.
#
# Arguments (positional, after the class name):
#   $workdir - working directory; this object keeps its files in
#              "$workdir/est2ncrna.estcoverage"
#   $server  - server-interface object, stored as-is (used by scan_maf)
#   $uid     - pipeline UID, stored as-is (used in liftOver file names)
#
# The sub-directory is created if it does not exist yet.  A failure to create
# it is reported with a warning instead of being silently ignored; the object
# is still returned so that existing callers keep working.
#
# Returns the new blessed object.
sub new {
    my ($class, $workdir, $server, $uid) = @_;

    my $self = bless {
        _subdir => $workdir . "/est2ncrna.estcoverage",
        _server => $server,
        _uid    => $uid,
    }, $class;

    # create subdir if it does not already exist
    my $subdir = $self->{_subdir};
    if ( !-d $subdir && !mkdir($subdir) ) {
        # remember the mkdir error before the -d test below overwrites $!
        my $error = "$!";

        # another process may have created the directory in the meantime;
        # only complain if it is really missing
        warn "Can not create directory '$subdir': $error\n" unless -d $subdir;
    }

    return $self;
}


# Read-only accessor: returns the directory path held in the object's
# "_subdir" slot (set by the constructor).
sub subdir {
    my $self = shift;
    return $self->{_subdir};
}


# Read-only accessor: returns the server-interface object held in the
# object's "_server" slot (set by the constructor).
sub server {
    my $self = shift;
    return $self->{_server};
}


# Read-only accessor: returns the pipeline UID held in the object's
# "_uid" slot (set by the constructor).
sub uid {
    my $self = shift;
    return $self->{_uid};
}


# Search pairwise alignments with the UCSC tool liftOver in an UCSC-generated
# over.chain file.
#
# Input:
#   $query     - query organism name; appears in the hash keys as "<query>_pa"
#   $OLDFILE   - query subsequence coordinates in BED format; column 4 is
#                split on "|" into EST id, EST start and a third id
#                ($blastnid), column 6 is the strand
#   $chainfile - UCSC-generated over.chain file (read through zcat); its name
#                must contain "To<Name>." because <Name> is used to build the
#                output file name
#   $minMatch  - minimum ratio of bases that must remap
#
# Side effects:
#   runs "tools/liftOver" (path relative to the current directory) and writes
#   "<subdir>/<Name>.loOut.<uid>.bed" and "<subdir>/loUnMapped.<uid>.txt"
#
# Returns a reference to a hash with pairs of
#   "est_id:est_start:blastnid:subject_name:<query>_pa"
#   vs "subject_start:subject_end:"
# with 1-based coordinates; start and end are swapped for strand "-".
#
# Dies if the chain file name does not contain "To<Name>." or if the liftOver
# output file can not be opened.  A non-zero exit of the liftOver command is
# reported with a warning.
sub run_liftOver {
    my ($self, $query, $OLDFILE, $chainfile, $minMatch) = @_;

    my $SUBDIR = $self->subdir;
    my $UID = $self->uid;

    # do not rely on $1 of an unchecked match: it may be stale or undefined
    my ($target) = $chainfile =~ /To(\w+)\./
        or die("Can not derive target name from chain file '$chainfile'!\n");

    my $NEWFILE = "$SUBDIR/$target.loOut.$UID.bed";
    my $UNMAPPED = "$SUBDIR/loUnMapped.$UID.txt";

    # start liftOver
    `zcat $chainfile | tools/liftOver -minMatch=$minMatch $OLDFILE stdin $NEWFILE $UNMAPPED`;
    warn "liftOver command failed (wait status $?)\n" if $? != 0;

    # read output file of liftOver in a hash
    # ("or" instead of "||": with "||" the die was attached to the file name
    # and could never fire)
    open my $in, '<', $NEWFILE
        or die("Can not open the file '$NEWFILE': $!\n");

    my %cand;
    while ( my $line = <$in> ) {
        my @field = split " ", $line;

        # skip blank or truncated lines (no name column to parse)
        next if @field < 4;

        my ($sid, $start, $end) = @field[0 .. 2];
        my ($qid, $qstart, $blastnid) = split /\|/, $field[3];
        my $strand = $field[5];

        # index conversion from over.chain-convention (UCSC) to blastn-convention (equal to estcoverage-table convention)
        my $sstart = $start + 1;
        my $sstop = $end;

        # swap coordinates if strand "-"
        if ( defined $strand && $strand eq "-" ) {
            ($sstart, $sstop) = ($sstop, $sstart);
        }

        # fill hash with source organism vs subject organism pairs
        $cand{"$qid:$qstart:$blastnid:$sid:$query"."_pa"} = "$sstart:$sstop:";
    }
    close $in;

    return \%cand;
}


# Convert a BED file into the input file for the MAF scan.
#
# Input:
#   $queryFile - path of a BED file; column 4 is split on "|" into EST id and
#                EST start, column 6 is the strand
#   $queryorg  - organism that should be searched for in the maf file
#
# Writes "<subdir>/<queryorg>.estcand.txt".  For every chromosome (BED
# column 1, in sorted order) the file holds a "#<chromosome>" header line
# followed by one line per BED record,
#   "<est_id> <est_start> gnl|<queryorg>|<chromosome> <start> <end>",
# sorted numerically by <start>.  For strand "-" start and end are swapped
# before sorting.
#
# Returns the name of the written file (without the directory part).
# Dies if the BED file can not be read or the output file can not be written.
sub get_mafscan_input {
    my ($self, $queryFile, $queryorg) = @_;

    my $SUBDIR = $self->subdir;
    my $OLDFILE = $queryorg.".estcand.txt";

    # key = chromosome of $queryorg,
    # value = reference to an array of [start, end, EST-id, EST-start]
    my %map;

    # ("or" instead of "||": with "||" the die was attached to the file name
    # and could never fire)
    open my $in, '<', $queryFile
        or die("Can not open the file '$queryFile': $!\n");
    while ( my $row = <$in> ) {
        my @line = split " ", $row;

        # skip blank or truncated lines (no name column to parse)
        next if @line < 4;

        # EST data
        my ($estid, $eststart) = split /\|/, $line[3];

        # swap coordinates if strand "-"
        if ( defined $line[5] && $line[5] eq "-" ) {
            @line[1, 2] = @line[2, 1];
        }

        # add to hash
        push @{ $map{ $line[0] } }, [ $line[1], $line[2], $estid, $eststart ];
    }
    close $in;

    open my $out, '>', "$SUBDIR/$OLDFILE"
        or die("Can not open the file '$SUBDIR/$OLDFILE': $!\n");

    # add the alignment in the mafscan_input file; chromosomes are written in
    # sorted order so that the output does not depend on hash ordering
    foreach my $chrom ( sort keys %map ) {
        my $org = "gnl|$queryorg|$chrom";
        print {$out} "#$chrom\n";

        # items of one chromosome are ordered by their start position
        foreach my $item ( sort { $a->[0] <=> $b->[0] } @{ $map{$chrom} } ) {
            my ($start, $end, $estid, $eststart) = @$item;
            print {$out} "$estid $eststart $org $start $end\n";
        }
    }

    # buffered write errors only show up at close time
    close $out
        or die("Can not write the file '$SUBDIR/$OLDFILE': $!\n");

    return $OLDFILE;
}


# Run scan_maf.pl on the server for every MAF file and collect the results.
#
# Input:
#   $queryfile         - name of the scan input file inside <subdir>
#   $maffiles_hash_ref - reference to a hash whose values are MAF file paths;
#                        the keys are passed to scan_maf.pl and used in the
#                        result file names "mafout.<key>.txt.gz"
#   $queryorg          - organism name, passed to scan_maf.pl and used in the
#                        keys of the returned hash
#   $minMatch          - passed to scan_maf.pl
#   $OUTPUTFILE        - name (inside <subdir>) of the concatenated result file
#
# The server object ($self->server) is defined elsewhere; this method only
# relies on its name/home/nodes/queue/pbsubmit/locally accessors and on its
# scpcall, sshcall and wait methods.
#
# Returns a reference to a hash with pairs of estcoverage primary key vs
# remaining estcoverage values
#   "est_id:est_start:subject_name:<queryorg>_ma_<n>"
#   vs "subject_start:subject_end:subject_seq"
# where <n> numbers the different alignments one EST is involved in.
#
# Dies if the job script can not be written or the result file can not be read.
sub scan_maf {
    my ($self, $queryfile, $maffiles_hash_ref, $queryorg, $minMatch, $OUTPUTFILE) = @_;

    my $SCANMAF = "scan_maf.pl";

    my $server = $self->server;
    my $SERVER = $server->name;
    my $SERVER_HOME = $server->home;
    my $SUBDIR = $self->subdir;
    my $NODES = $server->nodes;
    my $QUEUE = $server->queue;
    my $PBSUBMIT = $server->pbsubmit;
    my $LOCALLY = $server->locally;

    # transfer necessary files to running machine
    foreach my $maf_path ( values %$maffiles_hash_ref ) {
        $server->scpcall($maf_path, "$SERVER:$SERVER_HOME/$SUBDIR");
    }
    $server->scpcall("$SCANMAF", "$SERVER:$SERVER_HOME/$SUBDIR");
    $server->scpcall("$SUBDIR/$queryfile", "$SERVER:$SERVER_HOME/$SUBDIR");
    $server->sshcall("chmod u+x $SERVER_HOME/$SUBDIR/$SCANMAF");

    # create a shell script 'pbs_scanmaf.sh' for each maf file (chromosome) to start the scanning process as pbs jobs
    my @mafout;
    foreach my $maf_key ( keys %$maffiles_hash_ref ) {
        my $maf_path = $$maffiles_hash_ref{$maf_key};

        # file name without directory part; a path without "/" is its own name
        my ($maffile) = $maf_path =~ /.*\/(\S+)$/;
        $maffile = $maf_path unless defined $maffile;

        push @mafout, "mafout.$maf_key.txt.gz";

        # the script is rewritten and re-sent for every maf file
        # ("or" instead of "||": with "||" the die was attached to the file
        # name and could never fire)
        my $script = "$SUBDIR/pbs_scanmaf.sh";
        open my $out, '>', $script
            or die("Can not open the file '$script': $!\n");
        if ( $LOCALLY ) {
            print {$out} "cd $SERVER_HOME/$SUBDIR\n nohup ./$SCANMAF $maffile $queryorg $minMatch $queryfile $maf_key\n";
        }
        else {
            print {$out} "#!/bin/tcsh\n set path = ( $PBSUBMIT \$path )\n";
            print {$out} "cd $SERVER_HOME/$SUBDIR\n pbsubmit.pl -o \"-N scanmaf $NODES -l walltime=100:0:0\" $QUEUE -D -Q -B $SERVER_HOME -c \"$SERVER_HOME/$SUBDIR/$SCANMAF $SERVER_HOME/$SUBDIR/$maffile $queryorg $minMatch $SERVER_HOME/$SUBDIR/$queryfile $maf_key\"\n";
        }
        close $out
            or die("Can not write the file '$script': $!\n");

        # start the pbs jobs on the server
        print "Start scanning of $maffile on $SERVER.\n";
        $server->scpcall($script, "$SERVER:$SERVER_HOME/$SUBDIR");
        $server->sshcall("chmod u+x $SERVER_HOME/$SUBDIR/pbs_scanmaf.sh; $SERVER_HOME/$SUBDIR/pbs_scanmaf.sh");
    }

    # presumably blocks until all "scanmaf" jobs are finished - the wait
    # method is defined by the server class, confirm there
    $server->wait("scanmaf");

    # fetch results (scan_maf output file) from the server
    foreach my $outfile ( @mafout ) {
        $server->scpcall("$SERVER:$SERVER_HOME/$SUBDIR/$outfile", $SUBDIR);
    }

    # concatenate all scan_maf output files and remove the parts
    my @local_out = map { "$SUBDIR/$_" } @mafout;
    system("cat @local_out > $SUBDIR/$OUTPUTFILE") == 0
        or warn "Concatenating the scan_maf output files failed (wait status $?)\n";
    unlink @local_out;

    # read output file of scan_maf.pl in a hash
    my $result = "$SUBDIR/$OUTPUTFILE";
    open my $in, '-|', 'zcat', $result
        or die("Can not open the file '$result': $!\n");

    my (%cand, %index);
    while ( my $line = <$in> ) {
        # NOTE(review): the line is not chomped, so a line with exactly eight
        # fields keeps its newline in $e[7] and thus in the hash value; left
        # unchanged because callers may rely on it - confirm
        my @e = split ":", $line;

        # count in how many different multiple alignments one candidate EST is involved (count only one time for each alignment)
        # $index{$aln_key} holds the number of a special alignment (key is est_id,est_start,reference_chromosome,align_nr_in_chrom)
        # $index{$est_key} counts the alignments of an EST in general
        my $est_key = "$e[0]:$e[1]";
        my $aln_key = "$e[0]:$e[1]:$e[2]:$e[3]";
        unless ( defined $index{$aln_key} ) {
            $index{$aln_key} = ++$index{$est_key};
        }
        my $index = $index{$aln_key};

        # fill hash with pairs of estcoverage primary key vs remaining estcoverage values
        # (est_id:est_start:subject_name:align_type vs subject_start:subject_end:subject_seq)
        $cand{"$e[0]:$e[1]:$e[4]:".$queryorg."_ma_".$index} = "$e[5]:$e[6]:$e[7]";
    }

    # closing a pipe fails when the command exited with an error
    close $in
        or warn "zcat '$result' failed (wait status $?)\n";

    return \%cand;
}


# Convert an output file of scan_maf.pl into a BED file.
#
# Input:
#   $scanmaffile - name of the scan_maf.pl output file inside <subdir>
#                  (read through zcat); lines are ":"-separated
#   $bedfile     - name of the BED file to write inside <subdir>
#
# Fields used from each input line: 0 and 1 (written as name "f0|f1"),
# 4 (split on "|", third part is the chromosome), 5 and 6 (start and end,
# start index = 1).  If field 5 is greater than field 6 the record gets
# strand "-" and the two are swapped, otherwise strand "+".  The start is
# written with start index = 0 and the score column is always 1.
# Lines with fewer than seven fields are skipped.
#
# Returns the status of closing the zcat pipe (true on success), which is
# what the implicit return value has always been.
# Dies if the input can not be opened or the BED file can not be written.
sub mafscan2bed {
    my ($self, $scanmaffile, $bedfile) = @_;
    my $SUBDIR = $self->subdir;

    my $source = "$SUBDIR/$scanmaffile";
    my $target = "$SUBDIR/$bedfile";

    # read output file of scan_maf.pl (start index = 1) and write in a BED-file (start index = 0)
    # ("or" instead of "||": with "||" the die was attached to the file name
    # and could never fire)
    open my $in, '-|', 'zcat', $source
        or die("Can not open the file '$source': $!\n");
    open my $out, '>', $target
        or die("Can not open the file '$target': $!\n");

    while ( my $line = <$in> ) {
        # without chomp a line with exactly seven fields would carry its
        # newline into the middle of the BED record
        chomp $line;
        my @e = split ":", $line;
        next if @e < 7;

        my $chr = ( split /\|/, $e[4] )[2];
        my ($start, $end) = @e[5, 6];

        # start > end encodes the minus strand
        my $strand = '+';
        if ( $start > $end ) {
            $strand = '-';
            ($start, $end) = ($end, $start);
        }
        $start = $start - 1;

        print {$out} "$chr\t$start\t$end\t$e[0]|$e[1]\t1\t$strand\n";
    }

    # buffered write errors only show up at close time
    close $out
        or die("Can not write the file '$target': $!\n");

    # closing a pipe fails when the command exited with an error
    my $closed = close $in;
    warn "zcat '$source' failed (wait status $?)\n" unless $closed;

    return $closed;
}


1;
__END__
# Below is stub documentation for your module. You'd better edit it!

=head1 NAME

EST2ncRNA::UCSCInterface - run UCSC liftOver and MAF scans for the EST2ncRNA pipeline

=head1 SYNOPSIS

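  use EST2ncRNA::UCSCInterface;

  my $ucsc = EST2ncRNA::UCSCInterface->new($workdir, $server, $uid);

  # pairwise alignments via liftOver
  my $pairs = $ucsc->run_liftOver($query, $bedfile, $chainfile, $minMatch);

  # multiple alignments via scan_maf.pl
  my $input = $ucsc->get_mafscan_input($bedfile, $queryorg);
  my $multi = $ucsc->scan_maf($input, \%maffiles, $queryorg, $minMatch, $outfile);
  $ucsc->mafscan2bed($outfile, $newbedfile);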

=head1 DESCRIPTION

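EST2ncRNA::UCSCInterface wraps the UCSC-related steps of the EST2ncRNA
pipeline. It keeps its files in the sub-directory F<est2ncrna.estcoverage>
of the working directory and offers methods to remap BED coordinates with
the UCSC tool liftOver (C<run_liftOver>), to prepare the input for and run
F<scan_maf.pl> on a server (C<get_mafscan_input>, C<scan_maf>), and to
convert the scan_maf output to BED format (C<mafscan2bed>).

The methods C<run_liftOver> and C<scan_maf> return a reference to a hash
that maps estcoverage primary keys to the remaining estcoverage values.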

=head2 EXPORT

None by default.



=head1 SEE ALSO

Mention other useful documentation such as the documentation of
related modules or operating system documentation (such as man pages
in UNIX), or any relevant external documentation such as RFCs or
standards.

If you have a mailing list set up for your module, mention it here.

If you have a web site set up for your module, mention it here.

=head1 AUTHOR

Stefan Seemann, E<lt>seemann@bioinf.uni-leipzig.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Stefan Seemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.


=cut
