#!/usr/bin/perl -w

sub get_estcandidates_from_file {
    $file = shift;

    my (%chrom);

    # read input
    $status=0;
    open IN, $file || die("Can not open the file!\n");
    foreach $line ( <IN> ) {
	
	if( $line =~ /^\#/ ) {
    
	    if( $status ) {
		# sort array to start position
		my @sortqueries = sort { $$a[3] <=> $$b[3] } @$queries_ary_ref;
		my $queries_ary_reff = \@sortqueries;
		
		$chrom{$qchr} = $queries_ary_reff;
		undef $queries_ary_ref;
	    }

	    ( $qchr ) = $line =~ /^\#(\w+)/;
	    $status=1;
	}
	else {
	    my @l = $line =~ /(\d+) (\d+) \S+ (\d+) (\d+)/;

	    # test type of query strand
	    if( $l[2]<=$l[3] ) {
		push @l, "+";
	    }
	    else {
		push @l, "-";
		$tmp = $l[2];
		$l[2] = $l[3];
		$l[3] = $tmp;
	    }

	    push @$queries_ary_ref, \@l;
	}
    }
    if( $status ) {
	# sort array to start position
	my @sortqueries = sort { $$a[2] <=> $$b[2] } @$queries_ary_ref;
	my $queries_ary_reff = \@sortqueries;

	$chrom{$qchr} = $queries_ary_reff;
	undef $queries_ary_ref;
    }
    
    close IN;

    return \%chrom;
}

sub find_hit_query_indices {
    my $query_ary_ref = shift;
    my $qs = shift;
    my $qe = shift;

    my $qstart = $$query_ary_ref[2];
    my $qstop = $$query_ary_ref[3];
    my $crate = 0;

    # calculate coverage rate
    return 0 if $qstop<=$qs;
    return 2 if $qstart>=$qe;

    if( $qstart>=$qs ) {
	if( $qstop<=$qe ) {
	    $crate = 1;
	}
	else {
	    $crate = ($qe-$qstart)/($qstop-$qstart);
	}
    }
    elsif( $qstop<=$qe ) {
	$crate = ($qstop-$qs)/($qstop-$qstart);
    }
    elsif( $qstart<$qs && $qstop>$qe ) {
	$crate = ($qe-$qs)/($qstop-$qstart);
    }
    
    # add coverage rate to the query array
    push @$query_ary_ref, $crate;

    return $crate;
}


sub start_maf_scanning {
    my $maffile = shift;
    my $queryorg = shift;
    my $CR = shift;
    my $input = shift;
    my $chrom = shift;

    my ($qe, $qs, %chrom, $aligni, $query_ary_ref, $refchrom, $hit);
    my ($chr, $start, $size, $strand, $srcSize, $nr);
    my %hitcollection = ();;
    my @hits = ();
    my @align = ();
    my %hitnr = ();

    # read input
    $refchrom = get_estcandidates_from_file($input);

    # loop maf file
    open IN, "zcat $maffile |" || die("Can not open the file!\n");
    while ( <IN> ) {
	$line = $_;
	chomp $line;

	# a sequence within an alignment block
	if( $line=~/^s/ ) {
	    push @align, $line;
	}

	# prepare last multiple alignment
	next if $#align<0;
	if( $line=~/^(a|\#\# eof maf)/ ) {

	    foreach $aligni ( @align ) {
		
		$aligni =~ /^s\s+(\S+)/;
		next unless index($1,$queryorg)>=0;

		( $chr, $start, $size, $strand, $srcSize) = $aligni =~ /^s\s+\S+\.(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+(\d+)/;

		foreach $query_ary_ref ( @{$refchrom->{$chr}} ) {

		    # get borders of query strand in alignment in blastn format
		    if( $strand eq "-" ) {
			$qe = $srcSize - $start;
			$qs = $srcSize - ( $start + $size - 1 );
		    }
		    else {
			$qs = $start + 1;
			$qe = $start + $size;
		    }
 
		    # does map the query subsequence?
		    $hit = find_hit_query_indices($query_ary_ref, $qs, $qe);
		    
		    # leave the alignment because the last ordered query start position was bigger as the alignment end position 
		    last if $hit==2;

		    # add the query to the hit list
		    push @hits, $query_ary_ref if $hit>$CR;
		}

	        # query organism was found in multiple alignment!
	        last;
	    } 

	    if( $#hits<0 ) {
		@align = ();
		next;
	    }

	    # actualize the hitnumber of candncrnas
	    foreach $hiti ( @hits ) {
		if( defined $hitnr{"$$hiti[0]:$$hiti[1]"} ) {
		    $hitnr{"$$hiti[0]:$$hiti[1]"} += 1;
		}
		else {
		    $hitnr{"$$hiti[0]:$$hiti[1]"} = 1;
		}
	    }

	    # save successful multiple alignments
	    foreach $aligni ( @align ) {

		( $org, $chr, $start, $size, $strand, $srcSize, $text) = $aligni =~ /^s\s+(\S+)\.(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+(\d+)\s+(\S+)/;
		next if $org eq $queryorg;
		# get borders of subject strand in alignment in blastn format
		if( $strand eq "-" ) {
		    $qs = $srcSize - $start + 1;
		    $qe = $srcSize - ( $start + $size - 2 );
		}
		else {
		    $qs = $start + 2;
		    $qe = $start + $size + 1;
		}
		foreach $hiti ( @hits ) {
		    $index = $hitnr{"$$hiti[0]:$$hiti[1]"};
		    $hitcollection{"$$hiti[0]:$$hiti[1]:$chrom:$index:gnl|$org|$chr"} = "$qs:$qe:$text:$$hiti[5]" if $$hiti[5]>$CR;
		}
	    }

	    # clear hits array
	    @align = ();
	    @hits = ();
	}
    }

    close IN;

    $nr = keys %hitcollection;
    return \%hitcollection if $nr>0;
    return 0;
}


# get parameters
if( @ARGV!=5) {
    exit;
}
$maf = shift @ARGV;
$queryorg = shift @ARGV;
$CR = shift @ARGV;
$input = shift @ARGV;
$chrom = shift @ARGV;

# search multiple alignments of the estcoverage-table $query subsequences to $subjects in the UCSC maf-file
# get back a reference to a hash with the pairs successfully mapped source EST : reference to an array of mapped subjects subsequence
$outfile = "mafout.".$chrom.".txt.gz";
$refalign = start_maf_scanning($maf, $queryorg, $CR, $input, $chrom);

open OUT, "| gzip > $outfile" || die("Can not open the file!\n");
if( $refalign!=0 ) {
    foreach $refhit ( keys %$refalign ) {
	print OUT "$refhit:$$refalign{$refhit}\n";
    }
}
close OUT;


__END__

=head1 NAME

C<scan_maf.pl> - Search multiple alignments of ESTs in a UCSC maf-file using a reference organism

=head1 SYNOPSIS

    ./scan_maf.pl $maffile $reference_organism $minMatch $queryfile $chromosome

=head1 INPUT

=over 6

=item B<$maffile>

UCSC multiple alignment (maf) file of the $reference_organism of one $chromosome

=item B<$reference_organism>

the name of the reference organism which was already aligned against the ESTs and for which a maf-file exists

=item B<$minMatch>

the minimal coverage rate of a returned hit

=item B<$queryfile>

a flat file of the alignments of ESTs to the reference organism including in each line seperated by space 
    EST-id
    EST-startindex (first index = 1)
    reference organism name (gnl|name|chromosome)
    reference organism startindex (first index = 1) 
    reference organism stopindex (first index = 1)

=item B<$chromosome>

the name of the chromosome of the $reference_organism represented by the maf-file 

=back

=head1 AUTHOR

Stefan Seemann, E<lt>seemann@bioinf.uni-leipzig.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Stefan Seemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.

=cut
