#!/usr/bin/env perl

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

use strict;
use warnings;
use Getopt::Long;
use File::Basename;

$SIG{PIPE} = sub { exit };
Getopt::Long::Configure("bundling");

sub help {
    my $prog = basename($0);
    print <<EOT;
Usage: $prog [--all] [FILE]
Convert a NCBI blast report to table form.

Options:
  -a, --all    Also print query and subject sequence.
  -M, --make   Make FILE.table.gz and FILE.table.desc.
  --make-desc  Only make FILE.table.desc.
  -u           Do not compress output table.
EOT
    exit;
}

sub gaps {
    my $s = shift;
    my @g;
    my $pos = -1;
    while (($pos = index($s, '-', $pos)) > -1) {
	push @g, ($pos + 1);
	$pos++;
    }
    return join(",", @g);
}

my $opt_all = 0;
my $opt_make = 0;
my $opt_make_desc = 0;
my $opt_uncompressed = 0;
our $progress = 0;
our $debug = 0;
GetOptions('h|help' => \&help,
	   'M|make' => \$opt_make,
	   'make-desc' => \$opt_make_desc,
	   'u' => \$opt_uncompressed,
	   'P|progress' => \$progress,
	   'debug' => \$debug,
	   'a|all' => \$opt_all) || die;

my @spec = ({ -name => 'QUERY_ID',          -level => 1 },
	    { -name => 'SUBJECT_ID',        -level => 1 },
	    { -name => 'QUERY_LENGTH',      -level => 1 },
	    { -name => 'SUBJECT_LENGTH',    -level => 1 },
	    { -name => 'BIT_SCORE',         -level => 1 },
	    { -name => 'SCORE',             -level => 1 },
	    { -name => 'ALIGN_LENGTH',      -level => 1 },
	    { -name => 'IDENTITY',          -level => 1 },
	    { -name => 'EVALUE',            -level => 1 },
	    { -name => 'QUERY_START',       -level => 1 },
	    { -name => 'QUERY_STOP',        -level => 1 },
	    { -name => 'SUBJECT_START',     -level => 1 },
	    { -name => 'SUBJECT_STOP',      -level => 1 },
	    { -name => 'QUERY_GAPS',        -level => 1 },
	    { -name => 'SUBJECT_GAPS',      -level => 1 },
	    { -name => 'QUERY_ALIGN_SEQ',   -level => 2 },
	    { -name => 'SUBJECT_ALIGN_SEQ', -level => 2 });

my @hdr = map { $_->{-name} } grep { $opt_all ? 1 : $_->{-level} < 2 } @spec;

my @files = @ARGV > 0 ? @ARGV : ('-');

sub parse_num {
    my $s = shift;
    $s =~ s/,//g;
    return $s;
}

sub parse_float {
    my $s = shift;
    if ($s =~ /^e/) {
	return "1$s";
    }
    else {
	return $s;
    }
}

sub rep_print_rec {
    my $out_fh = shift;
    my $rec = shift;
    return unless exists $rec->{'SUBJECT_START'};
    my @rec;
    
    $rec->{'QUERY_GAPS'} = gaps($rec->{'QUERY_ALIGN_SEQ'});
    $rec->{'SUBJECT_GAPS'} = gaps($rec->{'SUBJECT_ALIGN_SEQ'});

    $rec->{'SUBJECT_ID'} = $rec->{'QUERY_ID'} unless
	exists $rec->{'SUBJECT_ID'};

    $rec->{'SUBJECT_LENGTH'} = $rec->{'QUERY_LENGTH'} unless
	exists $rec->{'SUBJECT_LENGTH'};

    foreach my $hdr (@hdr) {
	if (exists $rec->{$hdr}) {
	    push @rec, $rec->{$hdr};
	}
	else {
	    push @rec, '?';
	}
    }
    print $out_fh join("\t", @rec), "\n";
    delete $rec->{'QUERY_START'};
    delete $rec->{'SUBJECT_START'};
    $rec->{'QUERY_ALIGN_SEQ'} = '';
    $rec->{'SUBJECT_ALIGN_SEQ'} = '';
}

sub rep_parse {
    my $in_fh = shift;
    my $out_fh = shift;
    my %rec = ('QUERY_ALIGN_SEQ' => '', 'SUBJECT_ALIGN_SEQ' => '');
    our $debug;
    our $progress;
    my $i = 0;

    while (<$in_fh>) {
	print "DEBUG: Line: $_" if $debug;
	if (/^BLAST[XN] / or /^MEGABLAST /) {
	    if ($progress && ++$i % 10 == 0) {
		print STDERR "#";
		print STDERR " $i\n" if $i % 500 == 0;
	    }
	    rep_print_rec($out_fh, \%rec);
	    %rec = ('QUERY_ALIGN_SEQ' => '', 'SUBJECT_ALIGN_SEQ' => '');
	}
	elsif (/^Query=\s+(\S+)/) {
	    # Query= gi|6137801|ref|NC_000934.1| Loxodonta africana
	    $rec{'QUERY_ID'} = $1;
	    if ("$'" eq "\n") {
		my $peek = <$in_fh>;
		if ($peek =~ /^\s+\(([\d,]+) letters\)/) {
		    $_ = $peek;
		    redo;
		}
		else {
		    chomp $_;
		    $_ .= $peek;
		    redo;
		}
	    }
	}
	elsif (/^\s+\(([\d,]+) letters\)/) {
	    #         (16,866 letters)
	    $rec{'QUERY_LENGTH'} = parse_num($1);
	}
	elsif (/^ Identities = (\d+)\/(\d+)/) {
	    $rec{'IDENTITY'} = $1;
	    $rec{'ALIGN_LENGTH'} = $2;
	}
	elsif (/^>(\S+)/) {
	    rep_print_rec($out_fh, \%rec);
	    $rec{'SUBJECT_ID'} = $1;
	}
	elsif (/^\s+Length\s+=\s+(\d+)/) {
	    $rec{'SUBJECT_LENGTH'} = $1;
	}
	elsif (/^ Score\s+=\s+([\d.]+) bits \((\d+)\), Expect = ([\d.e-]+)/) {
	    # NCBI blastall
	    rep_print_rec($out_fh, \%rec);
	    # Score = 50.1 bits (25), Expect = 6e-05
	    $rec{'BIT_SCORE'} = $1;
	    $rec{'SCORE'} = $2;
	    $rec{'EVALUE'} = parse_float($3);
	}
	elsif (/^ Score\s+=\s+(\d+) \(([\d.]+) bits\), Expect = ([\d.e-]+)/) {
	    # WU blast
	    rep_print_rec($out_fh, \%rec);
	    # Score = 150 (28.6 bits), Expect = 5.2, P = 0.994
	    $rec{'BIT_SCORE'} = $2;
	    $rec{'SCORE'} = $1;
	    $rec{'EVALUE'} = parse_float($3);
	}
	elsif (/^Query:\s+(\d+)\s+(\S+)\s+(\d+)/) {
	    $rec{'QUERY_START'} = $1 unless exists $rec{'QUERY_START'};
	    $rec{'QUERY_ALIGN_SEQ'} .= uc($2);
	    $rec{'QUERY_STOP'} = $3;
	}
	elsif (/^Sbjct:\s+(\d+)\s+(\S+)\s(\d+)/) {
	    $rec{'SUBJECT_START'} = $1 unless exists $rec{'SUBJECT_START'};
	    $rec{'SUBJECT_ALIGN_SEQ'} .= uc($2);
	    $rec{'SUBJECT_STOP'} = $3;
	}
    }
    rep_print_rec($out_fh, \%rec);
    print STDERR " Done\n" if $progress;
}

sub xml_parse {
    ;
}

sub parse {
    my $in_fh = shift;
    my $out_fh = shift;
    my $in_file = shift;
    local $_ = <$in_fh> || die "Cannot read $in_file";
    if (/^BLAST[XN] / or /^MEGABLAST /) {
	rep_parse($in_fh, $out_fh);
    }
    elsif (/^<\?xml/) {
	xml_parse($in_fh, $out_fh);
    }
    else {
	die "Unknown file format: $in_file"
    }    
}

if ($opt_make) {
    foreach my $file (@files) {
	my $in_fh = input_open($file);
	my $out_file;
	if ($opt_uncompressed) {
	    $out_file = strip_suffix($file, '.gz') . '.table';
	}
	else {
	    $out_file = strip_suffix($file, '.gz') . '.table.gz';
	}
	next if -e $out_file;
	my %desc = desc_read($file);
	my $out_fh;
	if ($opt_uncompressed) {
	    $out_fh = new FileHandle "> ${out_file},";
	}
	else {
	    $out_fh = new FileHandle "|kvlgz -o ${out_file},";
	}
	print "Creating $out_file\n";
	if (%desc) {
	    delete $desc{'FILE'};
	    $desc{'TYPE'} = 'blast.table';
	    desc_write($out_file, %desc); 
	}
	print $out_fh join("\t", @hdr), "\n";
	parse($in_fh, $out_fh, $file);
	$out_fh->close;
	rename("${out_file},", ${out_file});
    }
}
elsif ($opt_make_desc) {
    foreach my $file (@files) {
	my %desc = desc_read($file);
	my $out_file = strip_suffix($file, '.gz') . '.table';
	if (%desc) {
	    delete $desc{'FILE'};
	    $desc{'TYPE'} = 'blast.table';
	    print "Creating $out_file description.\n";
	    desc_write($out_file, %desc); 
	}
    }
}
else {
    my $out_fh = \*STDOUT;
    print join("\t", @hdr), "\n";
    foreach my $file (@files) {
	my $in_fh = input_open($file);
	parse($in_fh, $out_fh, $file);
    }
}

############################################################
#!file.ph
#!desc.ph
# file.ph -*- perl -*-

use FileHandle;
use File::Basename;

sub input_open {
    my $file = shift;
    return \*STDIN if (not defined $file or $file eq '-');
    my $fh;
    if ($file =~ /\.gz$/) {
        $fh = new FileHandle "gunzip -c $file|";
    }
    else {
        $fh = new FileHandle $file, '<';
    }
    return $fh if defined $fh;
    $fh = new FileHandle "gunzip -c $file.gz|";
    return $fh if defined $fh;
    die "Cannot open input file '$file'";
}

sub strip_suffix {
    my $name = shift;
    return dirname($name) . '/' . basename($name, @_);
}
# desc.ph -*- perl -*-

sub desc_read {
    my $desc_file = shift;
    my %desc;
    local *IN;
    local $_;
    $desc_file = substr($desc_file, 0, -3)
	if (! -r "$desc_file.desc" && substr($desc_file, -3) eq '.gz');
    my $ext = substr($desc_file, -5);
    $desc_file = "$desc_file.desc" if $ext ne '.desc';
    unless (open(IN, '<', $desc_file)) {
	return ();
    }
    exists $desc{'FILE'} and die "Illegal FILE field in $desc_file";
    $desc{'FILE'} = substr($desc_file, 0, -5);
    while (<IN>) {
	/^\s*#/ and next;
	/^\s*$/ and next;
	/^(\S+)\s*: (.*)/ and $desc{$1} = $2;
    }
    close IN;
    $desc{'REV'} = -1 unless exists $desc{'REV'};
    return %desc;
}

sub desc_read_cache_file {
    my $dir = shift;
    local *IN;
    local $_;
    my $desc_file = "$dir/DESC.cache";
    unless (open(IN, '<', $desc_file)) {
	return ();
    }
    my @desc;
    my %desc;
    while (<IN>) {
	if (/^\s*$/) {
	    push @desc, { %desc };
	    %desc = ();
	}
	else {
	    /^(\S+)\s*: (.*)/ or die "Bad entry in $desc_file: $_";
	    $desc{$1} = $2;
	}
    }
    close IN;
    return @desc;
}

our @desc_cache;

sub desc_read_db {
    our @desc_cache;
    return @desc_cache if @desc_cache;
    my @desc;
    foreach my $dir (split(':', $ENV{KVLDB} || '/home/databases')) {
	push @desc, desc_read_cache_file($dir) if -r "$dir/DESC.cache";
    }
    @desc = sort { $b->{'REV'} <=> $a->{'REV'} } @desc;
    @desc_cache = @desc;
    return @desc;
}

# Example: desc_match('HELLO' => ['1', '2'], 'WORLD' => ['8'])
sub desc_match {
    my %match = @_;
    my @desc = desc_read_db;
    if (%match) {
	return grep {
	    my $desc = $_;
	    my $ok = 1;
	    foreach my $key (keys %match) {
		if (@{$match{$key}}) {
		    unless (exists $desc->{$key}
			    and grep { $_ eq $desc->{$key} } @{$match{$key}}) {
			$ok = 0;
			last;
		    }
		}
		elsif (exists $desc->{$key}) {
		    $ok = 0;
		    last;
		}
	    }
	    $ok;
	} @desc;
    }
    else {
	return @desc;
    }
}

sub desc_opt {
    my $default_match = shift;
    my %match;
    foreach my $opt (@_) {
	if ($opt =~ /^(\w+)=(.*)/) {
	    exists $match{$1} or $match{$1} = [];
	    push @{$match{$1}}, $2;
	}
	elsif ($opt =~ /^(\w+)!$/) {
	    $match{$1} = [];
	}
	else {
	    exists $match{'NAME'} or $match{'NAME'} = ();
	    push @{$match{'NAME'}}, $opt;
	}
    }
    if (defined $default_match) {
	while (my ($name, $values) = each %{$default_match}) {
	    $match{$name} = $values unless exists $match{$name};
	}
    }
    return desc_match(%match);
}

sub desc_format {
    my %desc;
    if (@_ == 1 && ref $_[0]) {
	%desc = %{$_[0]};
    }
    else {
	%desc = @_;
    }
    my $m = 0;
    foreach (keys %desc) {
	my $n = length $_;
	$m = $n if $n > $m;
    }
    my $s = '';
    foreach my $key (sort keys %desc) {
	$s .= sprintf "%-${m}s : %s\n", $key, $desc{$key};
    }
    return "$s\n";
}

sub desc_write {
    use FileHandle;
    my $file = strip_suffix(shift, '.gz.desc', '.desc', '.gz') . '.desc';
    my %desc = @_;
    -e $file && die "Description already exists: $file";
    my $fh = new FileHandle $file, '>';
    print $fh desc_format(%desc);
}

sub desc_list_full {
    my $default_match = shift;
    my @desc = desc_opt($default_match, @_);
    foreach my $desc (@desc) {
	print desc_format(%{$desc});
    }
}

sub desc_list_overview {
    my $all = shift;
    my $name_keys = shift;
    my $rev_keys = shift;
    my $default_match = shift;
    my @desc = desc_opt($default_match, @_);
    my @key = (@{$name_keys}, @{$rev_keys});
    my %w;

    unless ($all) {
	# Don't show hidden descriptions.
	@desc = grep { not (exists $_->{HIDDEN}
			    and $_->{HIDDEN}) } @desc;
	# Only show the latest revision descriptions.
	my %count;
	@desc = grep {
	    my $desc = $_;
	    my $r = join('/', map { $desc->{$_} } @{$name_keys});
	    $count{$r}++;
	    $count{$r} == 1;
	} (sort { $b->{REV} <=> $a->{REV} } @desc);
    }
    # Sort according to keys.
    @desc = sort {
	my $cmp;
	foreach (@key) {
	    if ($a->{$_} =~ /^\d+$/ && $b->{$_} =~ /^\d+$/) {
		$cmp = $a->{$_} <=> $b->{$_};
	    }
	    else {
		$cmp = $a->{$_} cmp $b->{$_};
	    }
	    last if $cmp;
	}
	$cmp;
    } @desc;

    # Find the widths of the header keys.
    foreach (@key) {
	$w{$_} = length $_;
    }
    # Find the maximum widths of the fields.
    foreach my $desc (@desc) {
	foreach my $key (@key) {
	    my $w = exists $desc->{$key} ? length($desc->{$key}) : 0;
	    $w{$key} = $w if $w > $w{$key};
	}
    }
    # Print the header keys.
    print join(' ', map { sprintf('%-' . $w{$_} . 's', $_) } @key), "\n";
    # Print underlines.
    print join(' ', map { '=' x length($_) 
			  . ' ' x ($w{$_} - length($_))} @key), "\n";
    # Print all the fields.
    my %count;
    foreach my $desc (@desc) {
        my $rec = join(' ', map { exists $desc->{$_}
				  ? sprintf('%-' . $w{$_} . 's',
					    $desc->{$_})
				  : ' ' x $w{$_} } @key);
	$rec =~ s/\s*$//;
	print "$rec\n";
    }
}
