#!/usr/local/bin/perl -w

###########################################################################
# This script takes a sequence (in FASTA or RAW format) 
# BLASTS the sequence against the DBEST database
# Returns the top 10 results that have an E Value better than .0001
# It then looks up the accession numbers to the BLAST Results
# against the UNIGENE Data file
# It returns the best hit 
# If there are no hits it returns nothing
# If there were more than one Unigene Cluster returned from the top 10 est
# hits a * is tacked on to the best hit.
###########################################################################
# Script requires the Hs.data file from Unigene
# ftp://ftp.ncbi.nlm.nih.gov/repository/UniGene/
# Script also requires NHGRI::Blastall
# ftp://ftp.nhgri.nih.gov/pub/software/blastall/
###########################################################################

use NHGRI::Blastall;
use strict;
use vars qw($VERSION $HS_DATA_FILE %UNIGENE_HASH %HITS);

$VERSION = 0.01;
$HS_DATA_FILE = './Hs.data';
%UNIGENE_HASH = ();
%HITS = ();

MAIN: {
    my $seq_file = $ARGV[0] || die "usage: $0 SEQFILE\n";
    my $b = new NHGRI::Blastall;

    $b->blastall( {'p' => 'blastn',
                   'a' => 4,
                   'e' => '.0001',
                   'b' => 10,
                   'd' => 'est', 
                   'i' => $seq_file } );

    my @ids = $b->result('id');
    read_unigene_into_memory();

    foreach my $id (@ids) {
        $id =~ m/^[^\|]*\|([^\|]*)\|/;
        my $accession = $1;
        $HITS{$UNIGENE_HASH{$accession}}++ if ($UNIGENE_HASH{$accession});
    }

    print_stats();
}

sub print_stats {
    my @hits = sort {$HITS{$b} <=> $HITS{$a} } keys %HITS;

    if (@hits > 1) {
        print "$hits[0]*\n";    
    } elsif (@hits == 1) {
        print "$hits[0]\n";    
    }
}

sub read_unigene_into_memory {
    my $current_cluster_id;
    open IN, $HS_DATA_FILE or die "cannot open $HS_DATA_FILE:$!";

    while (<IN>) {
        chomp;
        if (/^ID\s+(.*)$/) {
            $current_cluster_id = $1;
        } elsif (/^SEQUENCE\s+ACC=(\S+);/) {
            $UNIGENE_HASH{$1} = $current_cluster_id;
        }
    }
}
.
