#!/usr/local/bin/perl -w


#
# file aptsplice.pl -- caount the number of tags/gene and relate
# to the tag/transcript abundance
#

use strict;
my $file = shift or die "Usage: ./altsplice.pl input_file\n";
die "File $file not found\n" unless -e $file;

my $count   = 0;

# initialize a hash reference
my $tags    = {};

# load data from external files
my @file    = `grep 'coding_RNA' $file`;
my $partial = `cat NlaIII_partial`;
my $polyA   = `cat possible_polyA`;
my @introns = `cat intron_hits`;

# add introns to the other tags
push @file, @introns;


# iterate through the list
for (@file) {
    # split the line into 'words'
    my @line  = split;
    
    # this is what a line would typically look like:
    # count tag source pos'n strand gene locus descrption
    
    # assign some variables
    my $freq  = $line[0];
    my $tag   = $line[1];
    my $pos   = $line[3];
    my $gene  = $line[5];
  
    # if no gene, it's an intron hit -- find the gene name 
    # at the end of the line
    if ( !$gene ) {
        ($gene) = /(\S+)"$/; 
    }

    # remove alternatice splice suffix
    $gene =~ s/[a-z]$//;
    
    # intialize an array reference for the gene if we do not have one
    $tags->{$gene} ||= [];

    # skip tags that may be due to internal polyA's or
    # NlaIII partial digestion
    next if reject( $gene, $tag );
    
    # add the tag's count to the array
    push @{$tags->{$gene}}, $freq;
}

# initialize some hashes we will need
my (%count, %total);


for ( sort keys %{$tags} ) {
    
    # get the list of tag counts for this gene 
    my @tags = @{$tags->{$_}};    
    
    # set the counter to zero
    my $sum  = 0;   

    # add up all the tag counts
    for my $f (@tags) {
       $sum += $f;
    }

    # count the number of genes in each *number of tags) category
    for my $num ( 1..10 ) {
       $count{$num}++ and $total{$num} += $sum if @tags == $num;    
    }  
    $count{11}++ and $total{11} += $sum if @tags   >  10;
}

print "\nLibrary $file: ", scalar( keys %{$tags} ), " genes detected\n";
print "\nTranscripts\tGenes\tAbundance (average)\n";

for ( 1..11 ) {
    next unless $total{$_} ;
    my $average = int( $total{$_}/$count{$_} + 0.5 );
    my $transcripts = $_ == 11 ? '>10' : $_;
    print "$transcripts\t\t$count{$_}\t\t$average\n"; 
}


# this subroutine evaluated each gene/tag pair to see if
# they were previosuly identified as potential artifacts
sub reject {
    my ($gene, $tag) = @_;
    
    # are tag and gene in the partial digest file?
    return 1 if $partial =~ /^${gene}[a-z]?\s+$tag/m;
    
    # are tag and gene in the intternal polyA file?
    return 1 if $polyA   =~ /^${gene}[a-z]?\s+$tag/m;
    
    return 0;
}