#!/usr/local/bin/perl -w


# file partial.pl -- Estimate how many observed SAGE tags could be
# due to partial digestion with the NlaIII anchoring enzyme


#
# The raw code -- see below for detailed comments
#

use strict;

my $file = shift or die "Usage: ./partial.pl input_file\n";
die "File $file not found\n" unless -e $file;
my @file = sort_by_position( $file );

my ( $count, %seen, %seenpos1, $maybepartial, %pos1_freq );

for (@file) {
    my @field  = split;
    my $gene   = $field[5];
    my $pos    = $field[3];
    my $tcount = $field[0];
    my $tag    = $field[1];
    
    $gene =~ s/[a-z]$//;
    $count++ unless $seen{$gene};
    $seen{$gene} = 1;
    $seenpos1{$gene}  ||= $pos;
    $pos1_freq{$gene} ||= $tcount;
    
    if ( $pos == ( $seenpos1{$gene} + 1 ) && 
         $tcount <= ( $pos1_freq{$gene}/10 ) ) {
	$maybepartial++;
	print "$gene\t$tag\t$pos\t$tcount\n";
    }
}    


$file =~ s/\.txt//;
 
warn "\n\n$file has $count unambiguously mapped genes\n",
     "There were a total of ", scalar( @file ), " tags mapped\n",
     "There were $maybepartial possible NlaIII partials\n\n";


sub sort_by_position {
    my $lib = shift;
    my @file = `grep coding_RNA $lib`;
    
    return map  { $_->[1] }
           sort { $a->[0] <=> $b->[0] }
           map  { [ (split)[3], $_ ] }
           @file;
}


__END__

# The code below is the same as above with comments added

use strict;

# get the input filename
my $file = shift 

# or die with a complaint about not getting one
or die "Usage: ./partial.pl input_file\n";

# die if the input file does not exist
die "File $file not found\n" unless -e $file;

# call the sort_by_position subroutine to process
# the input file and return its contents as an array
my @file = sort_by_position( $file );

# initialize some variables that will be used later
my ( $count, %seen, %seenpos1, $maybepartial, %pos1_freq );

# Iterate through each element in the array. Each line in the file
# is one array element.  Unless otherwise specified, array elements 
# are assigned to the scalar variable '$_'
for (@file) {

    # split $_ on whitespace (default bahavior if 'split' is called with no arguments)
    my @field  = split;
    
    # assign some variables based on their position in the line of text 
    # indexing is zero-based, so count would be position 0
    #
    # count     Tag            Source     Pos Str    Gene    Locus     Description
    # 7548    GGATTCGGTC      coding_RNA   1   +   F25H2.10  rpa-0   "deoxyribonuclease"
    my $gene   = $field[5];
    my $pos    = $field[3];
    my $tcount = $field[0];
    my $tag    = $field[1];

    
    # C. elegans alternative transcripts are listed as gene-name + a-z
    # eg: C05D2.1a, C05D2.1b, etc.  The following line will remove the trailing
    # letter, so we know we are dealing with the same gene
    $gene =~ s/[a-z]$//;
    
    # Increment the gene counter unless we have seen this gene before
    $count++ unless $seen{$gene};
    
    ########################################################################
    # The first time we encounter a gene:                                  #
    # 1) save a record of having seen this particular gene                 #
    $seen{$gene} = 1;                                                      #
    #                                                                      #
    # 2) save a record of seeing the lowest position match (recall we did  #
    # an ascending sort based on tag position, so the first one we see is  #
    # the the lowest position match for this gene)                         #
    $seenpos1{$gene}  ||= $pos;                                            #
    #                                                                      #
    # 3) save a record of the number of times this tag was observed        #
    $pos1_freq{$gene} ||= $tcount;                                         # 
    ########################################################################
    
    # We've been through this loop a few times, now we check to see if:
    if ( 
         # the gene we are considering has been seen before and 
	 # the current tag match in is the second-lowest position
	 $pos == ( $seenpos1{$gene} + 1 )
	 
	 # -AND-
	 &&
	 
	 # the number of counts for this tag are at least 10-fold
	 # lower than the lowest position match
         $tcount <= ( $pos1_freq{$gene}/10 )  ) {
        
	
	# If the above two conditions are satisfied:
	
	# increment the partial digest counter
	$maybepartial++;
	
        # print a record of the tag in question to STDOUT
	print "$gene\t$tag\t$pos\t$tcount\n";
    }
}


$file =~ s/\.txt//;

# print a summary to STDERR
warn 
     # scalars ($file) and escape characters (\n) are interpolated inside of ""
     "\n\n$file has $count unambiguously mapped genes\n",
     "There were a total of ", 
     
     # count of lines in the input file (not inperpolated inside of "")
     scalar @file, 
     
     # more text
     " tags mapped\n",
     "There were $maybepartial possible NlaIII partials\n\n";


# this subroutine loads the input file and sorts the line order
sub sort_by_position {
    
    # get the filename (passed as an argument)
    my $lib = shift;
    
    # load the file, but first use command-line grep to grab only
    # lines that contain the keyword 'coding_RNA', which is what we
    # are interested in here.  The backtick (``) operator runs a shell
    # command and returns whatever is printed to STDOUT.  Whatever is
    # happening inside of `` is outside of this script
    my @file = `grep coding_RNA $lib`;

    # This is a bit of magic called a Schwartzian Transform.  It allows us
    # to sort a list of text elements based on a precomputed field (in this case,
    # the tag position that is embedded in the middle of the text):
    
    # tells the subroutine to return the following and exit...
    return 
    
    # Step 3 -- the sorted array
    # returns a list of the second elements in a list of  anonymous arrays that contain 
    # the precomuted field and the original text.  
    # eg [ tag_position, line of original text from file ]
    # Each anonymous array is pointed to by the array reference ($_).  
    # Array elements are pulled from the array reference with the dereferencing
    # operator '->'.  Huh?? -- This is actually the last part of the transform 
    # which works from the bottum upward
    map  { $_->[1] }
    
    # Step 2 -- a custom sort function 
    sort { 
           # $a->[0] is the first element of the anonymous array referred to by $a
	   # $b->[0] is the first element of the anonymous array referred to by $b
	   # this is what were are sorting on.
           # the <=> operator returns: 
	   # -1 if $a->[0] > $b->[0] ($a goes before $b)
	   #  0 if $a->[0] = $b->[0] (not sorting, they are equal)
           #  1 if $a->[0] < $b->[0] ($b goes before $a)   
	   $a->[0] <=> $b->[0] 
    }
          
    # make an array of array references to sort ( a meta-array, of you will).  
    map  { 
    
    # the [] operator creates a reference to an anonyous array whose elements are listed 
    # like so [ 'thing1', 'thing2', 'cat', 'hat' ]
    
    [ # start of anonymous array 
    
    # this expression does the following:
    # 1) split the default variable $_ (line of text from @file) on whitespace
    # 2) return the fourth 'word' from the line of text
    (split)[3], 
    
    # this is the line of text being considered
    $_ 
    
    # end of the anonymous array 
    ] 
    
    # end of the map function 
    } 
    
    # the array we are sorting (where the lines of text are coming from)
    @file;
    
}