lecture code viewer
downloads

Code
First Look at Perl
First Look at Perl
Sheldon McKay
#!/usr/local/bin/perl -w
use strict;
my $file = shift or die "Usage: ./partial.pl input_file\n";
die "File $file not found\n" unless -e $file;
my @file = sort_by_position( $file );
my ( $count, %seen, %seenpos1, $maybepartial, %pos1_freq );
for (@file) {
my @field = split;
my $gene = $field[5];
my $pos = $field[3];
my $tcount = $field[0];
my $tag = $field[1];
$gene =~ s/[a-z]$//;
$count++ unless $seen{$gene};
$seen{$gene} = 1;
$seenpos1{$gene} ||= $pos;
$pos1_freq{$gene} ||= $tcount;
if ( $pos == ( $seenpos1{$gene} + 1 ) &&
$tcount <= ( $pos1_freq{$gene}/10 ) ) {
$maybepartial++;
print "$gene\t$tag\t$pos\t$tcount\n";
}
}
$file =~ s/\.txt//;
warn "\n\n$file has $count unambiguously mapped genes\n",
"There were a total of ", scalar( @file ), " tags mapped\n",
"There were $maybepartial possible NlaIII partials\n\n";
sub sort_by_position {
my $lib = shift;
my @file = `grep coding_RNA $lib`;
return map { $_->[1] }
sort { $a->[0] <=> $b->[0] }
map { [ (split)[3], $_ ] }
@file;
}
__END__
# The code below is the same as above with comments added
use strict;
# get the input filename
my $file = shift
# or die with a complaint about not getting one
or die "Usage: ./partial.pl input_file\n";
# die if the input file does not exist
die "File $file not found\n" unless -e $file;
# call the sort_by_position subroutine to process
# the input file and return its contents as an array
my @file = sort_by_position( $file );
# initialize some variables that will be used later
my ( $count, %seen, %seenpos1, $maybepartial, %pos1_freq );
# Iterate through each element in the array. Each line in the file
# is one array element. Unless otherwise specified, array elements
# are assigned to the scalar variable '$_'
for (@file) {
# split $_ on whitespace (default bahavior if 'split' is called with no arguments)
my @field = split;
# assign some variables based on their position in the line of text
# indexing is zero-based, so count would be position 0
#
# count Tag Source Pos Str Gene Locus Description
# 7548 GGATTCGGTC coding_RNA 1 + F25H2.10 rpa-0 "deoxyribonuclease"
my $gene = $field[5];
my $pos = $field[3];
my $tcount = $field[0];
my $tag = $field[1];
# C. elegans alternative transcripts are listed as gene-name + a-z
# eg: C05D2.1a, C05D2.1b, etc. The following line will remove the trailing
# letter, so we know we are dealing with the same gene
$gene =~ s/[a-z]$//;
# Increment the gene counter unless we have seen this gene before
$count++ unless $seen{$gene};
########################################################################
# The first time we encounter a gene: #
# 1) save a record of having seen this particular gene #
$seen{$gene} = 1; #
# #
# 2) save a record of seeing the lowest position match (recall we did #
# an ascending sort based on tag position, so the first one we see is #
# the the lowest position match for this gene) #
$seenpos1{$gene} ||= $pos; #
# #
# 3) save a record of the number of times this tag was observed #
$pos1_freq{$gene} ||= $tcount; #
########################################################################
# We've been through this loop a few times, now we check to see if:
if (
# the gene we are considering has been seen before and
# the current tag match in is the second-lowest position
$pos == ( $seenpos1{$gene} + 1 )
# -AND-
&&
# the number of counts for this tag are at least 10-fold
# lower than the lowest position match
$tcount <= ( $pos1_freq{$gene}/10 ) ) {
# If the above two conditions are satisfied:
# increment the partial digest counter
$maybepartial++;
# print a record of the tag in question to STDOUT
print "$gene\t$tag\t$pos\t$tcount\n";
}
}
$file =~ s/\.txt//;
# print a summary to STDERR
warn
# scalars ($file) and escape characters (\n) are interpolated inside of ""
"\n\n$file has $count unambiguously mapped genes\n",
"There were a total of ",
# count of lines in the input file (not inperpolated inside of "")
scalar @file,
# more text
" tags mapped\n",
"There were $maybepartial possible NlaIII partials\n\n";
# this subroutine loads the input file and sorts the line order
sub sort_by_position {
# get the filename (passed as an argument)
my $lib = shift;
# load the file, but first use command-line grep to grab only
# lines that contain the keyword 'coding_RNA', which is what we
# are interested in here. The backtick (``) operator runs a shell
# command and returns whatever is printed to STDOUT. Whatever is
# happening inside of `` is outside of this script
my @file = `grep coding_RNA $lib`;
# This is a bit of magic called a Schwartzian Transform. It allows us
# to sort a list of text elements based on a precomputed field (in this case,
# the tag position that is embedded in the middle of the text):
# tells the subroutine to return the following and exit...
return
# Step 3 -- the sorted array
# returns a list of the second elements in a list of anonymous arrays that contain
# the precomuted field and the original text.
# eg [ tag_position, line of original text from file ]
# Each anonymous array is pointed to by the array reference ($_).
# Array elements are pulled from the array reference with the dereferencing
# operator '->'. Huh?? -- This is actually the last part of the transform
# which works from the bottum upward
map { $_->[1] }
# Step 2 -- a custom sort function
sort {
# $a->[0] is the first element of the anonymous array referred to by $a
# $b->[0] is the first element of the anonymous array referred to by $b
# this is what were are sorting on.
# the <=> operator returns:
# -1 if $a->[0] > $b->[0] ($a goes before $b)
# 0 if $a->[0] = $b->[0] (not sorting, they are equal)
# 1 if $a->[0] < $b->[0] ($b goes before $a)
$a->[0] <=> $b->[0]
}
# make an array of array references to sort ( a meta-array, of you will).
map {
# the [] operator creates a reference to an anonyous array whose elements are listed
# like so [ 'thing1', 'thing2', 'cat', 'hat' ]
[ # start of anonymous array
# this expression does the following:
# 1) split the default variable $_ (line of text from @file) on whitespace
# 2) return the fourth 'word' from the line of text
(split)[3],
# this is the line of text being considered
$_
# end of the anonymous array
]
# end of the map function
}
# the array we are sorting (where the lines of text are coming from)
@file;
}
|
1 |
First Look at Perl |
0.1.0.1.1
0.1.0.1.1a.p1 |
Processing C Elegans Data
|
Sheldon McKay
|
ppt
0.1.0.1.1a.a1 |
Processing C Elegans Data
|
Sheldon McKay
|
pdf
0.1.0.1.1b.p2 |
Fetching Web Data and Making Graphs
|
Martin Krzywinski
|
ppt
0.1.0.1.1b.a2 |
Fetching Web Data and Making Graphs
|
Martin Krzywinski
|
pdf
0.1.0.1.1.c1 |
altsplice.pl
|
Sheldon McKay
|
code
0.1.0.1.1.c2 |
grabdata
|
Sheldon McKay
|
code
0.1.0.1.1.c3 |
partial.pl
|
Sheldon McKay
|
code
0.1.0.1.1.c4 |
tagger.pl
|
Sheldon McKay
|
code
0.1.0.1.1a.d1 |
First Look at Perl
|
Sheldon McKay
|
data
0.1.0.1.1.d2 |
First Look at Perl
|
Sheldon McKay
|
data
0.1.0.1.1.d3 |
First Look at Perl
|
Sheldon McKay
|
data
0.1.0.1.1.d4 |
First Look at Perl
|
Sheldon McKay
|
data
0.1.0.1.1.d5 |
First Look at Perl
|
Sheldon McKay
|
data
0.1.0.1.1.d6 |
First Look at Perl
|
Sheldon McKay
|
data
0.1.0.1.1.d7 |
First Look at Perl
|
Sheldon McKay
|
data
0.1.0.1.1.d8 |
First Look at Perl
|
Sheldon McKay
|
data
0.1.0.1.1.i1 |
First Look at Perl
|
Sheldon McKay
|
images
0.1.0.1.1.i2 |
First Look at Perl
|
Sheldon McKay
|
images
0.1.0.1.1.i3 |
First Look at Perl
|
Sheldon McKay
|
images
0.1.0.1.1.i4 |
First Look at Perl
|
Sheldon McKay
|
images
0.1.0.1.1.i5 |
First Look at Perl
|
Sheldon McKay
|
images
0.1.0.1.1a.s1 |
Processing C Elegans Data
|
Sheldon McKay
|
slides
0.1.0.1.1b.s1 |
Fetching Web Data and Making Graphs
|
Martin Krzywinski
|
slides
|