A 2- or 4-day practical mini-course in Circos, command-line parsing and scripting. This material is part of the Bioinformatics and Genome Analysis course held at the Institut Pasteur Tunis.
BCGA 2018 | 1-day Circos course | Circos documentation best practices getting started | Brewer palette swatches | Color resources | Nature Methods Points of View Points of Significance
This page lists all the scripts from all lectures.
When you untar the material, these scripts should be symbolic links back to their corresponding day and lecture directories, for example
sessions/scripts/parsemaf -> ../day.4/lecture.3/1/parsemaf
and some of the Perl scripts may have accompanying .conf files such as
sessions/day.2/lecture.1/2/colorinterpolate.conf
These are automatically loaded by the script, as long as these files are in the same directory as the script or in ../etc or etc/ relative to the script.
#!/bin/bash
# Bin the Circos link file into windows and count link ends per window.
# Path to the Circos tools checkout — adjust this for your own setup.
CTOOLS=/home/martink/work/circos/svn/tools/
# report a count of the number of links (-num) in 250 base windows (-bin_size 250)
# counting both ends of the link (-link_end 2)
cat ../../lecture.2/1/data/links.txt | $CTOOLS/binlinks/bin/binlinks -bin_size 250 -num -link_end 2
# This is a boilerplate Perl script for you to edit to build up a clustal parser
#
# Run it first as
#
# ./clustalparser -h
#
# and then using the alignment file
#
# cat ../../data/alignment.aln | ./clustalparser
# Read the alignment from the file given by -in (stored in %CONF by the
# script's configuration handling) or from STDIN; read_file() is defined below.
my $data = read_file($CONF{in});
# dump the data structure we created from the
# alignments to STDOUT
printdumper($data);
# Continue with the script here
# ...
# Convert a list of numbers into a
# list of contiguous ranges defined by [min,max]
#
# [ [min1,max1], [min2,max2], ...]
#
# e.g. (5,6,7,10,11) -> [ [5,7], [10,11] ]
#
# BUG FIX: the previous version pushed list *indexes* onto the range
# list (in both of its two implementations), so for the input above it
# returned [ [0,2], [3,4] ] instead of the documented value ranges.
sub list2ranges {
    # sort numerically — input order does not matter
    my @sorted = sort { $a <=> $b } @_;
    my $ranges = [];
    for my $value (@sorted) {
        if (@$ranges && $value - $ranges->[-1][1] <= 1) {
            # value is adjacent to (or repeats) the current range — extend it
            $ranges->[-1][1] = $value;
        } else {
            # gap larger than 1 — start a new single-value range
            push @$ranges, [ $value, $value ];
        }
    }
    return $ranges;
}
# Read the alignment file. For now, all it does
# is read the alignment lines and store them
# as a list of hashes, one per line, each with
# the sequence id, sequence and cumulative count.
sub read_file {
    my ($file) = @_;
    my $fh = get_handle($file);
    my $records;
    while (my $line = <$fh>) {
        # Skip the very first line, which is a header. This is one of
        # those rare times that a special variable other than $_ or @_
        # is useful: $. holds the line number of the handle being read.
        next if $. == 1;
        # skip comments and blank lines
        next if $line =~ /^\s*\#/ || $line =~ /^\s*$/;
        # trim the trailing newline
        chomp $line;
        # split on any whitespace into the three fields of the line
        my ($id, $seq, $cumul) = split " ", $line;
        # collect the fields as a hash and append it to the record list
        push @$records, { id => $id, seq => $seq, cumul => $cumul };
    }
    return $records;
}
# Interpolate between the -start and -end colors in the working color
# space (-calcspace), producing -steps+1 colors reported in -space.
my $c1 = color2space([split(",",$CONF{start})],$CONF{space},$CONF{calcspace});
my $c2 = color2space([split(",",$CONF{end})],$CONF{space},$CONF{calcspace});
my $interpolated;
# Lab coordinates of the previous step — used to compute deltaE below
my $cintlabprev;
for my $i (0..$CONF{steps}) {
# interpolate each of the three color components independently
my $cint = [interpvalue($c1->[0],$c2->[0],$i,$CONF{steps}),
interpvalue($c1->[1],$c2->[1],$i,$CONF{steps}),
interpvalue($c1->[2],$c2->[2],$i,$CONF{steps})];
my $cout = color2space($cint,$CONF{calcspace},$CONF{space});
my $cintlab = color2space($cint,$CONF{calcspace},"lab");
# deltaE: Euclidean distance in Lab space from the previous step
# (0 for the very first color)
my $de = $cintlabprev ? sqrt( ($cintlab->[0]-$cintlabprev->[0])**2 +
($cintlab->[1]-$cintlabprev->[1])**2 +
($cintlab->[2]-$cintlabprev->[2])**2) : 0;
# round list-valued components to -nearest; non-list output (e.g. a
# hex string) is stored as-is
push @$interpolated, {components=> ref $cout eq "ARRAY" ? [ map { nearest($CONF{nearest},$_) } @$cout ] : $cout,
i=>$i,
de=>$de,
lab=> $cintlab,
name=>sprintf("%s%03d",$CONF{rootname},$i)};
$cintlabprev = $cintlab;
}
# Report each color in Circos color-definition format, together with
# its RGB/hex equivalents, step index, deltaE and Lab coordinates.
for my $c (@$interpolated) {
printinfo(sprintf("%s = %9s # rgb %9s rgbhex %s idx %3d deltaE %6.2f lab(%s,%s,%s)",
$c->{name},
ref $c->{components} eq "ARRAY" ? sprintf("%d,%d,%d",@{$c->{components}}) : $c->{components},
sprintf("%d,%d,%d",lab2rgb(@{$c->{lab}})),
rgb2hex(lab2rgb(@{$c->{lab}})),
$c->{i},
nearest(0.01,$c->{de}),
(map { nearest(0.2,$_) } @{$c->{lab}})));
}
# Convert a color between two color spaces using Graphics::ColorObject.
# The constructor/accessor method names (e.g. new_RGB / as_HSV) are
# looked up from the configured space-name table $CONF{fn}.
sub color2space {
    my ($components, $from, $to) = @_;
    my $constructor = "new_" . $CONF{fn}{$from};
    my $accessor    = "as_"  . $CONF{fn}{$to};
    # hex input is a single string, all other spaces take a component list
    my $color = $from =~ /hex/
        ? Graphics::ColorObject->$constructor($components->[0])
        : Graphics::ColorObject->$constructor($components);
    my $converted = $color->$accessor();
    printdebug("converting",@$components,"from",$from,"to",$to,item2list($converted));
    return $converted;
}
# Linearly interpolate between $from and $to at step $step of $steps.
# Step 0 returns $from, step $steps returns $to.
sub interpvalue {
    my ($from, $to, $step, $steps) = @_;
    return $from + $step * ($to - $from) / $steps;
}
# Flatten an array reference into its elements; pass any other
# value through unchanged (as a one-element list in list context).
sub item2list {
    my ($item) = @_;
    return @$item if ref $item eq "ARRAY";
    return $item;
}
# Convert an RGB triplet to HSV via Graphics::ColorObject.
# NOTE(review): assumes components are in the range new_RGB() expects
# (0-1) — confirm against callers. Also echoes input and output via
# printinfo(); looks like a debugging aid.
sub rgb2hsv {
    my @rgb = @_;
    my $color = Graphics::ColorObject->new_RGB([@rgb]);
    my @hsv = @{ $color->as_HSV() };
    printinfo(@rgb, @hsv);
    return @hsv;
}
# Convert an HSV triplet to RGB via Graphics::ColorObject.
sub hsv2rgb {
    my @hsv = @_;
    my $color = Graphics::ColorObject->new_HSV([@hsv]);
    return @{ $color->as_RGB() };
}
# Convert a 0-255 RGB triplet to its hex string via Graphics::ColorObject.
sub rgb2hex {
    my @rgb255 = @_;
    my $color = Graphics::ColorObject->new_RGB255([@rgb255]);
    return $color->as_RGBhex();
}
# Convert a Lab triplet to 0-255 RGB via Graphics::ColorObject.
sub lab2rgb {
    my @lab = @_;
    my $color = Graphics::ColorObject->new_Lab([@lab]);
    return @{ $color->as_RGB255() };
}
#!/bin/bash
# Reformat the UCSC ebola gene and variation tables into Circos data files.
# extract genes
# NOTE(review): columns 2,5,6,9 are assumed to be name/start/end/extra —
# confirm against the header of ebola.genes.txt
grep -v \# ../1/ebola.genes.txt | cut -f 2,5,6,9 | \
awk '{print "ebola",$2,$3,$4,"name="$1}' > ebola.genes.circos.txt
# extract SNPs
# the sed strips a WORD:WORD token; remaining fields become position,
# value and snp=/gene= annotations
grep -v \# ../1/ebola.variation.txt | cut -f 2,3,4,10,15 | \
perl -pe 's/\w+:\w+//' | \
awk '{print "ebola",$1,$2,$5,"snp="$3",gene="$4}' > ebola.variation.circos.txt
#!/bin/bash
# Generate random Circos data tracks and verify their distribution.
# Path to the Circos tools checkout — adjust this for your own setup.
CTOOLS=/home/martink/work/circos/svn/tools/
# Generate some random data on the human chromosome.
#
# To list predefined rules that define data position and value, see
#
# $CTOOLS/randomdata/bin/randomdata -listrules
#
# To understand how the rules work, see the script's manpage
#
# $CTOOLS/randomdata/bin/randomdata -man
# Make 5 random data sets based on default rule (10 mb bins, standard normal distribution).
for i in `seq 1 5` ; do
$CTOOLS/randomdata/bin/randomdata -karyotype karyotype.human.txt -ruleset default > random.$i.txt
done
# Convince yourself that the values are indeed standard normal (mean 0, sd 1)
# by histogramming column 4 (the value) of all five files together.
cat random.*.txt| cut -d " " -f 4 | ../../../scripts/histogram > histogram.txt
# read the values from STDIN and use the -col COLUMN, by default 0
my @values;
while(my $line = <>) {
chomp $line;
# skip comment lines
next if $line =~ /^\#/;
# split on any whitespace
my @tok = split(" ",$line);
# die if the requested column is missing on this line
if(defined $CONF{col} && ! defined $tok[ $CONF{col} ]) {
die "The line [$line] has no number in column [$CONF{col}]";
}
# NOTE(review): when -col is not given, $CONF{col} is undef and
# $tok[ $CONF{col} ] indexes column 0 (with a warning under
# 'use warnings') — presumably the intended default; confirm.
push @values, $tok[ $CONF{col}];
}
# get min and max of values
my ($min,$max) = minmax(@values);
# change min/max range if redefined via -min and/or -max
$min = $CONF{min} if defined $CONF{min};# && $CONF{min} > $min;
$max = $CONF{max} if defined $CONF{max};# && $CONF{max} < $max;
# *very* quick and dirty percentile if -pmin or -pmax used
# (nearest-rank on the sorted list, no interpolation)
if($CONF{pmin} || $CONF{pmax}) {
my @sorted = sort {$a <=> $b} @values;
$min = $sorted[ $CONF{pmin}/100 * (@sorted-1) ] if $CONF{pmin};
$max = $sorted[ $CONF{pmax}/100 * (@sorted-1) ] if $CONF{pmax};
}
# determine bin size from either the -binsize command-line parameter
# or from -nbins. At least one of these is defined — we made sure of
# that in validateconfiguration()
my $bin_size;
if($CONF{binsize}) {
$bin_size = $CONF{binsize};
# derive the bin count from the fixed bin size
$CONF{nbins} = round(($max-$min)/$CONF{binsize});
} else {
$bin_size = ($max-$min)/$CONF{nbins};
}
# populate bins
# Each bin is [start,end) except the last, which is [start,end] so that
# the maximum value is counted.
my $bins;
for my $i (0..$CONF{nbins}-1) {
my $start = $min + $i * $bin_size;
my $end = $min + ($i+1) * $bin_size;
# grep in scalar context counts the values that fall in this bin
my $n = grep($_ >= $start && ($_ < $end || ( $i == $CONF{nbins}-1 && $_ <= $end)), @values);
push @$bins, { start=>$start,
i=>$i,
end=>$end,
size=>$bin_size,
n=>$n };
}
# Report the histogram: one line per bin with an ASCII bar, flanked by
# underflow/overflow counts, followed by aggregate statistics.
my $histogram_width = $CONF{height};
my $maxn = max(map { $_->{n} } @$bins);
my $sumn = sum(map { $_->{n} } @$bins);
my $cumuln = 0;
# width of the widest bin count, used to build the count format string
my $bin_count_width = max(map { length($_->{n}) } @$bins);
my $bin_count_fmt = sprintf("%%%dd",2+$bin_count_width);
# NOTE(review): max of n/maxn is 1 by construction — presumably kept
# for the -sqrt scaling below; confirm.
my $bin_width_f_max = max(map { $_->{n}/$maxn } @$bins);
# report number and fraction of values smaller than requested minimum
printinfo(sprintf("%10s %10.4f>$bin_count_fmt %6.3f","",
$min,
int(grep($_ < $min, @values)),
int(grep($_ < $min, @values))/@values));
# report each bin: min, max, count, fractional count, cumulative count
for my $bin (@$bins) {
$cumuln += $bin->{n};
my $binwidth = $bin->{n}*$histogram_width/$maxn;
# -sqrt (optionally with a power) compresses the bar lengths
if(defined $CONF{sqrt}) {
my $power = $CONF{sqrt} || 0.5;
my $f = $bin->{n}/$maxn;
$binwidth = $f**$power/$bin_width_f_max**$power * $histogram_width;
}
printinfo(sprintf("%10.4f %10.4f $bin_count_fmt %6.3f %6.3f %-${histogram_width}s",
$bin->{start},
$bin->{end},
$bin->{n},
$bin->{n}/$sumn,
$cumuln/$sumn,
"*"x$binwidth));
}
# report number and fraction of values larger than requested maximum
# NOTE(review): uses a hard-coded %6d here rather than $bin_count_fmt
# as in the underflow line above — harmless, but inconsistent.
printinfo(sprintf("%10s %10.4f<%6d %6.3f","",
$max,
int(grep($_ > $max, @values)),
int(grep($_ > $max, @values))/@values));
# aggregate stats for full data set
printinfo(sprintf("%8s %12d","n",int(@values)));
printinfo(sprintf("%8s %12.5f","average",average(@values)));
printinfo(sprintf("%8s %12.5f","sd",stddev(@values)));
printinfo(sprintf("%8s %12.5f","min",min(@values)));
printinfo(sprintf("%8s %12.5f","max",max(@values)));
printinfo(sprintf("%8s %12.5f","sum",sum(@values)));
#!/bin/bash
# Generate two 10-color ramps from red to blue: one interpolated in Lab
# space (default) and one in HSV space. tee shows the output while also
# writing the Circos color definition files under etc/.
START=255,0,0
END=0,0,255
STEPS=9
./colorinterpolate -steps $STEPS -start $START -end $END -rootname mycolorlab | \
tee etc/mycolors.lab.conf
./colorinterpolate -steps $STEPS -start $START -end $END -calcspace hsv -rootname mycolorhsv | \
tee etc/mycolors.hsv.conf
#!/bin/bash
echo "parsing UCSC assembly table and making karyotype file"
# The last line of the assembly table carries the sequence length in
# column 4; emit a single Circos "chr" karyotype line for the genome.
tail -1 ebola.assembly.txt | cut -f 4 | awk '{print "chr - ebola ebola 0",$1,"black"}' > ebola.karyotype.txt
#!/bin/bash
# Create links from the Zaire_1995 strain to all other strains.
g1=Zaire_1995
# Notice how we're redirecting STDOUT to the link file and the
# STDERR to the karyotype file. This is a very useful idiom to
# generate two independent files at the same time and
# use redirection for maximum flexibility.
cat ../../lecture.2/1/ebola.multiz.txt | ../1/parsemaf -g1 $g1 -karyotype > z.vs.all.txt 2> karyotype.txt
# Generate pairs of random sequences: a reference line followed by a
# copy with SNPs introduced at probability -p per position.
# -seed N sets a random seed so that we can reproduce
# the random output for debugging
srand($CONF{seed}) if defined $CONF{seed};
# -dict ATGC sets the dictionary from which bases are selected
# for example you can add N's with -dict ATGCN
my @dict = split("",$CONF{dict});
# iterate over lines (pairs of sequence)
for my $i (1..$CONF{lines}) {
# this is the "reference" sequence, which is a concatenation
# of $CONF{len} randomly selected characters from the dictionary
my $str = join("",map { $dict[rand(@dict)] } (1..$CONF{len}));
print "$str\n";
# for each position in the reference
for my $j (1..$CONF{len}) {
# make a snp but only if we pass the probability requirement
# the chance of a SNP at given position is $CONF{p}
next unless rand() < $CONF{p};
# this is the reference base at this position
my $ref = substr($str,$j-1,1);
# replace the character at this position with a randomly
# selected character from the dictionary that is not
# the reference base
#
# substr() can be an lvalue too — very useful
substr($str,$j-1,1) = (grep($_ ne $ref, @dict))[rand(@dict-1)];
}
# print the mutated copy, then a blank line to separate pairs
print "$str\n\n";
}
#!/bin/bash
# Turn a clustal multiple alignment into per-pair Circos link files by
# finding runs of positions where each sequence aligns to the reference.
# extract the list of sequence IDs from the alignment file
REF=hsa8K
OUTDIR=parsed
FILE=../../data/alignment.aln
cat $FILE | cut -d " " -f 1 | sort -u | grep -v -i clustal | grep "[a-z]" > $OUTDIR/id.txt
# for each sequence ID, retrieve the sequence and store it with one base per line
for id in `cat $OUTDIR/id.txt`; do
grep $id $FILE | tr -s " " | cut -d " " -f 2 | fold -c1 > $OUTDIR/$id.seq.txt
done
# paste hsa8K together with each other sequence id
# and remove lines with "-" indicating no alignment
# and make a file of all positions (one per line) at
# which bases are aligned
for id in `cat $OUTDIR/id.txt | grep -v $REF` ; do
# convert tabs to a single space, collapse all spaces and
# remove leading spaces and trailing spaces (done via shrinkwrap.sh script)
paste $OUTDIR/$REF.seq.txt $OUTDIR/$id.seq.txt | cat -n | \
./shrinkwrap.sh > $OUTDIR/align.$REF.$id.txt
# The line below is long but none of the steps are particularly complicated
# 1. replace all bases with an X (tr)
# 2. extract runs of lines that have the same fields (uniq), but skipping first field (-f 1)
# and counting the number of runs (-c)
# 3. remove all lines that have gaps (grep)
# 4. write the output to a file but also to STDOUT (tee)
# 5. add the name of the sequence ids to the file and compute start and end of alignment
# based on the start of the alignment and length of runs from uniq
#
# After uniq -c the fields are: $1 = run length, $2 = start position.
# BUG FIX: the end coordinate of the second span was "$2+1-2" (i.e.
# start-1); it must be start + run length - 1, exactly like the first span.
cat $OUTDIR/align.$REF.$id.txt | tr ATGC X | uniq -f 1 -c | \
grep -v - | tee $OUTDIR/runs.$REF.$id.txt | \
awk -v ref=$REF -v id=$id '{print ref,$2,$2+$1-1,id,$2,$2+$1-1}' > $OUTDIR/links.$REF.$id.txt
done
# Make sure you understand everything below all the way to the
# housekeeping section.
# Read about the MAF format at http://www.bx.psu.edu/~dcking/man/maf.xhtml
# This will be our alignment list reference. It's declared as
# a scalar because it will be a reference to a list. While it
# doesn't have to be a reference, get used to using references.
#
# Alternatively, we could have done this
#
# my @alignments;
# ...
# push @alignments, ...
# ...
# $alignments[-1]{strain}{$strain} = ...
#
# Using a list variable directly saves you from having to dereference
# it with @$alignments or using -> to pull out an element.
#
# Last element using @alignments list: $alignments[-1]
# Last element using $alignments as list reference: $alignments->[-1]
#
# We can always make a list reference by
#
# my $alignments = \@alignments;
# Alignment blocks parsed from the MAF stream; a scalar holding a list
# reference (see the discussion above about references).
my $alignments;

# Read from STDIN until end of stream. This way the script expects
# a pipe from another process, like cat
#
# cat .../1/ebola.multiz.txt | ./parsemaf
#
while (my $line = <>) {
    # remove the trailing new line
    chomp $line;
    # a line like "a score=..." starts a new alignment block
    if ($line =~ /^a score=(.+)/) {
        # push a new alignment hash onto the list
        # for now, only the score is defined
        push @$alignments, { score => $1 };
    } elsif ($line =~ /^s /) {
        # this is an individual alignment line and belongs to the
        # current alignment block — the last element of @$alignments
        # chop up the line into fields using any whitespace as delimiter
        my @tok = split(" ", $line);
        # assign fields to individual variables; the leading undef
        # discards the "s" tag. Another way to do this would be
        #
        # my ($strain,$start,$size,$strand,$genome_size) = @tok[1..5];
        my (undef, $strain, $start, $size, $strand, $genome_size) = @tok;
        # BUG FIX: this condition was garbled ("if.\1/)") and did not
        # compile. Per the original intent: if the strain is in the
        # format STR.STR (repeated string joined by a period), throw
        # out everything after the period.
        if ($strain =~ /(.+)\.\1/) {
            $strain =~ s/[.].*//;
        }
        # assign this alignment line to the last (current) alignment
        # block, keyed by strain for convenient lookup later
        $alignments->[-1]{strain}{$strain} =
            { start => $start, size => $size, strain => $strain, genome_size => $genome_size };
    }
}
printdebug("parsed", int(@$alignments), "alignments");
# The parsing above could have been done by changing the
# input record separator in Perl, which is accessible using
# the special variable $/
#
# The default value of this variable depends on the operating
# system. For more details see https://perldoc.perl.org/perlvar.html
#
# I urge you against using these special variables (with the
# exception of $_ and @_) and get in the habit of doing things yourself.
# Your code will be much more legible. Some of these variables
# actually slow down your code.
# Just dump the data structure to STDOUT and quit. This
# helps understand what's happening under the hood.
#
# The alignments are a list. Each list item is a hash of this form
#
# { score => SCORE,
# strain => { STRAIN1 => { start=>START1,size=>SIZE1,strain=>STRAIN1 },
# STRAIN2 => { start=>START2,size=>SIZE2,strain=>STRAIN2 },
# STRAIN3 => { start=>START3,size=>SIZE3,strain=>STRAIN3 },
# ...
# }
# }
printdumperq($alignments) if $CONF{dump};
# Make a unique list of strains. This is a compound line made up of
# several nested functions. Each is a simple step and this is
# a powerful Perl idiom when the steps are combined together.
#
# the list of alignments
# @$alignments
#
# a list of strains (which are keys in the alignment hash) in a given alignment
# keys %{$a->{strain}}
#
# a list of strains from all alignments
# here we're basically mapping the above step to each alignment
# map { keys %{$_->{strain}} } @$alignments
#
# a list of unique strains
# uniq ( map { keys %{$_->{strain}} } @$alignments )
my @strains = uniq ( map { keys %{$_->{strain}} } @$alignments );
# If -g1 is not set, we just list the strains and quit.
# I've defined printinfo() below as a wrapper for space-delimited
# print that prints _undef_ for arguments that are not defined.
if(! $CONF{g1}) {
printinfo("*** These strains are available:");
for my $strain (sort @strains) {
printinfo($strain);
}
exit;
}
# If we defined -g1 and/or -g2 make sure that we've seen these in the file
my $quit;
for my $param (qw(g1 g2)) {
# die if we have defined the genome we want but it's not in the list of strains.
if($CONF{$param} && ! grep($_ eq $CONF{$param}, @strains)) {
printinfo("ERROR: There are no alignments for -$param [$CONF{$param}]");
# Even if there are no strains that exactly match $CONF{g1} or $CONF{g2} we can
# produce a list of strains that match it as a regular expression to help the user
# find a strain.
my @matching_strains = grep($_ =~ /$CONF{$param}/, @strains);
if(@matching_strains) {
printinfo("Strains that match your input $CONF{$param} as a regular expression are");
for (@matching_strains) {
printinfo(" $_");
}
} else {
printinfo("No strains match your input $CONF{$param} as a regular expression.");
}
# defer the die so that both g1 and g2 are reported on before quitting
$quit = 1;
}
}
die if $quit;
# Now we print out the alignments in Circos' link format
# track which strains have already had a karyotype line emitted
my $seen_strain;
for my $alignment (@$alignments) {
# alignment to our reference
my $a1 = $alignment->{strain}{$CONF{g1}};
# the reference may be absent from this alignment, in which case go to next alignment
next unless $a1;
# for this alignment block, make a list of alignments to match up with
# the reference. if we have defined -g2 STRAIN2 then it's the alignment for
# STRAIN2 (which may be absent). Otherwise it's a list of all strains in this alignment.
my @a2 = $CONF{g2} ? $alignment->{strain}{$CONF{g2}} : values %{$alignment->{strain}};
# Loop over all the alignments in this block
for my $a2 (@a2) {
# Move to the next alignment line if this one is not defined. This shouldn't
# happen but it's safe to check.
next if ! $a2;
# Report the karyotype if we used -karyotype. These will be printed
# to STDERR. The reason for this is that we can write to two
# files at the same time by redirecting STDOUT and STDERR to
# different files
#
# ./parsemaf > stdout.txt 2> stderr.txt
#
# Or we can combine the two
#
# ./parsemaf > everything 2>&1
if($CONF{karyotype} && ! $seen_strain->{ $a1->{strain} }++) {
printerr(sprintf("chr - %s %s 0 %d black",
$a1->{strain},$a1->{strain},$a1->{genome_size}));
}
if($CONF{karyotype} && ! $seen_strain->{ $a2->{strain} }++) {
printerr(sprintf("chr - %s %s 0 %d black",
$a2->{strain},$a2->{strain},$a2->{genome_size}));
}
# We're reporting alignment pairs, so skip if we're asking for self-alignment
next if $a1->{strain} eq $a2->{strain};
# link line: ref span, partner span, and the offset between starts
printinfo(
$a1->{strain},
$a1->{start},
$a1->{start} + $a1->{size} - 1,
$a2->{strain},
$a2->{start},
$a2->{start} + $a2->{size} - 1,
sprintf("offset=%d",$a2->{start}-$a1->{start}),
);
}
}
#!/bin/bash
# Shell-only version of the MAF pairing: join Zaire and Reston lines of
# each alignment block and emit Circos links with an offset annotation.
# Insert alignment record number in front of each line in the alignment
cat ../1/ebola.multiz.txt | grep -v \# | \
awk 'BEGIN {RS="\n\n"} {print gensub(/^|\n/,""sprintf("\n%04d",NR)" ","g")}' > ebola.multiz.numbered.txt
# Create temporary files with score and Zaire and Reston lines from
# each alignment, sorted by the alignment record number. Do not
# report the "i" lines
for var in score Zaire_1995 Reston_1996 ; do
grep $var ebola.multiz.numbered.txt | grep -v -w i | sort > tmp.$var.txt
done
# Join the lines on the alignment record number.
join tmp.score.txt tmp.Zaire_1995.txt | join - tmp.Reston_1996.txt > tmp.z.vs.r.txt
# Pull out the relevant columns and remove everything
# after the period.
cut -d " " -f 3,5-7,12-14 tmp.z.vs.r.txt | sed 's/[.][^ ]*//gi' > tmp.z.vs.r.2.txt
# Reformat into link lines: strain, start, end, strain, start, end, offset
# NOTE(review): ends are computed as start+size here, whereas parsemaf
# uses start+size-1 — confirm which convention is intended.
cat tmp.z.vs.r.2.txt | awk '{ print $2,$3,$3+$4,$5,$6,$6+$7,"offset="$6-$3","$1 }' > z.vs.r.txt
# Remove tmp files
\rm -f tmp*
# Tutorial script: Perl variable types, context and references.
# These two assignments set global special variables for the whole
# script so that print adds a newline and separates fields by a space.
$\ = "\n"; # default end of line delimiter
$, = " "; # default field delimiter
my $x = 1;
my @x = (1,2,3);
my %x = (a=>1,b=>2,c=>3);
# Note that $x @x and %x are all DIFFERENT variables, even
# though they have the same name. There is the scalar x, list x and hash x.
# For now, consider them to be independent and unrelated.
#
# Avoid doing things like
#
# @xlist = (1,2,3)
#
# since @xlist is obviously a list.
#
# It's generally bad practise to add the variable type to its name. However,
# I do this in this script for clarity. For example, the variable $xlistref
# would probably just be called $things or $fruits.
#
# Treat the words "list" and "array" as synonymous. Technically, an array
# is the data type and the list is the thing that it contains, but this
# is a minor distinction. For compound lists (e.g. lists of lists) you would
# say "two-dimensional array" and not "two-dimensional list".
# Printing a list interpolates each element (separated by $, as set
# above); printing a hash flattens it into key/value pairs.
print ("if we print a list we get",@x);
print ("if we print a hash we get",%x);
# Data::Dumper gives a structured, human-readable dump — pass it a
# REFERENCE to the variable, not the variable itself.
print ("list via Data::Dumper");
print Dumper(\@x);
print ("hash via Data::Dumper");
print Dumper(\%x);
# take references to the list and the hash
my $xlistref = \@x;
my $xhashref = \%x;
print ("xlistref",$xlistref,"is reference to",ref $xlistref);
# BUG FIX: this line printed the hash reference but was labeled
# "xlistref" (copy-paste error, plus a stray leading space) — relabeled.
print ("xhashref",$xhashref,"is reference to",ref $xhashref);
# Dereference the references back to their original types
my @y = @$xlistref;
my %y = %$xhashref;
# looping over a list
for my $item (@x) {
    print ("looping over list",$item);
}
# when you get comfortable with the idea that $x and @x
# are different variables, you can do this
for my $x (@x) {
    print ("looping again over list",$x);
}
# looping over the list by index
for my $i (0..@x-1) {
    print ("looping over list index",$i,"item",$x[$i]);
}
# looping over a reference of the list — requires that
# we dereference the list by prepending the list sigil @
# to the variable.
#
# to access a list item via reference, you use ->
for my $i (0..@$xlistref-1) {
print ("looping over list ref index",$i,"item",$xlistref->[$i]);
}
# looping over keys in a hash
for my $key (keys %x) {
print ("key",$key,"value",$x{$key});
}
# looping over keys in a hash reference
#
# to access a hash item via reference you use ->
for my $key (keys %$xhashref) {
print ("key",$key,"value",$xhashref->{$key});
}
# variables can be evaluated in a context to force
# an "interpretation" of the variable
# when a list is evaluated in scalar context (i.e. it is
# being assigned to a variable that is explicitly a scalar)
# you get the number of items in the list.
my $scalar = @x;
print("list as scalar",$scalar);
# Note that many contexts are list context such as
#
# print @x
# for (@x)
#
# Since perl 0-indexes lists, the last object is at index
my $lastidx = @x-1;
print("last index",$lastidx);
# The loop below cycles over the indexes in @x
for my $i (0..@x-1) {
print("index",$i,"value",$x[$i]);
}
# You can also access the last index via the special variable $#x
# so you can write the loop like this (empty body — just a syntax demo)
for my $i (0..$#x) {}
# Personally, I do not like this notation and always write @x-1. This
# special variable is very very Perlish (Perl has lots of internal
# variables like this and it's up to you how many of them you want to use).
#
# http://perldoc.perl.org/perlvar.html
# For example, the current iterator in a for loop is accessed by $_
for (@x) {
print("current item",$_);
}
# Evaluating a scalar in list context
$x = "hello";
my @list = $x;
# simply gives you a single element list ("hello").
# To quickly define lists, use the word quotation operator qw()
@list = qw(some words in a list);
# This is much faster to type than
@list = ("some","words","in","a","short","list");
# If you evaluate a list in a hash context, Perl will expect an
# even number of elements and assign them as key/value pairs. Notice
# that Perl will interpolate variables inside a string defined
# with double quotes.
my %wordhash = @list;
for my $key (keys %wordhash) {
print "$key -> $wordhash{$key}";
}
# One of the reasons why Perl was very popular in its day was its
# support for regular expressions
#
# https://perldoc.perl.org/perlre.html
my $str = "abccddfffhiiijj";
if($str =~ /c/) {
print($str,"matches");
}
# For example to match any pair of identical characters you would
# use (.)\1 where . is the character class for any character, the
# (.) is a capture bracket which stores the match in an internal
# variable accessible immediately by \1.
#
# Then during replacement this internal variable is available as $1. The
# notation is different (e.g. $1 vs \1) because we're no longer in the middle
# of doing the match. The match is done and we're now replacing strings.
$str =~ s/(.)\1/[2 x $1]/g;
print("replaced",$str);
# Regular expressions are a big topic, but you only need to know a few
# important things to get started. You can test-drive your regular expressions here
#
# https://regex101.com/
# You'll often be splitting strings into fields.
$str = "10 20 30 55 102 28";
# Using " " tells split to treat any amount of whitespace as a delimiter
my @tok = split(" ",$str);
print("split line into",@tok);
print("second field is",$tok[1]); # remember lists are 0-indexed
# You can now select items that you want
for my $tok (@tok) {
if($tok eq 10) { # string comparison
# ...
}
if($tok =~ /0/) {
print("item",$tok,"has a 0");
}
}
# Alternatively, a Perlish thing to do is to apply grep() to the list.
# grep takes a condition as the first argument, which can be the
# regular expression
for my $tok (grep(/0/, @tok)) {
print("item",$tok,"has a 0 via grep");
}
# Strictly speaking grep(EXPRESSION,LIST) passes each value from LIST
# to the EXPRESSION via $_ and the expression is evaluated. If it
# evaluates to true then grep returns the list item, otherwise it does not.
# (empty loop body — syntax demo only)
for my $tok (grep($_ =~ /0/, @tok)) {}
# Grep is a *great* way to filter a list.
# To apply code to each list item, use map()
my @newtok = map { 2*$_ } @tok;
print("remapped list",@newtok);
# The code in { } can be anything and the new list is composed of the
# values returned by the code, as evaluated on each list element via $_
# You can combine map { } and grep(). For example, here we
# multiply each element that contains a 0 by 2. The list is not
# modified — both grep and map return copies of the elements.
print("map{} and grep()",map { 2*$_ } grep(/0/, @tok));
# To pull out all numbers made out of digits from a string
my $input = "aa1bb22ccc333dd444";
while( $input =~ /(\d+)/g ) {
print "Found $1 in $input";
}
# There is a lot going on here. The regular expression operator =~ applies
# the regular expression (\d+) to $input. Because the regular expression has capture
# brackets and we're using /g and this is evaluated in a while loop, the operator
# will continue to return as many matches as it can find. The substring matched will be stored in the
# special variable $1, which you can think of as the result of the first capture bracket.
# Below I add a second set of capture brackets to the regular expression, which grabs
# the letters immediately following the digit. Here the + means "one or more".
while( $input =~ /(\d+)([a-z]+)/g ) {
print "Found digit/letter combo $1/$2 in $input";
}
# A final useful regular expression character are the string anchors ^, which
# matches the start of a string, and $, which matches the end. Thus
if($input =~ /\d$/) { }
# will match if $input ends in a digit and
if($input =~ /^\d/) { }
# will match if it starts with a digit. To check that the string is made up exclusively
# of digits you would do this
if($input =~ /^\d$/) { }
#!/bin/bash
# Resample the gene track at several bin sizes and log-transform counts.
# This is the directory where the Circos tools are installed
# on my computer at work. But for your setup, it'll be different.
# Change this to the right directory. Take a look in /root/modules/circos
# to find it.
CTOOLS=/home/martink/work/circos/svn/tools/
# Let's make data sets for both the count and average size for 50, 75 and 100 kb
for binkb in 50 75 100 ; do
# arithmetic expansion: kb -> bases
binsize=$((1000*binkb))
echo "Resampling at $binsize"
cat ../../data/genes.txt | $CTOOLS/resample/bin/resample -bin $binsize -count > genes.count.${binkb}kb.txt
cat ../../data/genes.txt | $CTOOLS/resample/bin/resample -bin $binsize -avg > genes.avgsize.${binkb}kb.txt
done
# Look at the manpage of the 'resample' script by running 'resample -man' You'll notice
# that it can also report the minimum and maximum values in a bin. Add these
# to the loop above to create files for these variables (e.g. genes.maxsize.100kb.txt)
#
# You can transform the vertical scale in Circos but you can also do it
# when generating the data file. Here 'awk' is very helpful and its log() function
# which gives the natural logarithm. So log(x)/log(10) is the base 10 logarithm.
cat genes.count.50kb.txt | awk '{print $1,$2,$3,log($4)/log(10)}' > genes.count.50kb.log.txt
# Write a loop that applies this transformation to all the bins (50, 75, 100 kb) and all the
# quantities (count, avg, min, max). Watch out for bins with value 0 — the log() call
# will produce an error. How would you deal with this?
#!/bin/bash
# Convert the clustal alignment into Circos link files with the
# clustal2link tool, using hsa8K as the reference sequence.
# This is the directory where the Circos tools are installed
# on my computer at work. But for your setup, it'll be different.
# Change this to the right directory. Take a look in /root/modules/circos
# to find it.
CTOOLS=/home/martink/work/circos/svn/tools/
mkdir data
$CTOOLS/clustal2link/bin/clustal2link -aln ../../data/alignment.aln -idref hsa8K -dir data -debug
#!/bin/bash
# Exercise the histogram script on uniform, normal-ish and real data.
# show the histogram of 1000 uniformly distributed random numbers
seq 1 1000 | awk 'BEGIN { srand() } {print rand()}' | ./histogram | tee histogram.uniform.txt
# show the histogram of 1000 normally(ish) (guaranteed by the Central
# Limit Theorem) random numbers generated from average of n = 10 samples
# of uniform random numbers
seq 1 1000 | \
awk 'BEGIN { srand();N=10 } { samplesum=0; for(i=1;i<=N;++i) samplesum+=rand(); print samplesum/N}' | \
./histogram | tee histogram.normal.txt
# show the histogram of the 10-90% of offsets
# (the sed strips everything up to and including "=" to leave the offset value)
cat ../../lecture.3/2/z.vs.all.txt | sed 's/.*=//' | \
./histogram -bins 15 -pmin 10 -pmax 90 | tee histogram.10.90.txt
# show the histogram of offsets between -300 and 20% (-45).
cat ../../lecture.3/2/z.vs.all.txt | sed 's/.*=//' | \
./histogram -bins 15 -min -300 -pmax 20 | tee histogram-300.p20.txt
#!/bin/bash
# Normalize whitespace on STDIN: tabs become spaces, runs of spaces
# collapse, leading/trailing spaces and comment/blank lines are removed.
# default delimiter is a space
#
# cat file.txt | shrinkwrap.sh
#
# but you can set it by passing an argument
#
# cat file.txt | shrinkwrap.sh ,
# Assign the first argument to delimiter unless it's not defined
# in which case assign just a space
#
# VAR=${1:-DEFAULTVALUE}
#
DELIM=${1:- }
# expand all tabs to spaces (tab stop of 1)
expand -t1 | \
# collapse all spaces
tr -s " " | \
# only keep lines that have a non-space character
grep '[^ ]' | \
# remove leading and trailing spaces
sed 's/^[ ]//g' | sed 's/[ ]$//g' | \
# remove lines that begin with comment field
grep -v -e '^\#' |
# replace the space with our delimiter string (space by default)
tr " " "$DELIM"
#!/bin/bash
# Generate random sequence pairs with SNPs and report the SNP positions
# by tagging every base and finding tags that occur only once.
LEN=15
LINES=3
perl makesnp -p 0.1 -len $LEN -lines $LINES -seed 0 > snp.txt
# 0. filter blank lines
# 1. fold the lines to make one character per line
# 2. calculate
#
# rec - record number (line number modulo 2), 0|1
# pair - index of line pair (0,1,2,3...)
# pos - position along the line
#
# 3. each base gets a unique tag: pair.pos.base
#
# 4. positions at which there are snps will only have one entry of this
# tag, which we find by count unique tags and then looking for ones
# that only appear once
#
# The uniq -w7 flag compares only the first 7 characters (length of tag)
grep -v "^$" snp.txt | fold -w1 | \
awk -v len=$LEN -v lines=$LINES \
'{ rec=NR%2; pair=int((NR-1)/(2*len)); pos=(NR-1)%len; \
printf("%02d.%02d.%s %d %d\n",pair,pos,$1,rec,pair*len+pos) }' > snp.mod.txt
sort snp.mod.txt | uniq -w7 -c | sed 's/^ *//' | grep ^1 | sort -n -k 4,5 -n -k 3,4 | tee snp.report.txt
# break the tag back into fields and keep record, position and base
cat snp.report.txt | sed 's/\./ /g'| awk '{print $5,$6,$4}' > tmp.txt
# split into reference (record 0) and mutated (record 1) sides
grep ^0 tmp.txt | cut -d " " -f 2- | sort > tmp.1.txt
grep ^1 tmp.txt | cut -d " " -f 2- | sort > tmp.2.txt
# join on position to list reference/mutated base pairs at each SNP
join tmp.1.txt tmp.2.txt | sort -n > snp.list.txt
\rm tmp.*.txt