
Learning Circos

Bioinformatics and Genome Analysis

Institut Pasteur Tunis Tunisia, December 10 – December 11, 2018

Download course materials
v2.00 6 Dec 2018
Download PDF slides
v2.00 6 Dec 2018

A 2- or 4-day practical mini-course in Circos, command-line parsing and scripting. This material is part of the Bioinformatics and Genome Analysis course held at the Institut Pasteur Tunis.

sessions / scripts
sessions / scripts / README

Scripts

This page lists all the scripts from all lectures.

When you untar the material, these scripts should be symbolic links back to their corresponding day and lecture directories, for example

 sessions/scripts/parsemaf -> ../day.4/lecture.3/1/parsemaf

and some of the Perl scripts may have accompanying .conf files such as

 sessions/day.2/lecture.1/2/colorinterpolate.conf

These are automatically loaded by the script, as long as these files are in the same directory as the script or in ../etc or etc/ relative to the script.
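
If you are curious how this kind of lookup can be implemented, here is a minimal sketch of searching for a script's .conf file in the script's own directory and then in ../etc and etc/ relative to it. This is not the course's actual loader, so treat find_conf and its details as illustrative only.

#!/usr/bin/env perl
# minimal sketch (not the course loader): look for SCRIPTNAME.conf next to
# the script, then in ../etc and etc/ relative to the script's directory
use strict;
use warnings;
use FindBin qw($RealBin $RealScript);

sub find_conf {
  (my $base = $RealScript) =~ s/\.\w+$//;  # strip any extension
  for my $dir ($RealBin, "$RealBin/../etc", "$RealBin/etc") {
    my $conf = "$dir/$base.conf";
    return $conf if -e $conf;              # first match wins
  }
  return;                                  # nothing found
}

my $conf = find_conf();
print defined $conf ? "using $conf\n" : "no .conf file found\n";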

day.3 / lecture.4 / 1 / binlinks.sh
#!/bin/bash

CTOOLS=/home/martink/work/circos/svn/tools/

# report a count of the number of links (-num) in 250 base windows (-bin_size 250)
# counting both ends of the link (-link_end 2)

cat ../../lecture.2/1/data/links.txt | $CTOOLS/binlinks/bin/binlinks -bin_size 250 -num -link_end 2
day.3 / lecture.3 / 1 / clustalparser
# This is a boilerplate Perl script for you to edit to build up a clustal parser
#
# Run it first as
#
# ./clustalparser -h
#
# and then using the alignment file
#
# cat ../../data/alignment.aln | ./clustalparser

my $data = read_file($CONF{in});

# dump the data structure we created from the
# alignments to STDOUT

printdumper($data);

# Continue with the script here

# ...


# Convert a list of numbers into a
# list of contiguous ranges defined by [min,max]
#
# [ [min1,max1], [min2,max2], ...]
sub list2ranges {
  my @x = sort {$a <=> $b} @_;
  my $ranges = [];
  # I wrote two approaches to solve this problem. The first
  # is more like C code and the second is more Perlish.
  # Change the 1 to a 0 in the outer if statement to use the second approach.
  # Which one reads easier?
  if(1) {
    # This one is more like C
    for ( my $i = 0; $i < @x; $i++) {
      my $min = $i;
      for my $j ($i+1..@x-1) {
        if($x[$j] - $x[$j-1] > 1) {
          last;
        }
        $i = $j;
      }
      push @$ranges, [$min,$i];
    }
  } else {
    # This one is more Perlish
    my $i = 0;
    my $max;
    while(@x) {
      my $x = shift @x;
      if(! defined $ranges->[-1] || $x - $max > 1) {
        push @$ranges, [$i,$i];
      } else {
        $ranges->[-1][1] = $i;
      }
      $max = $x;
      $i++;
    }
  }
  return $ranges;
}

# Read the alignment file. For now, all it does
# is read the alignment lines and store them
# as a hash, keyed by the sequence id.
sub read_file {
  my $file = shift;
  my $inputhandle = get_handle($file);
  my $data;
  while(<$inputhandle>) {
    # skip the first line, which is a header
    # this is one of those rare times that a special variable
    # other than $_ or @_ is useful. The $. variable gives the
    # line number of the file handle that is currently being read.
    next if $. == 1;
    # skip comments and blank lines
    next if /^\s*\#/ || /^\s*$/;
    # trim newline
    chomp;
    # you now have the fields of this line
    my ($id,$seq,$cumul) = split;
    # put these values into a hash and push it onto the list
    push @$data, {id=>$id,
                  seq=>$seq,
                  cumul=>$cumul};
  }
  return $data;
}
day.2 / lecture.1 / 2 / colorinterpolate
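# Interpolate between two colors given as R,G,B triplets with -start and -end.
# The interpolation runs over -steps steps in the color space given by -calcspace
# and each step is reported as a named color (-rootname plus step index) together
# with its Lab components and the deltaE (Euclidean distance in Lab space) to the
# previous step. See make.color.ramp.sh in this directory for example invocations.
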
my $c1 = color2space([split(",",$CONF{start})],$CONF{space},$CONF{calcspace});
my $c2 = color2space([split(",",$CONF{end})],$CONF{space},$CONF{calcspace});

my $interpolated;
my $cintlabprev;
for my $i (0..$CONF{steps}) {
  my $cint = [interpvalue($c1->[0],$c2->[0],$i,$CONF{steps}),
              interpvalue($c1->[1],$c2->[1],$i,$CONF{steps}),
              interpvalue($c1->[2],$c2->[2],$i,$CONF{steps})];
  my $cout = color2space($cint,$CONF{calcspace},$CONF{space});
  my $cintlab = color2space($cint,$CONF{calcspace},"lab");
  my $de = $cintlabprev ? sqrt( ($cintlab->[0]-$cintlabprev->[0])**2 +
                                ($cintlab->[1]-$cintlabprev->[1])**2 +
                                ($cintlab->[2]-$cintlabprev->[2])**2) : 0;
  push @$interpolated, {components=> ref $cout eq "ARRAY" ? [ map { nearest($CONF{nearest},$_) } @$cout ] : $cout,
                        i=>$i,
                        de=>$de,
                        lab=> $cintlab,
                        name=>sprintf("%s%03d",$CONF{rootname},$i)};
  $cintlabprev = $cintlab;
}

for my $c (@$interpolated) {
  printinfo(sprintf("%s = %9s # rgb %9s rgbhex %s idx %3d deltaE %6.2f lab(%s,%s,%s)",
                    $c->{name},
                    ref $c->{components} eq "ARRAY" ? sprintf("%d,%d,%d",@{$c->{components}}) : $c->{components},
                    sprintf("%d,%d,%d",lab2rgb(@{$c->{lab}})),
                    rgb2hex(lab2rgb(@{$c->{lab}})),
                    $c->{i},
                    nearest(0.01,$c->{de}),
                    (map { nearest(0.2,$_) } @{$c->{lab}})));
}

sub color2space {
my ($components,$from,$to) = @_;
my $fnfrom = sprintf("new_%s",$CONF{fn}{$from});
my $fnto = sprintf("as_%s",$CONF{fn}{$to});
my $c = $from =~ /hex/ ? Graphics::ColorObject->$fnfrom($components->[0]) : Graphics::ColorObject->$fnfrom($components);
my $cout = $c->$fnto();
printdebug("converting",@$components,"from",$from,"to",$to,item2list($cout));
return $cout;
}


sub interpvalue {
my ($start,$end,$step,$steps) = @_;
return $start + $step*($end-$start)/$steps;
}

sub item2list {
my $x = shift;
return ref $x eq "ARRAY" ? @$x : $x;
}

sub rgb2hsv {
my @c = @_;
my $c = Graphics::ColorObject->new_RGB([@c]);
my @nc = @{$c->as_HSV()};
printinfo(@c,@nc);
return @nc;
}
sub hsv2rgb {
my @c = @_;
my $c = Graphics::ColorObject->new_HSV([@c]);
return @{$c->as_RGB()};
}
sub rgb2hex {
my @c = @_;
my $c = Graphics::ColorObject->new_RGB255([@c]);
return $c->as_RGBhex();
}
sub lab2rgb {
my @c = @_;
my $c = Graphics::ColorObject->new_Lab([@c]);
return @{$c->as_RGB255()};
}
day.1 / lecture.4 / 3 / create.tracks.sh
#!/bin/bash
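
# Build Circos data tracks from the Ebola gene and variation tables in ../1:
# a gene track (ebola.genes.circos.txt) and a SNP track (ebola.variation.circos.txt)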

# extract genes

grep -v \# ../1/ebola.genes.txt | cut -f 2,5,6,9 | \
awk '{print "ebola",$2,$3,$4,"name="$1}' > ebola.genes.circos.txt

# extract SNPs

grep -v \# ../1/ebola.variation.txt | cut -f 2,3,4,10,15 | \
perl -pe 's/\w+:\w+//' | \
awk '{print "ebola",$1,$2,$5,"snp="$3",gene="$4}' > ebola.variation.circos.txt
day.2 / lecture.1 / 1 / generate.random.data.sh
#!/bin/bash

CTOOLS=/home/martink/work/circos/svn/tools/

# Generate some random data on the human chromosome.
#
# To list predefined rules that define data position and value, see
#
# $CTOOLS/randomdata/bin/randomdata -listrules
#
# To understand how the rules work, see the script's manpage
#
# $CTOOLS/randomdata/bin/randomdata -man

# Make 5 random data sets based on default rule (10 mb bins, standard normal distribution).
for i in `seq 1 5` ; do
$CTOOLS/randomdata/bin/randomdata -karyotype karyotype.human.txt -ruleset default > random.$i.txt
done

# Convince yourself that the values are indeed standard normal (mean 0, sd 1)
cat random.*.txt| cut -d " " -f 4 | ../../../scripts/histogram > histogram.txt
day.4 / lecture.4 / 2 / histogram
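# Print a text histogram of numbers read from STDIN. The value range can be
# clipped with -min/-max or with the percentiles -pmin/-pmax, the bins are set
# with -binsize or -nbins, bar length is controlled by -height and can be
# power-scaled with -sqrt. Summary statistics (n, average, sd, min, max, sum)
# are reported at the end.
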
# read the values from STDIN and use the -col COLUMN, by default 0
my @values;
while(my $line = <>) {
  chomp $line;
  next if $line =~ /^\#/;
  my @tok = split(" ",$line);
  if(defined $CONF{col} && ! defined $tok[ $CONF{col} ]) {
    die "The line [$line] has no number in column [$CONF{col}]";
  }
  push @values, $tok[ $CONF{col}];
}

# get min and max of values
my ($min,$max) = minmax(@values);

# change min/max range if redefined via -min and/or -max
$min = $CONF{min} if defined $CONF{min};# && $CONF{min} > $min;
$max = $CONF{max} if defined $CONF{max};# && $CONF{max} < $max;

# *very* quick and dirty percentile if -pmin or -pmax used
if($CONF{pmin} || $CONF{pmax}) {
my @sorted = sort {$a <=> $b} @values;
$min = $sorted[ $CONF{pmin}/100 * (@sorted-1) ] if $CONF{pmin};
$max = $sorted[ $CONF{pmax}/100 * (@sorted-1) ] if $CONF{pmax};
}

# determine bin size from either the -binsize command-line parameter
# or from -nbins. At least one of these is defined — we made sure of
# that in validateconfiguration()
my $bin_size;
if($CONF{binsize}) {
$bin_size = $CONF{binsize};
$CONF{nbins} = round(($max-$min)/$CONF{binsize});
} else {
$bin_size = ($max-$min)/$CONF{nbins};
}

# populate bins
my $bins;
for my $i (0..$CONF{nbins}-1) {
  my $start = $min + $i * $bin_size;
  my $end = $min + ($i+1) * $bin_size;
  my $n = grep($_ >= $start && ($_ < $end || ( $i == $CONF{nbins}-1 && $_ <= $end)), @values);
  push @$bins, { start=>$start,
                 i=>$i,
                 end=>$end,
                 size=>$bin_size,
                 n=>$n };
}

my $histogram_width = $CONF{height};
my $maxn = max(map { $_->{n} } @$bins);
my $sumn = sum(map { $_->{n} } @$bins);
my $cumuln = 0;

my $bin_count_width = max(map { length($_->{n}) } @$bins);
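# build a fixed-width integer format for the bin counts: %% is a literal
# percent sign, so this produces a format like "%7d" (2 + width of largest count)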
my $bin_count_fmt = sprintf("%%%dd",2+$bin_count_width);
my $bin_width_f_max = max(map { $_->{n}/$maxn } @$bins);

# report number and fraction of values smaller than requested minimum
printinfo(sprintf("%10s %10.4f>$bin_count_fmt %6.3f","",
$min,
int(grep($_ < $min, @values)),
int(grep($_ < $min, @values))/@values));

# report each bin: min, max, count, fractional count, cumulative count
for my $bin (@$bins) {
  $cumuln += $bin->{n};
  my $binwidth = $bin->{n}*$histogram_width/$maxn;
  if(defined $CONF{sqrt}) {
    my $power = $CONF{sqrt} || 0.5;
    my $f = $bin->{n}/$maxn;
    $binwidth = $f**$power/$bin_width_f_max**$power * $histogram_width;
  }
  printinfo(sprintf("%10.4f %10.4f $bin_count_fmt %6.3f %6.3f %-${histogram_width}s",
                    $bin->{start},
                    $bin->{end},
                    $bin->{n},
                    $bin->{n}/$sumn,
                    $cumuln/$sumn,
                    "*"x$binwidth));
}

# report number and fraction of values larger than requested maximum
printinfo(sprintf("%10s %10.4f<%6d %6.3f","",
$max,
int(grep($_ > $max, @values)),
int(grep($_ > $max, @values))/@values));

# aggregate stats for full data set
printinfo(sprintf("%8s %12d","n",int(@values)));
printinfo(sprintf("%8s %12.5f","average",average(@values)));
printinfo(sprintf("%8s %12.5f","sd",stddev(@values)));
printinfo(sprintf("%8s %12.5f","min",min(@values)));
printinfo(sprintf("%8s %12.5f","max",max(@values)));
printinfo(sprintf("%8s %12.5f","sum",sum(@values)));
day.2 / lecture.1 / 2 / make.color.ramp.sh
#!/bin/bash
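
# Build two color ramps from red (255,0,0) to blue (0,0,255): mycolorlab* using
# colorinterpolate's default calculation space and mycolorhsv* interpolating in HSV.
# Output is tee'd to .conf files in etc/ so the colors can be used in a Circos configuration.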

START=255,0,0
END=0,0,255
STEPS=9
./colorinterpolate -steps $STEPS -start $START -end $END -rootname mycolorlab | \
tee etc/mycolors.lab.conf
./colorinterpolate -steps $STEPS -start $START -end $END -calcspace hsv -rootname mycolorhsv | \
tee etc/mycolors.hsv.conf
day.1 / lecture.4 / 1 / make.karyotype.sh
#!/bin/bash

echo "parsing UCSC assembly table and making karyotype file"
tail -1 ebola.assembly.txt | cut -f 4 | awk '{print "chr - ebola ebola 0",$1,"black"}' > ebola.karyotype.txt
day.4 / lecture.3 / 2 / make.tracks.sh
#!/bin/bash

g1=Zaire_1995

# Notice how we're redirecting STDOUT to the link file and the
# STDERR to the karyotype file. This is a very useful idiom to
# generate two independent files at the same time and
# use redirection for maximum flexibility.

cat ../../lecture.2/1/ebola.multiz.txt | ../1/parsemaf -g1 $g1 -karyotype > z.vs.all.txt 2> karyotype.txt
day.1 / lecture.4 / 5 / makesnp
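# Generate -lines pairs of random sequences of length -len. The first line of
# each pair is the reference; the second is a copy in which each position is
# mutated to a different base with probability -p. Pairs are separated by a
# blank line.
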
# -seed N sets a random seed so that we can reproduce
# the random output for debugging
srand($CONF{seed}) if defined $CONF{seed};

# -dict ATGC sets the dictionary from which bases are selected
# for example you can add N's with -dict ATGCN

my @dict = split("",$CONF{dict});

# iterate over lines (pairs of sequences)
for my $i (1..$CONF{lines}) {
  # this is the "reference" sequence, which is a concatenation
  # of $CONF{len} randomly selected characters from the dictionary
  my $str = join("",map { $dict[rand(@dict)] } (1..$CONF{len}));
  print "$str\n";
  # for each position in the reference
  for my $j (1..$CONF{len}) {

    # make a snp but only if we pass the probability requirement
    # the chance of a SNP at a given position is $CONF{p}
    next unless rand() < $CONF{p};

    # this is the reference base at this position
    my $ref = substr($str,$j-1,1);

    # replace the character at this position with a randomly
    # selected character from the dictionary that is not
    # the reference base
    #
    # substr() can be an lvalue too — very useful
    substr($str,$j-1,1) = (grep($_ ne $ref, @dict))[rand(@dict-1)];
  }
  print "$str\n\n";
}
day.3 / lecture.2 / 2 / parseclustal.sh
#!/bin/bash

# extract the list of sequence IDs from the alignment file

REF=hsa8K
OUTDIR=parsed
FILE=../../data/alignment.aln
cat $FILE | cut -d " " -f 1 | sort -u | grep -v -i clustal | grep "[a-z]" > $OUTDIR/id.txt

# for each sequence ID, retrieve the sequence and store it with one base per line

for id in `cat $OUTDIR/id.txt`; do
grep $id $FILE | tr -s " " | cut -d " " -f 2 | fold -w1 > $OUTDIR/$id.seq.txt
done

# paste hsa8K together with each other sequence id
# and remove lines with "-" indicating no alignment
# and make a file of all positions (one per line) at
# which bases are aligned

for id in `cat $OUTDIR/id.txt | grep -v $REF` ; do

# convert tabs to a single space, collapse all spaces and
# remove leading spaces and trailing spaces (done via shrinkwrap.sh script)
paste $OUTDIR/$REF.seq.txt $OUTDIR/$id.seq.txt | cat -n | \
./shrinkwrap.sh > $OUTDIR/align.$REF.$id.txt

# The line below is long but none of the steps are particularly complicated
# 1. replace all bases with an X (tr)
# 2. extract runs of lines that have the same fields (uniq), but skipping first field (-f 1)
# and counting the number of runs (-c)
# 3. remove all lines that have gaps (grep)
# 4. write the output to a file but also to STDOUT (tee)
# 5. add the name of the sequence ids to the file and compute start and end of alignment
# based on the start of the alignment and length of runs from uniq

cat $OUTDIR/align.$REF.$id.txt | tr ATGC X | uniq -f 1 -c | \
grep -v - | tee $OUTDIR/runs.$REF.$id.txt | \
awk -v ref=$REF -v id=$id '{print ref,$2,$2+$1-1,id,$2,$2+$1-1}' > $OUTDIR/links.$REF.$id.txt

done
day.4 / lecture.3 / 1 / parsemaf
# Make sure you understand everything below all the way to the
# housekeeping section.

# Read about the MAF format at http://www.bx.psu.edu/~dcking/man/maf.xhtml

# This will be our alignment list reference. It's declared as
# a scalar because it will be a reference to a list. While it
# doesn't have to be a reference, get used to using references.
#
# Alternatively, we could have done this
#
# my @alignments;
# ...
# push @alignments, ...
# ...
# $alignments[-1]{strain}{$strain} = ...
#
# Using a list variable saves you from having to dereference
# it with @$alignments or using -> to pull out an element.
#
# Last element using @alignments list: $alignments[-1]
# Last element using $alignments as list reference: $alignments->[-1]
#
# We can always make a list reference by
#
# my $alignments = \@alignments;

my $alignments;

# Read from STDIN until end of stream. This way the script expects
# a pipe from another process, like cat
#
# cat .../1/ebola.multiz.txt | ./parsemaf
#
while(my $line = <>) {
  # remove the trailing new line
  chomp $line;
  # if this line matches this regular expression, then it is
  # the start of an alignment block
  if($line =~ /^a score=(.+)/) {
    # push a new alignment hash onto the list
    # for now, only the score is defined
    push @$alignments, {score=>$1};
  } elsif($line =~ /^s /) {
    # this is an individual alignment line and belongs
    # to the current alignment block, which is the last
    # element in the @$alignments list.

    # chop up the line into fields using any whitespace as delimiter
    my @tok = split(" ",$line);
    # assign the first six elements of the tokens to
    # individual variables. By putting 'undef' as the
    # first element we discard the first element in
    # the assignment. Another way to do this would be
    #
    # my ($strain,$start,$size,$strand,$genome_size) = @tok[1..5];
    my (undef,$strain,$start,$size,$strand,$genome_size) = @tok;
    # if the strain is in the format STR.STR (repeated string)
    # joined by a period, throw out everything after
    # the period
    if($strain =~ /(\w+)\.\1/) {
      $strain =~ s/[.].*//;
    }
    # assign this alignment line to the last (current) alignment
    # block. We could have made a list out of these but it's
    # even more convenient to key them by the strain
    $alignments->[-1]{strain}{$strain} =
      { start=>$start,size=>$size,strain=>$strain,genome_size=>$genome_size };
  }
}
printdebug("parsed",int(@$alignments),"alignments");

# The parsing above could have been done by changing the
# input record separator in Perl, which is accessible using
# the special variable $/
#
# The default value of this variable depends on the operating
# system. For more details see https://perldoc.perl.org/perlvar.html
#
# I urge you to avoid these special variables (with the
# exception of $_ and @_) and to get in the habit of doing things yourself.
# Your code will be much more legible. Some of these variables
# actually slow down your code.

# Just dump the data structure to STDOUT and quit. This
# helps understand what's happening under the hood.
#
# The alignments are a list. Each list item is a hash of this form
#
# { score => SCORE,
# strain => { STRAIN1 => { start=>START1,size=>SIZE1,strain=>STRAIN1 },
# STRAIN2 => { start=>START2,size=>SIZE2,strain=>STRAIN2 },
# STRAIN3 => { start=>START3,size=>SIZE3,strain=>STRAIN3 },
# ...
# }
# }

printdumperq($alignments) if $CONF{dump};

# Make a unique list of strains. This is a compound line made up of
# several nested functions. Each is a simple step and this is
# a powerful Perl idiom when the steps are combined together.
#
# the list of alignments
# @$alignments
#
# a list of strains (which are keys in the alignment hash) in a given alignment
# keys %{$a->{strain}}
#
# a list of strains from all alignments
# here we're basically mapping the above step to each alignment
# map { keys %{$_->{strain}} } @$alignments
#
# a list of unique strains
# uniq ( map { keys %{$_->{strain}} } @$alignments )
my @strains = uniq ( map { keys %{$_->{strain}} } @$alignments );

# If -g1 is not set, we just list the strains and quit.
# I've defined printinfo() below as a wrapper for space-delimited
# print that prints _undef_ for arguments that are not defined.
if(! $CONF{g1}) {
printinfo("*** These strains are available:");
for my $strain (sort @strains) {
printinfo($strain);
}
exit;
}

# If we defined -g1 and/or -g2 make sure that we've seen these in the file
my $quit;
for my $param (qw(g1 g2)) {
# die if we have defined the genome we want but it's not in the list of strains.
if($CONF{$param} && ! grep($_ eq $CONF{$param}, @strains)) {
printinfo("ERROR: There are no alignments for -$param [$CONF{$param}]");
# Even though no strain matches $CONF{$param} exactly, there may be strains
# that match it as a regular expression. Report these to help the user
# find a strain.
my @matching_strains = grep($_ =~ /$CONF{$param}/, @strains);
if(@matching_strains) {
printinfo("Strains that match your input $CONF{$param} as a regular expression are");
for (@matching_strains) {
printinfo(" $_");
}
} else {
printinfo("No strains match your input $CONF{$param} as a regular expression.");
}
$quit = 1;
}
}
die if $quit;

# Now we print out the alignments in Circos' link format
my $seen_strain;
for my $alignment (@$alignments) {
# alignment to our reference
my $a1 = $alignment->{strain}{$CONF{g1}};
# the reference may be absent from this alignment, in which case go to next alignment
next unless $a1;
# for this alignment block, make a list of alignments to match up with
# the reference. if we have defined -g2 STRAIN2 then it's the alignment for
# STRAIN2 (which may be absent). Otherwise it's a list of all strains in this alignment.
my @a2 = $CONF{g2} ? $alignment->{strain}{$CONF{g2}} : values %{$alignment->{strain}};

# Loop over all the alignments in this block
for my $a2 (@a2) {
# Move to the next alignment line if this one is not defined. This shouldn't
# happen but it's safe to check.
next if ! $a2;

# Report the karyotype if we used -karyotype. These will be printed
# to STDERR. The reason for this is that we can write to two
# files at the same time by redirecting STDOUT and STDERR to
# different files
#
# ./parsemaf > stdout.txt 2> stderr.txt
#
# Or we can combine the two
#
# ./parsemaf > everything 2>&1

if($CONF{karyotype} && ! $seen_strain->{ $a1->{strain} }++) {
printerr(sprintf("chr - %s %s 0 %d black",
$a1->{strain},$a1->{strain},$a1->{genome_size}));
}
if($CONF{karyotype} && ! $seen_strain->{ $a2->{strain} }++) {
printerr(sprintf("chr - %s %s 0 %d black",
$a2->{strain},$a2->{strain},$a2->{genome_size}));
}

# We're reporting alignment pairs, so skip if we're asking for self-alignment
next if $a1->{strain} eq $a2->{strain};
printinfo(
$a1->{strain},
$a1->{start},
$a1->{start} + $a1->{size} - 1,
$a2->{strain},
$a2->{start},
$a2->{start} + $a2->{size} - 1,
sprintf("offset=%d",$a2->{start}-$a1->{start}),
);
}
}
day.4 / lecture.2 / 2 / parsemaf.sh
#!/bin/bash

# Insert alignment record number in front of each line in the alignment
cat ../1/ebola.multiz.txt | grep -v \# | \
awk 'BEGIN {RS="\n\n"} {print gensub(/^|\n/,""sprintf("\n%04d",NR)" ","g")}' > ebola.multiz.numbered.txt

# Create temporary files with score and Zaire and Reston lines from
# each alignment, sorted by the alignment record number. Do not
# report the "i" lines
for var in score Zaire_1995 Reston_1996 ; do
grep $var ebola.multiz.numbered.txt | grep -v -w i | sort > tmp.$var.txt
done

# Join the lines on the alignment record number.
join tmp.score.txt tmp.Zaire_1995.txt | join - tmp.Reston_1996.txt > tmp.z.vs.r.txt

# Pull out the relevant columns and remove everything
# after the period.
cut -d " " -f 3,5-7,12-14 tmp.z.vs.r.txt | sed 's/[.][^ ]*//gi' > tmp.z.vs.r.2.txt

# Reformat
cat tmp.z.vs.r.2.txt | awk '{ print $2,$3,$3+$4,$5,$6,$6+$7,"offset="$6-$3","$1 }' > z.vs.r.txt

# Remove tmp files
\rm -f tmp*
day.4 / lecture.1 / refresher
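# A quick Perl refresher: scalars, lists and hashes, references, loops,
# scalar/list context, regular expressions, split(), grep() and map().
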
$\ = "\n"; # default end of line delimiter
$, = " "; # default field delimiter

my $x = 1;
my @x = (1,2,3);
my %x = (a=>1,b=>2,c=>3);

# Note that $x @x and %x are all DIFFERENT variables, even
# though they have the same name. There is the scalar x, list x and hash x.
# For now, consider them to be independent and unrelated.
#
# Avoid doing things like
#
# @xlist = (1,2,3)
#
# since @xlist is obviously a list.
#
# It's generally bad practice to add the variable type to its name. However,
# I do this in this script for clarity. For example, the variable $xlistref
# would probably just be called $things or $fruits.
#
# Treat the words "list" and "array" as synonymous. Technically, an array
# is the data type and the list is the thing that it contains, but this
# is a minor distinction. For compound lists (e.g. lists of lists) you would
# say "two-dimensional array" and not "two-dimensional list".

print ("if we print a list we get",@x);
print ("if we print a hash we get",%x);

print ("list via Data::Dumper");
print Dumper(\@x);
print ("hash via Data::Dumper");
print Dumper(\%x);

my $xlistref = \@x;
my $xhashref = \%x;

print ("xlistref",$xlistref,"is reference to",ref $xlistref);
print ("xlistref",$xhashref," is reference to",ref $xhashref);

# Dereference the references back to their original types

my @y = @$xlistref;
my %y = %$xhashref;

# looping over a list

for my $item (@x) {
print ("looping over list",$item);
}

# when you get comfortable with the idea that $x and @x
# are different variables, you can do this

for my $x (@x) {
print ("looping again over list",$x);
}

for my $i (0..@x-1) {
print ("looping over list index",$i,"item",$x[$i]);
}

# looping over a reference of the list — requires that
# we dereference the list by prepending the list sigil @
# to the variable.
#
# to access a list item via reference, you use ->

for my $i (0..@$xlistref-1) {
print ("looping over list ref index",$i,"item",$xlistref->[$i]);
}

# looping over keys in a hash
for my $key (keys %x) {
print ("key",$key,"value",$x{$key});
}

# looping over keys in a hash reference
#
# to access a hash item via reference you use ->
for my $key (keys %$xhashref) {
print ("key",$key,"value",$xhashref->{$key});
}

# variables can be evaluated in a context to force
# an "interpretation" of the variable

# when a list is evaluated in scalar context (i.e. it is
# being assigned to a variable that is explicitly a scalar)
# you get the number of items in the list.

my $scalar = @x;
print("list as scalar",$scalar);

# Note that many contexts are list context such as
#
# print @x
# for (@x)
#

# Since perl 0-indexes lists, the last object is at index @x-1

my $lastidx = @x-1;
print("last index",$lastidx);

# The loop below cycles over the indexes in @x

for my $i (0..@x-1) {
print("index",$i,"value",$x[$i]);
}

# You can also access the last index via the special variable $#x
# so you can write the loop like this

for my $i (0..$#x) {}

# Personally, I do not like this notation and always write @x-1. This
# special variable is very very Perlish (Perl has lots of internal
# variables like this and it's up to you how many of them you want to use).
#
# http://perldoc.perl.org/perlvar.html

# For example, the current iterator in a for loop is accessed by $_

for (@x) {
print("current item",$_);
}

# Evaluating a scalar in list context

$x = "hello";
my @list = $x;

# simply gives you a single element list ("hello").

# To quickly define lists, use the word quotation operator qw()

@list = qw(some words in a list);

# This is much faster than

@list = ("some","words","in","a","short","list");

# If you evaluate a list in a hash context, Perl will expect an
# even number of elements and assign them as key/value pairs. Notice
# that Perl will interpolate variables inside a string defined
# with double quotes.

my %wordhash = @list;
for my $key (keys %wordhash) {
print "$key -> $wordhash{$key}";
}

# One of the reasons why Perl was very popular in its day was its
# support for regular expressions
#
# https://perldoc.perl.org/perlre.html

my $str = "abccddfffhiiijj";
if($str =~ /c/) {
print($str,"matches");
}

# For example to match any pair of identical characters you would
# use (.)\1 where . is the character class for any character, the
# (.) is a capture bracket which stores the match in an internal
# variable accessible immediately by \1.
#
# Then during replacement this internal variable is available as $1. The
# notation is different (e.g. $1 vs \1) because we're no longer in the middle
# of doing the match. The match is done and we're now replacing strings.

$str =~ s/(.)\1/[2 x $1]/g;
print("replaced",$str);

# Regular expressions are a big topic, but you only need to know a few
# important things to get started. You can test-drive your regular expressions here
#
# https://regex101.com/

# You'll often be splitting strings into fields.

$str = "10 20 30 55 102 28";

# Using " " tells split to treat any amount of whitespace as a delimiter

my @tok = split(" ",$str);
print("split line into",@tok);
print("second field is",$tok[1]); # remember lists are 0-indexed

# You can now select items that you want

for my $tok (@tok) {
if($tok eq 10) { # string comparison
# ...
}
if($tok =~ /0/) {
print("item",$tok,"has a 0");
}
}

# Alternatively, a Perlish thing to do is to apply grep() to the list.
# grep takes a condition as the first argument, which can be the
# regular expression

for my $tok (grep(/0/, @tok)) {
print("item",$tok,"has a 0 via grep");
}

# Strictly speaking grep(EXPRESSION,LIST) passes each value from LIST
# to the EXPRESSION via $_ and the expression is evaluated. If it
# evaluates to true then grep returns the list item, otherwise it does not.

for my $tok (grep($_ =~ /0/, @tok)) {}

# Grep is a *great* way to filter a list.

# To apply code to each list item, use map()

my @newtok = map { 2*$_ } @tok;
print("remapped list",@newtok);

# The code in { } can be anything and the new list is composed of the
# values returned by the code, as evaluated on each list element via $_

# You can combine map { } and grep(). For example, here we
# multiply each element that contains a 0 by 2. The list is not
# modified — both grep and map return copies of the elements.

print("map{} and grep()",map { 2*$_ } grep(/0/, @tok));

# To pull out all numbers made out of digits from a string

my $input = "aa1bb22ccc333dd444";
while( $input =~ /(\d+)/g ) {
print "Found $1 in $input";
}

# There is a lot going on here. The regular expression operator =~ applies
# the regular expression (\d+) to $input. Because the regular expression has capture
# brackets and we're using /g and this is evaluated in a while loop, the operator
# will continue to return as many matches as it can find. The substring matched will be stored in the
# special variable $1, which you can think of as the result of the first capture bracket.

# Below I add a second set of capture brackets to the regular expression, which grabs
# the letters immediately following the digit. Here the + means "one or more".

while( $input =~ /(\d+)([a-z]+)/g ) {
print "Found digit/letter combo $1/$2 in $input";
}

# Two final useful regular expression characters are the string anchors ^, which
# matches the start of a string, and $, which matches the end. Thus

if($input =~ /\d$/) { }

# will match if $input ends in a digit and

if($input =~ /^\d/) { }

# will match if it starts with a digit. To check that the string is made up exclusively
# of digits you would do this

if($input =~ /^\d$/) { }
day.1 / lecture.2 / 5 / resample.sh
#!/bin/bash

# This is the directory where the Circos tools are installed
# on my computer at work. But for your setup, it'll be different.
# Change this to the right directory. Take a look in /root/modules/circos
# to find it.
CTOOLS=/home/martink/work/circos/svn/tools/

# Let's make data sets for both the count and average size for 50, 75 and 100 kb

for binkb in 50 75 100 ; do

binsize=$((1000*binkb))

echo "Resampling at $binsize"
cat ../../data/genes.txt | $CTOOLS/resample/bin/resample -bin $binsize -count > genes.count.${binkb}kb.txt
cat ../../data/genes.txt | $CTOOLS/resample/bin/resample -bin $binsize -avg > genes.avgsize.${binkb}kb.txt

done

# Look at the manpage of the 'resample' script by running 'resample -man' You'll notice
# that it can also report the minimum and maximum values in a bin. Add these
# to the loop above to create files for these variables (e.g. genes.maxsize.100kb.txt)
#
# You transform the vertical scale in Circos but you can also do it
# when generating the data file. Here 'awk' is very helpful and its log() function
# which gives the natural logarithm. So log(x)/log(10) is the base 10 logarithm.

cat genes.count.50kb.txt | awk '{print $1,$2,$3,log($4)/log(10)}' > genes.count.50kb.log.txt

# Write a loop that applies this transformation to all the bins (50, 75, 100 kb) and all the
# quantities (count, avg, min, max). Watch out for bins with value 0 — the log() call
# will produce an error. How would you deal with this?
day.3 / lecture.2 / 1 / run.sh
#!/bin/bash

# This is the directory where the Circos tools are installed
# on my computer at work. But for your setup, it'll be different.
# Change this to the right directory. Take a look in /root/modules/circos
# to find it.
CTOOLS=/home/martink/work/circos/svn/tools/

mkdir data
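# Use the clustal2link tool to convert the clustal alignment into Circos link
# files, with hsa8K as the reference sequence, writing the output into data/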
$CTOOLS/clustal2link/bin/clustal2link -aln ../../data/alignment.aln -idref hsa8K -dir data -debug
day.4 / lecture.4 / 2 / showhistogram.sh
#!/bin/bash

# show the histogram of 1000 uniformly distributed random numbers
seq 1 1000 | awk 'BEGIN { srand() } {print rand()}' | ./histogram | tee histogram.uniform.txt

# show the histogram of 1000 normally(ish) (guaranteed by the Central
# Limit Theorem) random numbers generated from average of n = 10 samples
# of uniform random numbers
seq 1 1000 | \
awk 'BEGIN { srand();N=10 } { samplesum=0; for(i=1;i<=N;++i) samplesum+=rand(); print samplesum/N}' | \
./histogram | tee histogram.normal.txt

# show the histogram of the 10th to 90th percentile range of offsets
cat ../../lecture.3/2/z.vs.all.txt | sed 's/.*=//' | \
./histogram -bins 15 -pmin 10 -pmax 90 | tee histogram.10.90.txt

# show the histogram of offsets between -300 and the 20th percentile (-45).
cat ../../lecture.3/2/z.vs.all.txt | sed 's/.*=//' | \
./histogram -bins 15 -min -300 -pmax 20 | tee histogram-300.p20.txt
day.3 / lecture.2 / 2 / shrinkwrap.sh
#!/bin/bash

# default delimiter is a space
#
# cat file.txt | shrinkwrap.sh
#
# but you can set it by passing an argument
#
# cat file.txt | shrinkwrap.sh ,

# Assign the first argument to the delimiter; if it is not defined,
# use a space instead
#
# VAR=${1:-DEFAULTVALUE}
#

DELIM=${1:- }

# all tabs to spaces
expand -t1 | \
# collapse all spaces
tr -s " " | \
# only keep lines that have a non-space character
grep '[^ ]' | \
# remove leading and trailing spaces
sed 's/^[ ]//g' | sed 's/[ ]$//g' | \
# remove lines that begin with comment field
grep -v -e '^\#' |
# replace the space with our delimiter string (space by default)
tr " " "$DELIM"
day.1 / lecture.4 / 5 / testsnp.sh
#!/bin/bash

LEN=15
LINES=3

perl makesnp -p 0.1 -len $LEN -lines $LINES -seed 0 > snp.txt

# 0. filter blank lines
# 1. fold the lines to make one character per line
# 2. calculate
#
# rec - record number (line number modulo 2), 0|1
# pair - index of line pair (0,1,2,3...)
# pos - position along the line
#
# 3. each base gets a unique tag: pair.pos.base
#
# 4. positions at which there are snps will only have one entry of this
# tag, which we find by count unique tags and then looking for ones
# that only appear once
#
# The uniq -w7 flag compares only the first 7 characters (length of tag)

grep -v "^$" snp.txt | fold -w1 | \
awk -v len=$LEN -v lines=$LINES \
'{ rec=NR%2; pair=int((NR-1)/(2*len)); pos=(NR-1)%len; \
printf("%02d.%02d.%s %d %d\n",pair,pos,$1,rec,pair*len+pos) }' > snp.mod.txt

sort snp.mod.txt | uniq -w7 -c | sed 's/^ *//' | grep ^1 | sort -n -k 4,5 -n -k 3,4 | tee snp.report.txt

cat snp.report.txt | sed 's/\./ /g'| awk '{print $5,$6,$4}' > tmp.txt

grep ^0 tmp.txt | cut -d " " -f 2- | sort > tmp.1.txt
grep ^1 tmp.txt | cut -d " " -f 2- | sort > tmp.2.txt

join tmp.1.txt tmp.2.txt | sort -n > snp.list.txt

\rm tmp.*.txt