Learning Circos

Bioinformatics and Genome Analysis

Institut Pasteur Tunis Tunisia, December 10 – December 11, 2018

Download course materials
v2.00 6 Dec 2018 Download PDF slides
v2.00 6 Dec 2018

A 2- or 4-day practical mini-course in Circos, command-line parsing and scripting. This material is part of the Bioinformatics and Genome Analysis course held at the Institut Pasteur Tunis.

Quick links

menu day 1 day 2 day 3 day 4 scripts references updates

sessions / scripts

sessions / scripts / README

Scripts

This page lists all the scripts from all lectures.

When you untar the material, these scripts should be symbolic links back to their corresponding day and lecture

 sessions/scripts/parsemaf -> ../day.4/lecture.3/1/parsemaf

and some of the Perl scripts may have accompanying .conf files such as

 sessions/day.2/lecture.1/2/colorinterpolate.conf

These are automatically loaded by the script, as long as these files are in the same directory as the script or in ../etc or etc/ relative to the script.

day.3 / lecture.4 / 1 / binlinks.sh

↓

#!/bin/bash

CTOOLS=/home/martink/work/circos/svn/tools/

# report a count of the number of links (-num) in 250 base windows (-bin_size 250)
# counting both ends of the link (-link_end 2)

cat ../../lecture.2/1/data/links.txt | $CTOOLS/binlinks/bin/binlinks -bin_size 250 -num -link_end 2

day.3 / lecture.3 / 1 / clustalparser

↓

# This is a boilerplate Perl script for you to edit to build up a clustal parser
#
# Run it first as
#
# ./clustalpaser -h
#
# and then using the alignment file
#
# cat ../../data/alignment.aln | ./clustalparser

my $data = read_file($CONF{in});

# dump the data structure we created from the
# alignments to STDOUT

printdumper($data);

# Continue with the script here

# ...


# Convert a list of numbers into a
# list of contiguous ranges defined by [min,max] 
#
# [ [min1,max1], [min2,max2], ...]
sub list2ranges {
  my @x = sort {$a <=> $b} @_;
  my $ranges = [];
  # I wrote two approaches to solve this problem. The first
  # is more like C code and the second is more Perlish.
  # Change the 1 to a 0 in the outer if loop to use the second approach.
  # Which one reads easier?
  if(1) {
    # This one is more like C
    for ( my $i = 0; $i < @x; $i++) {
      my $min = $i;
      for my $j ($i+1..@x-1) {
        if($x[$j] - $x[$j-1] > 1) {
          last;
        }
        $i = $j;
      }
      push @$ranges, [$min,$i];
    }
  } else {
    # This one is more Perlish
    my $i = 0;
    my $max;
    while(@x) {
      my $x = shift @x;
      if(! defined $ranges->[-1] || $x - $max > 1) {
        push @$ranges, [$i,$i];
      } else {
        $ranges->[-1][1] = $i;
      }
      $max = $x;
      $i++;
    }
  }
  return $ranges;
}

# Read the alignment file. For now, all it does
# is read the alignment lines and store them
# as a hash, keyed by the sequence id.
sub read_file {
  my $file        = shift;
  my $inputhandle = get_handle($file);
  my $data;
  while(<$inputhandle>) {
    # skip the first line, which is a header
    # this is one of those rare times that a special variable
    # other than $_ or @_ is usedful. The $. gives the
    # line number of the file handle that is currently being read.
    next if $. == 1;
    # skip comments and blank lines
    next if /^\s*\#/ ||/^\s*$/;
    # trim newline
    chomp;
    # you now have the fields of this line
    my ($id,$seq,$cumul) = split;
    # put these values into a hash and push it onto the list
    push @$data, {id=>$id,
                  seq=>$seq,
                  cumul=>$cumul};
                  
  }
  return $data;
}

day.2 / lecture.1 / 2 / colorinterpolate

↓

my $c1 = color2space([split(",",$CONF{start})],$CONF{space},$CONF{calcspace});
my $c2 = color2space([split(",",$CONF{end})],$CONF{space},$CONF{calcspace});

my $interpolated;
my $cintlabprev;
for my $i (0..$CONF{steps}) {
  my $cint  = [interpvalue($c1->[0],$c2->[0],$i,$CONF{steps}),
              interpvalue($c1->[1],$c2->[1],$i,$CONF{steps}),
              interpvalue($c1->[2],$c2->[2],$i,$CONF{steps})];
  my $cout    = color2space($cint,$CONF{calcspace},$CONF{space});
  my $cintlab = color2space($cint,$CONF{calcspace},"lab");
  my $de    = $cintlabprev ? sqrt( ($cintlab->[0]-$cintlabprev->[0])**2 +
                                   ($cintlab->[1]-$cintlabprev->[1])**2 +
                                   ($cintlab->[2]-$cintlabprev->[2])**2) : 0;
  push @$interpolated, {components=> ref $cout eq "ARRAY" ? [ map { nearest($CONF{nearest},$_) } @$cout ] : $cout,
                        i=>$i,
                        de=>$de,
                        lab=> $cintlab,
                        name=>sprintf("%s%03d",$CONF{rootname},$i)};
  $cintlabprev = $cintlab;
}

for my $c (@$interpolated) {
  printinfo(sprintf("%s = %9s # rgb %9s rgbhex %s idx %3d deltaE %6.2f lab(%s,%s,%s)",
                    $c->{name},
                    ref $c->{components} eq "ARRAY" ? sprintf("%d,%d,%d",@{$c->{components}}) : $c->{components},
                    sprintf("%d,%d,%d",lab2rgb(@{$c->{lab}})),
                    rgb2hex(lab2rgb(@{$c->{lab}})),
                    $c->{i},
                    nearest(0.01,$c->{de}),
                    (map { nearest(0.2,$_) } @{$c->{lab}})));

}

sub color2space {
  my ($components,$from,$to) = @_;
  my $fnfrom = sprintf("new_%s",$CONF{fn}{$from});
  my $fnto   = sprintf("as_%s",$CONF{fn}{$to});
  my $c      = $from =~ /hex/ ? Graphics::ColorObject->$fnfrom($components->[0]) : Graphics::ColorObject->$fnfrom($components);
  my $cout   = $c->$fnto();
  printdebug("converting",@$components,"from",$from,"to",$to,item2list($cout));
  return $cout;
}


sub interpvalue {
  my ($start,$end,$step,$steps) = @_;
  return $start + $step*($end-$start)/$steps;
}

sub item2list {
  my $x = shift;
  return ref $x eq "ARRAY" ? @$x : $x;
}

sub rgb2hsv {
  my @c = @_;
  my $c = Graphics::ColorObject->new_RGB([@c]);
  my @nc = @{$c->as_HSV()};
  printinfo(@c,@nc);
  return @nc;
}
sub hsv2rgb {
  my @c = @_;
  my $c = Graphics::ColorObject->new_HSV([@c]);
  return @{$c->as_RGB()};
}
sub rgb2hex {
  my @c = @_;
  my $c = Graphics::ColorObject->new_RGB255([@c]);
  return $c->as_RGBhex();
}
sub lab2rgb {
  my @c = @_;
  my $c = Graphics::ColorObject->new_Lab([@c]);
  return @{$c->as_RGB255()};
}

day.1 / lecture.4 / 3 / create.tracks.sh

↓

#!/bin/bash

# extract genes

grep -v \# ../1/ebola.genes.txt | cut -f 2,5,6,9 | \
    awk '{print "ebola",$2,$3,$4,"name="$1}' > ebola.genes.circos.txt

# extract SNPs

grep -v \# ../1/ebola.variation.txt | cut -f 2,3,4,10,15 | \
    perl -pe 's/\w+:\w+//' | \
    awk '{print "ebola",$1,$2,$5,"snp="$3",gene="$4}' > ebola.variation.circos.txt

day.2 / lecture.1 / 1 / generate.random.data.sh

↓

#!/bin/bash

CTOOLS=/home/martink/work/circos/svn/tools/

# Generate some random data on the human chromosome. 
#
# To list predefined rules that define data position and value, see
#
# $CTOOLS/randomdata/bin/randomdata -listrules
#
# To understand how the rules work, see the script's manpage
#
# $CTOOLS/randomdata/bin/randomdata -man

# Make 5 random data sets based on default rule (10 mb bins, standard normal distribution).
for i in `seq 1 5` ; do 
$CTOOLS/randomdata/bin/randomdata -karyotype karyotype.human.txt -ruleset default > random.$i.txt
done

# Convince yourself that the values are indeed standard normal (mean 0, sd 1)
cat random.*.txt| cut -d " " -f 4 | ../../../scripts/histogram > histogram.txt

day.4 / lecture.4 / 2 / histogram

↓

# read the values from STDIN and use the -col COLUMN, by default 0
my @values;
while(my $line = <>) {
  chomp $line;
  next if $line =~ /^\#/;
  my @tok = split(" ",$line);
  if(defined $CONF{col} && ! defined $tok[ $CONF{col} ]) {
    die "The line [$line] has no number in column [$CONF{col}]";
  }
  push @values, $tok[ $CONF{col}];
}

# get min and max of values
my ($min,$max) = minmax(@values);

# change min/max range if redefined via -min and/or -max
$min = $CONF{min} if defined $CONF{min};# && $CONF{min} > $min;
$max = $CONF{max} if defined $CONF{max};# && $CONF{max} < $max;

# *very* quick and dirty percentile if -pmin or -pmax used
if($CONF{pmin} || $CONF{pmax}) {
  my @sorted = sort {$a <=> $b} @values;
  $min = $sorted[ $CONF{pmin}/100 * (@sorted-1) ] if $CONF{pmin};
  $max = $sorted[ $CONF{pmax}/100 * (@sorted-1) ] if $CONF{pmax};
}

# determine bin size from either the -binsize command-line parameter
# or from -nbins. At least one of these is defined — we made sure of
# that in validateconfiguration()
my $bin_size;
if($CONF{binsize}) {
  $bin_size = $CONF{binsize};
  $CONF{nbins} = round(($max-$min)/$CONF{binsize});
} else {
  $bin_size = ($max-$min)/$CONF{nbins};
}

# populate bins
my $bins;
for my $i (0..$CONF{nbins}-1) {
  my $start = $min + $i * $bin_size;
  my $end   = $min + ($i+1) * $bin_size;
  my $n     = grep($_ >= $start && ($_ < $end || ( $i == $CONF{nbins}-1 && $_ <= $end)), @values);
  push @$bins, { start=>$start,
                 i=>$i,
                 end=>$end,
                 size=>$bin_size,
                 n=>$n };
}

my $histogram_width = $CONF{height};
my $maxn   = max(map { $_->{n} } @$bins);
my $sumn   = sum(map { $_->{n} } @$bins);
my $cumuln = 0;

my $bin_count_width = max(map { length($_->{n}) } @$bins);
my $bin_count_fmt   = sprintf("%%%dd",2+$bin_count_width);
my $bin_width_f_max = max(map { $_->{n}/$maxn } @$bins);

# report number and fraction of values smaller than requested minimum
printinfo(sprintf("%10s %10.4f>$bin_count_fmt %6.3f","",
                  $min,
                  int(grep($_ < $min, @values)),
                  int(grep($_ < $min, @values))/@values));

# report each bin: min, max, count, fractional count, cumulative count
for my $bin (@$bins) {
  $cumuln += $bin->{n};
  my $binwidth = $bin->{n}*$histogram_width/$maxn;
  if(defined $CONF{sqrt}) {
    my $power = $CONF{sqrt} || 0.5;
    my $f = $bin->{n}/$maxn;
    $binwidth = $f**$power/$bin_width_f_max**$power * $histogram_width;
  }
  printinfo(sprintf("%10.4f %10.4f $bin_count_fmt %6.3f %6.3f %-${histogram_width}s",
                    $bin->{start},
                    $bin->{end},
                    $bin->{n},
                    $bin->{n}/$sumn,
                    $cumuln/$sumn,
                    "*"x$binwidth));
}

# report number and fraction of values larger than requested maximum
printinfo(sprintf("%10s %10.4f<%6d %6.3f","",
                  $max,
                  int(grep($_ > $max, @values)),
                  int(grep($_ > $max, @values))/@values));

# aggregate stats for full data set
printinfo(sprintf("%8s %12d","n",int(@values)));
printinfo(sprintf("%8s %12.5f","average",average(@values)));
printinfo(sprintf("%8s %12.5f","sd",stddev(@values)));
printinfo(sprintf("%8s %12.5f","min",min(@values)));
printinfo(sprintf("%8s %12.5f","max",max(@values)));
printinfo(sprintf("%8s %12.5f","sum",sum(@values)));

day.2 / lecture.1 / 2 / make.color.ramp.sh

↓

#!/bin/bash

START=255,0,0
END=0,0,255
STEPS=9
./colorinterpolate -steps $STEPS -start $START -end $END -rootname mycolorlab | \
    tee etc/mycolors.lab.conf
./colorinterpolate -steps $STEPS -start $START -end $END -calcspace hsv -rootname mycolorhsv | \
    tee etc/mycolors.hsv.conf

day.1 / lecture.4 / 1 / make.karyotype.sh

↓

#!/bin/bash

echo "parsing UCSC assembly table and making karyotype file"
tail -1 ebola.assembly.txt | cut -f 4 | awk '{print "chr - ebola ebola 0",$1,"black"}' > ebola.karyotype.txt

day.4 / lecture.3 / 2 / make.tracks.sh

↓

#!/bin/bash

g1=Zaire_1995

# Notice how we're redirecting STDOUT to the link file and the
# STDERR to the karyotype file. This is a very useful idiom to
# generate two independent files at the same time and
# use redirection for maximum flexibility.

cat ../../lecture.2/1/ebola.multiz.txt | ../1/parsemaf -g1 $g1 -karyotype > z.vs.all.txt 2> karyotype.txt

day.1 / lecture.4 / 5 / makesnp

↓

# -seed N sets a random seed so that we can reproduce
# the random output for debugging
srand($CONF{seed}) if defined $CONF{seed};

# -dict ATGC sets the dictionary from which bases are selected
# for example you can add N's with -dict ATGCN

my @dict = split("",$CONF{dict});

# iterate over lines (pairs of sequence)
for my $i (1..$CONF{lines}) {
  # this is the "reference" sequence, which is a concatenation
  # of $CONF{len} randomly selected characters from the dictionary
  my $str = join("",map { $dict[rand(@dict)] } (1..$CONF{len}));
  print "$str\n";
  # for each position in the reference
  for my $j (1..$CONF{len}) {

    # make a snp but only if we pass the probability requirement
    # the chance of a SNP at given position is $CONF{p}
    next unless rand() < $CONF{p};

    # this is the reference base ast this position
    my $ref = substr($str,$j-1,1);

    # replace the character at this position with a randomly
    # selected character from the dictionary that is not
    # the reference base
    #
    # substr() can be an lvalue too — very useful
    substr($str,$j-1,1) = (grep($_ ne $ref, @dict))[rand(@dict-1)];
  }
  print "$str\n\n";
}

day.3 / lecture.2 / 2 / parseclustal.sh

↓

#!/bin/bash

# extract the list of sequence IDs from the alignment file

REF=hsa8K
OUTDIR=parsed
FILE=../../data/alignment.aln
cat $FILE | cut -d " " -f 1 | sort -u | grep -v -i clustal | grep "[a-z]" > $OUTDIR/id.txt

# for each sequence ID, retrieve the sequence and store it with one base per line

for id in `cat $OUTDIR/id.txt`; do
grep $id $FILE | tr -s " " | cut -d " " -f 2 | fold -c1 > $OUTDIR/$id.seq.txt
done

# paste hsa8K together with each other sequence id 
# and remove lines with "-" indicating no alignment
# and make a file of all positions (one per line) at 
# which bases are aligned

for id in `cat $OUTDIR/id.txt | grep -v $REF` ; do

# convert tabs to a single space, collapse all spaces and 
# remove leading spaces and trailing spaces (done via shrinkwrap.sh script)
paste $OUTDIR/$REF.seq.txt $OUTDIR/$id.seq.txt | cat -n | \
    ./shrinkwrap.sh > $OUTDIR/align.$REF.$id.txt

# The line below is long but none of the steps are particularly complicated
# 1. replace all bases with an X (tr)
# 2. extract runs of lines that have the same fields (uniq), but skipping first field (-f 1)
#      and counting the number of runs (-c)
# 3. remove all lines that have gaps (grep)
# 4. write the output to a file but also to STDOUT (tee)
# 5. add the name of the sequence ids to the file and compute start and end of alignment
#      based on the start of the alignment and length of runs from uniq

cat $OUTDIR/align.$REF.$id.txt | tr ATGC X | uniq -f 1 -c | \
    grep -v - | tee $OUTDIR/runs.$REF.$id.txt | \
    awk -v ref=$REF -v id=$id '{print ref,$2,$2+$1-1,id,$2,$2+1-2}' > $OUTDIR/links.$REF.$id.txt

done

day.4 / lecture.3 / 1 / parsemaf

↓

# Make sure you understand everything below all the way to the
# housekeeping section.

# Read about the MAF format at http://www.bx.psu.edu/~dcking/man/maf.xhtml

# This will be our alignment list reference. It's declared as
# a scalar because it will be a reference to a list. While it
# doesn't have to be a reference, get used to using references.
#
# Alternatively, we could have done this
#
# my @alignments;
# ...
# push @alignments, ...
# ...
# $alignments[-1]{strain}{$strain} = ...
#
# By using a list variable saves you from having to dereference
# it with @$alignments or using -> to pull out an element.
#
# Last element using @alignments list:              $alignments[-1]
# Last element using $alignments as list reference: $alignments->[-1]
#
# We can always make a list reference by 
#
# my $alignments = \@alignments;

my $alignments;

# Read from STDIN until end of stream. This way the script expects
# a pipe from another process, like cat
#
# cat .../1/ebola.multiz.txt | ./parsemaf
#
while(my $line = <>) {
  # remove the trailing new line
  chomp $line;
  # if this file matches this regular expression, then it is
  # the start of an alignment block
  if($line =~ /^a score=(.+)/) {
    # push a new alignment hash onto the list
    # for now, only the score is defined
    push @$alignments, {score=>$1};
  } elsif($line =~ /^s /) {
    # this is an individual alignment line and belongs
    # to the current alignment block, which is the last
    # element in the @$alignment list. 

    # chop up the line into fields using any whitespace as delimiter
    my @tok = split(" ",$line);
    # assign the first four elements of the tokens to 
    # individual variables. By putting 'undef' as the
    # first element we discard the first element in
    # the assignment. Another way to do this would be
    #
    # my ($strain,$start,$size,$strand,$genome_size) = @tok[1..5];
    my (undef,$strain,$start,$size,$strand,$genome_size) = @tok;
    # if the strain is in the format STR.STR (repeated string)
    # joined by a period, throw out everything after
    # the period
    if.\1/) {
      $strain =~ s/[.].*//;
    }
    # assign this alignment line to the last (current) alignment
    # block. We could have made a list out of these but it's
    # even more convenient to key them by the strain 
    $alignments->[-1]{strain}{$strain} = 
      { start=>$start,size=>$size,strain=>$strain,genome_size=>$genome_size };
  }
}
printdebug("parsed",int(@$alignments),"alignments");

# The parsing above could have been done by changing the
# input record separator in Perl, which is accessible using
# the special variable $/
#
# The default value of this variable depends on the operating
# system. For more details see https://perldoc.perl.org/perlvar.html
#
# I urge you against using these special variables (with the
# exception of $_ and @_) and get in the habit of doing things yourself. 
# Your code will be much more legible. Some of these variables
# actually slow down your code.

# Just dump the data structure to STDOUT and quit. This
# helps understand what's happening under the hood.
#
# The alignments are a list. Each list item is a hash of this form
#
#  { score  => SCORE,
#    strain => { STRAIN1 => { start=>START1,size=>SIZE1,strain=>STRAIN1 },
#                STRAIN2 => { start=>START2,size=>SIZE2,strain=>STRAIN2 },
#                STRAIN3 => { start=>START3,size=>SIZE3,strain=>STRAIN3 },
#                ...
#              }
#  }

printdumperq($alignments) if $CONF{dump};

# Make a unique list of strains. This is a compound line made up of
# several nested functions. Each is a simple step and this is
# a powerful Perl idiom when the steps are combined together.
#
# the list of alignments
# @$alignments  
#
# a list of strains (which are keys in the alignment hash) in a given alignment
# keys %{$a->{strain}}
#
# a list of strains from all alignments
# here we're basically mapping the above step to each alignment
# map { keys %{$->{strain}} } @$alignments
#
# a list of unique strains
# uniq ( map { keys %{$->{strain}} } @$alignments )
my @strains = uniq ( map { keys %{$_->{strain}} } @$alignments );

# If -g1 is not set, we just list the strains and quit. 
# I've defined printinfo() below as a wrapper for space-delimited
# print that prints _undef_ for arguments that are not defined.
if(! $CONF{g1}) {
  printinfo("*** These strains are available:");
  for my $strain (sort @strains) {
    printinfo($strain);
  }
  exit;
}

# If we defined -g1 and/or -g2 make sure that we've seen these in the file
my $quit;
for my $param (qw(g1 g2)) {
  # die if we have defined the genome we want but it's not in the list of strains.
  if($CONF{$param} && ! grep($_ eq $CONF{$param}, @strains)) {
    printinfo("ERROR: There are no alignments for -$param [$CONF{$param}]");
    # Even if there are strains that match $CONF{g1} or $CONF{g2} we can 
    # produce a list of strains that match it as a regular expression to help the user 
    # find a strain.
    my @matching_strains = grep($_ =~ /$CONF{$param}/, @strains);
    if(@matching_strains) {
      printinfo("Strains that match your input $CONF{$param} as a regular expression are");
      for (@matching_strains) {
        printinfo("  $_");
      } 
    } else {
      printinfo("No strains match your input $CONF{$param} as a regular expression.");
    }
    $quit = 1;
  }
}
die if $quit;

# Now we print out the alignments in Circos' link format
my $seen_strain;
for my $alignment (@$alignments) {
  # alignment to our reference
  my $a1 = $alignment->{strain}{$CONF{g1}};
  # the reference may be absent from this alignment, in which case go to next alignment
  next unless $a1;
  # for this alignment block, make a list of alignments to match up with
  # the reference. if we have defined -g2 STRAIN2 then it's the alignment for 
  # STRAIN2 (which may be absent). Othewise it's a list of all strains in this alignment.
  my @a2 = $CONF{g2} ? $alignment->{strain}{$CONF{g2}} : values %{$alignment->{strain}};

  # Loop over all the alignments in this block
  for my $a2 (@a2) {
    # Move to the next alignment line if this one is not defined. This shouldn't 
    # happen but it's safe to check.
    next if ! $a2;

    # Report the karyotype if we used -karyotype. These will be printed
    # to STDERR. The reason for this is that we can write to two 
    # files at the same time by redirecting STDOUT and STDERR to 
    # different files
    #
    # ./parsemaf > stdout.txt 2> stderr.txt
    #
    # Or we can combine the two
    #
    # ./parsemaf > everything 2>&1

    if($CONF{karyotype} && ! $seen_strain->{ $a1->{strain} }++) {
      printerr(sprintf("chr - %s %s 0 %d black",
                       $a1->{strain},$a1->{strain},$a1->{genome_size}));
    }
    if($CONF{karyotype} && ! $seen_strain->{ $a2->{strain} }++) {
      printerr(sprintf("chr - %s %s 0 %d black",
                       $a2->{strain},$a2->{strain},$a2->{genome_size}));
    }

    # We're reporting alignment pairs, so skip if we're asking for self-alignment
    next if $a1->{strain} eq $a2->{strain};
    printinfo(
              $a1->{strain},
              $a1->{start},
              $a1->{start} + $a1->{size} - 1,
              $a2->{strain},
              $a2->{start},
              $a2->{start} + $a2->{size} - 1,
              sprintf("offset=%d",$a2->{start}-$a1->{start}),
             );
  }
}

day.4 / lecture.2 / 2 / parsemaf.sh

↓

#!/bin/bash

# Insert alignment record number in front of each line in the alignment
cat ../1/ebola.multiz.txt | grep -v \# | \
    awk 'BEGIN {RS="\n\n"} {print gensub(/^|\n/,""sprintf("\n%04d",NR)" ","g")}' > ebola.multiz.numbered.txt

# Create temporary files with score and Zaire and Reston lines from
# each alignment, sorted by the alignment record number. Do not
# report the "i" lines
for var in score Zaire_1995 Reston_1996 ; do
grep $var ebola.multiz.numbered.txt | grep -v -w i | sort > tmp.$var.txt
done

# Join the lines on the alignment record number.
join tmp.score.txt tmp.Zaire_1995.txt | join - tmp.Reston_1996.txt > tmp.z.vs.r.txt

# Pull out the relevant columns and remove everything
# after the period.
cut -d " " -f 3,5-7,12-14 tmp.z.vs.r.txt | sed 's/[.][^ ]*//gi' > tmp.z.vs.r.2.txt

# Reformat
cat tmp.z.vs.r.2.txt | awk '{ print $2,$3,$3+$4,$5,$6,$6+$7,"offset="$6-$3","$1 }' > z.vs.r.txt

# Remove tmp files
\rm -f tmp*

day.4 / lecture.1 / refresher

↓

$\ = "\n"; # default end of line delimiter
$, = " ";  # default field delimiter

my $x = 1;
my @x = (1,2,3);
my %x = (a=>1,b=>2,c=>3);

# Note that $x @x and %x are all DIFFERENT variables, even
# though they have the same name. There is the scalar x, list x and hash x.
# For now, consider them to be independent and unrelated. 
#
# Avoid doing things like
#
# @xlist = (1,2,3)
#
# since @xlist is obviously a list. 
#
# It's generally bad practise to add the variable type to its name. However,
# I do this in this script for clarity. For example, the variable $xlistref
# would probably just be called $things or $fruits.
#
# Treat the words "list" and "array" as synonymous. Technically, an array
# is the data type and the list is the thing that it contains, but this
# is a minor distinction. For compound lists (e.g. lists of lists) you would
# say "two-dimensional array" and not "two-dimensional list".

print ("if we print a list we get",@x);
print ("if we print a hash we get",%x);

print ("list via Data::Dumper");
print Dumper(\@x);
print ("hash via Data::Dumper");
print Dumper(\%x);

my $xlistref = \@x;
my $xhashref = \%x;

print ("xlistref",$xlistref,"is reference to",ref $xlistref);
print ("xlistref",$xhashref," is reference to",ref $xhashref);

# Dereference the references back to their original types

my @y = @$xlistref;
my %y = %$xhashref;

# looping over a list

for my $item (@x) {
  print ("looping over list",$item);
}

# when you get comfortable with the idea that $x and @x
# are different variables, you can do this

for my $x (@x) {
  print ("looping again over list",$x);
}

for my $i (0..@x-1) {
  print ("looping over list index",$i,"item",$x[$i]);
}

# looping over a reference of the list — requires that
# we dereference the list by prepending the list sigil @
# to the variable.
#
# to access a list item via reference, you use ->

for my $i (0..@$xlistref-1) {
  print ("looping over list ref index",$i,"item",$xlistref->[$i]);
}

# looping over keys in a hash
for my $key (keys %x) {
  print ("key",$key,"value",$x{$key});
}

# looping over keys in a hash reference
#
# to access a hash item via reference you use ->
for my $key (keys %$xhashref) {
  print ("key",$key,"value",$xhashref->{$key});
}

# variables can be evaluated in a context to force
# an "interpretation" of the variable

# when a list is evaluated in scalar context (i.e. it is
# being assigned to a variable that is explicitly a scalar)
# you get the number of items in the list.

my $scalar = @x;
print("list as scalar",$scalar);

# Note that many contexts are list context such as 
#
# print @x
# for (@x)
#

# Since perl 0-indexes lists, the last object is at index

my $lastidx = @x-1;
print("last index",$lastidx);

# The loop below cycles over the indexes in @x

for my $i (0..@x-1) {
  print("index",$i,"value",$x[$i]);
}

# You can also access the last index via the special variable $#x
# so you can write the loop like this

for my $i (0..$#x) {}

# Personally, I do not like this notation and always write @x-1. This 
# special variable is very very Perlish (Perl has lots of internal
# variables like this and it's up to you how many of them you want to use).
#
# http://perldoc.perl.org/perlvar.html

# For example, the current iterator in a for loop is accessed by $_

for (@x) {
  print("current item",$_);
}

# Evaluating a scalar in list context

$x = "hello";
my @list = $x;

# simply gives you a single element list ("hello").

# To quickly define lists, use the word quotation operator qw()

@list = qw(some words in a list);

# This is much faster than

@list = ("some","words","in","a","short","list");

# If you evaluate a list in a hash context, Perl will expect an
# even number of elements and assign them as key/value pairs. Notice
# that Perl will interpolate variables inside a string defined 
# with double quotes.

my %wordhash = @list;
for my $key (keys %wordhash) {
  print "$key -> $wordhash{$key}";
}

# One of the reasons why Perl was very popular in its day was its
# support for regular expressions
#
# https://perldoc.perl.org/perlre.html

my $str = "abccddfffhiiijj";
if($str =~ /c/) {
  print($str,"matches");
}

# For example to match any pair of identical characters you would
# use (.)\1 where . is the character class for any character, the
# (.) is a capture bracket which stores the match in an internal
# variable accessible immediately by \1.
#
# Then during replacement this internal variable is available as $1. The
# notation is different (e.g. vs \1) because we're no longer in the middle
# of doing the match. The match is done and we're now replacing strings.

$str =~ s/(.)\1/[2 x $1]/g;
print("replaced",$str);

# Regular expressions are a big topic, but you only need to know a few
# important things to get started. You can test-drive your regular expressions here
#
# https://regex101.com/

# You'll often be splitting strings into fields. 

$str = "10 20    30 55   102 28";

# Using " " tells split to treat any amount of whitespace as a delimiter

my @tok = split(" ",$str);
print("split line into",@tok);
print("second field is",$tok[1]); # remember lists are 0-indexed

# You can now select items that you want

for my $tok (@tok) {
  if($tok eq 10) { # string comparison
    # ...
  }
  if($tok =~ /0/) {
    print("item",$tok,"has a 0");
  }
}

# Alternatively, a Perlish thing to do is to apply grep() to the list.
# Trep takes a condition as the first argument, which can be the
# regular expression

for my $tok (grep(/0/, @tok)) {
  print("item",$tok,"has a 0 via grep");
}

# Strictly speaking grep(EXPRESSION,LIST) passes each value from LIST
# to the EXPRESSION via $_ and the expression is evaluated. If it
# evaluates to true then grep returns the list item, otherwise it does not.

for my $tok (grep($_ =~ /0/, @tok)) {}

# Grep is a *great* way to filter a list.

# To apply code to each list item, use map()

my @newtok = map { 2*$_ } @tok;
print("remapped list",@newtok);

# The code in { } can be anything and the new list is composed of the
# values returned by the code, as evaluated on each list element via $_

# You can combine map { } and grep(). For example, here we
# multiply each element that contains a 0 by 2. The list is not
# modified — both grep and map return copies of the elements.

print("map{} and grep()",map { 2*$_ } grep(/0/, @tok));

# To pull out all numbers made out of digits from a string

my $input = "aa1bb22ccc333dd444";
while( $input =~ /(\d+)/g ) {
  print "Found $1 in $input";
}

# There is a lot going on here. The regular expression operator =~ applies
# the regular expression (\d+) to $input. Because the regular expression has capture
# brackets and we're using /g and this is evaluated in a while loop, the operator
# will continue to return as many matches as it can find. The substring matched will be stored in the 
# special variable $1, which you can think of as the result of the first capture bracket.

# Below I add a second set of capture brackets to the regular expression, which grabs
# the letters immediately following the digit. Here the + means "one or more". 

while( $input =~ /(\d+)([a-z]+)/g ) {
  print "Found digit/letter combo $1/$2 in $input";
}

# A final useful regular expression character are the string anchors ^, which 
# matches the start of a string, and $, which matches the end. Thus

if($input =~ /\d$/) { }

# will match if $input ends in a digit and

if($input =~ /^\d/) { }

# will match if it starts with a digit. To check that the string is made up exclusively
# of digits you would do this

if($input =~ /^\d$/) { }

day.1 / lecture.2 / 5 / resample.sh

↓

#!/bin/bash

# This is the directory where the Circos tools are installed
# on my computer at work. But for your setup, it'll be different.
# Change this to the right directory. Take a look in /root/modules/circos
# to find it.
CTOOLS=/home/martink/work/circos/svn/tools/

# Let's make data sets for both the count and average size for 50, 75 and 100 kb

for binkb in 50 75 100 ; do

  binsize=$((1000*binkb))

  echo "Resampling at $binsize"
  cat ../../data/genes.txt | $CTOOLS/resample/bin/resample -bin $binsize -count > genes.count.${binkb}kb.txt
  cat ../../data/genes.txt | $CTOOLS/resample/bin/resample -bin $binsize -avg   > genes.avgsize.${binkb}kb.txt

done

# Look at the manpage of the 'resample' script by running 'resample -man' You'll notice
# that it can also report the minimum and maximum values in a bin. Add these
# to the loop above to create files for these variables (e.g. genes.maxsize.100kb.txt)
#
# You transform the vertical scale in Circos but you can also do it
# when generating the data file. Here 'awk' is very helpful and its log() function
# which gives the natural logarithm. So log(x)/log(10) is the base 10 logarithm.

cat genes.count.50kb.txt | awk '{print $1,$2,$3,log($4)/log(10)}' > genes.count.50kb.log.txt

# Write a loop that applies this transformation to all the bins (50, 75, 100 kb) and all the
# quantities (count, avg, min, max). Watch out for bins with value 0 — the log() call
# will produce an error. How would you deal with this?

day.3 / lecture.2 / 1 / run.sh

↓

#!/bin/bash

# This is the directory where the Circos tools are installed
# on my computer at work. But for your setup, it'll be different.
# Change this to the right directory. Take a look in /root/modules/circos
# to find it.
CTOOLS=/home/martink/work/circos/svn/tools/

mkdir data
$CTOOLS/clustal2link/bin/clustal2link -aln ../../data/alignment.aln -idref hsa8K -dir data -debug

day.4 / lecture.4 / 2 / showhistogram.sh

↓

#!/bin/bash

# show the histogram of 1000 uniformly distributed random numbers
seq 1 1000 | awk 'BEGIN { srand() } {print rand()}' | ./histogram | tee histogram.uniform.txt

# show the histogram of 1000 normally(ish) (guaranteed by the Central
# Limit Theorem) random numbers generated from average of n = 10 samples
# of uniform random numbers
seq 1 1000 | \
    awk 'BEGIN { srand();N=10 } { samplesum=0; for(i=1;i<=N;++i) samplesum+=rand(); print samplesum/N}' | \
    ./histogram | tee histogram.normal.txt

# show the histogram of the 10-90% of offsets
cat ../../lecture.3/2/z.vs.all.txt | sed 's/.*=//' | \
    ./histogram -bins 15 -pmin 10 -pmax 90 | tee histogram.10.90.txt

# show the histogram of offsets between -300 and 20% (-45).
cat ../../lecture.3/2/z.vs.all.txt | sed 's/.*=//' | \
    ./histogram -bins 15 -min -300 -pmax 20 | tee histogram-300.p20.txt

day.3 / lecture.2 / 2 / shrinkwrap.sh

↓

#!/bin/bash

# default delimiter is a space
#
# cat file.txt | shrinkwrap.sh
#
# but you can set it by passing an argument
#
# cat file.txt | shrinkwrap.sh ,

# Assign the first argument to delimiter unless it's not defined
# in which case assign just a space
#
#   VAR=${1:-DEFAULTVALUE}
#

DELIM=${1:- }

# all spaces to tabs
expand -t1 | \
    # collapse all spaces 
tr -s " " | \
    # only keep lines that have a non-space character
grep '[^ ]' | \
    # remove leading and trailing spaces
sed 's/^[ ]//g' | sed 's/[ ]$//g' | \
    # remove lines that begin with comment field
grep -v -e '^\#' |
    # replace the space with our delimiter string (space by default)
tr " " "$DELIM"

day.1 / lecture.4 / 5 / testsnp.sh

↓

#!/bin/bash

LEN=15
LINES=3

perl makesnp -p 0.1 -len $LEN -lines $LINES -seed 0 > snp.txt

# 0. filter blank lines
# 1. fold the lines to make one character per line
# 2. calculate
#
#    rec - record number (line number modulo 2), 0|1
#   pair - index of line pair (0,1,2,3...)
#    pos - position along the line
#
# 3. each base gets a unique tag: pair.pos.base
#
# 4. positions at which there are snps will only have one entry of this
#    tag, which we find by count unique tags and then looking for ones
#    that only appear once
#
# The uniq -w7 flag compares only the first 7 characters (length of tag)

grep -v "^$" snp.txt | fold -w1 | \
    awk -v len=$LEN -v lines=$LINES \
    '{ rec=NR%2; pair=int((NR-1)/(2*len)); pos=(NR-1)%len; \
       printf("%02d.%02d.%s %d %d\n",pair,pos,$1,rec,pair*len+pos) }' > snp.mod.txt

sort snp.mod.txt | uniq -w7 -c | sed 's/^ *//' | grep ^1 | sort -n -k 4,5  -n -k 3,4 | tee snp.report.txt

cat snp.report.txt | sed 's/\./ /g'| awk '{print $5,$6,$4}' > tmp.txt

grep ^0 tmp.txt | cut -d " " -f 2- | sort > tmp.1.txt
grep ^1 tmp.txt | cut -d " " -f 2- | sort > tmp.2.txt

join tmp.1.txt tmp.2.txt | sort -n > snp.list.txt

\rm tmp.*.txt