functions.pl

# General-purpose functions called by other scripts

# Return a hash of config parameters from the file QC.cfg
# in the format
#
# PARAMETER=value
#
#
sub read_config_params()
{
my $bindir = dirname(abs_path($0)) . "/";
my $cfgfile = $bindir . "QC.cfg";
my %cfgparams;
my $incfg; 
open($incfg,$cfgfile) or die "Cannot open $cfgfile\n";
while($line=<$incfg>)
{
        chomp $line;
        ($key,$value) = split "=",$line;
        $cfgparams{$key} = $value;
}
close $incfg;

return %cfgparams;
}

1;

GBS_qual_control1.pl

#!/usr/bin/perl -w
use strict;

use Cwd 'abs_path';
use File::Basename;
my $bindirectory = dirname(abs_path($0)) . "/";
require $bindirectory . "functions.pl";

# Get machine-specific configuration parameters
my %cfgparams = read_config_params();

my ($flowcell, $qseq, $gz, $enzyme, $lane_file, $zea_lane_file, $s_lane, $insubdir, $prefix);
my $usage = 'GBS_qual_control1.pl  FLOWCELL  qseq|fastq  txt|gz  ApeKI|PstI|EcoT22I lane_file maize_lane_file insubdir';
if ($ARGV[0] && $ARGV[1] && $ARGV[2] && $ARGV[3] && $ARGV[4] && $ARGV[5]) {
  $flowcell = $ARGV[0];
  $lane_file = $ARGV[4];     # file with one space-delimited line per (present) enzyme, e.g., ApeKI 1 4 6
  $zea_lane_file = $ARGV[5];# one maize lane per line
  $insubdir = $ARGV[6];
  if ($ARGV[1] eq "qseq") {
      $qseq = 1;
  } elsif ($ARGV[1] eq "fastq") {
      $qseq = 0;
  } else {
      die "USAGE: $usage\n\n\n";
  }

  if ($ARGV[2] eq "gz") {
      $gz = 1;
  } elsif ($ARGV[2] eq "txt") {
      $gz = 0;
  } else {
      die "USAGE: $usage\n\n\n";
  }

  if ($ARGV[3] eq "ApeKI") {
      $enzyme = "ApeKI";
  } elsif ($ARGV[3] eq "PstI") {
      $enzyme = "PstI";
  } elsif ($ARGV[3] eq "EcoT22I") {
      $enzyme = "EcoT22I";
  } else {
      die "Unknown enzyme: $ARGV[3]\n\nUSAGE: $usage\n\n\n";
  }
} else {
    die "USAGE: $usage\n\n\n";
}

# grab the lanes for this enzyme
my ($txtline, $junk);
my @Lanes = ();
open IN, "<$lane_file" or die "ERROR: Cannot open lanes file $lane_file: $!\n\n";
while($txtline = <IN>)
{
	chomp $txtline;
	if($txtline =~ m/$enzyme/)
	{
		($junk, @Lanes) = split " ", $txtline;
	}

}
close IN;
if($#Lanes < 0)
{
	print STDOUT "No lanes to process for this enzyme...\n";
	exit;
}

# Grab the maize lanes
my %maizelanes = (1 => "", 2 => "", 3 => "", 4 => "", 5 => "", 6 => "", 7 => "", 8 => "", "ALL" => "");
open IN, "<$zea_lane_file" or die "ERROR: Cannot open lanes file $zea_lane_file: $!\n\n";
while($txtline = <IN>)
{
        chomp $txtline;
	$maizelanes{$txtline} = 1;
}
close IN;
# In the future, do some consistency checking on the lanes provided - should be all different and between 1 and 8

#my $key_file = "/usr/local/maizediv/illumina/qual_control/" . $flowcell . "/" . $flowcell . "_key.txt"; 
#my $key_file = "/cbsufsrv4/data1/maizediv/illumina/PerkinElmer/" . $flowcell . "/" . $flowcell . "_key.txt";   
# RB change
my $flowcellroot = $cfgparams{"FLOWCELLROOT"};
my $key_file = $flowcellroot . $flowcell . "/" . $flowcell . "_key.txt";   

# Data Input files - take them from lane list on command line
my %infiles;  # key = lane, value = filename
#$infiles{1} = 'D09FYACXX_1_fastq.gz';
#$infiles{2} = 'B064FABXX_2_qseq.txt.gz';
#$infiles{3} = 'B064FABXX_3_qseq.txt.gz';
#$infiles{4} = 'B064FABXX_4_qseq.txt.gz';
#$infiles{5} = 'B064FABXX_5_qseq.txt.gz';
#$infiles{6} = 'B064FABXX_6_qseq.txt.gz';
#$infiles{7} = 'D09FYACXX_7_fastq.gz';
#$infiles{8} = '10247084_C00R8ABXX_s_8_qseq.txt.gz';
my $subdirstr = "";
if($insubdir) 
{ 	
	$subdirstr = "fastq/";
	if($qseq) { $subdirstr = "qseq/"; }
}
my $flestr;
my $cmd;
foreach (@Lanes)
{
	$cmd = "ls -1 " . $flowcellroot . $flowcell . "/" . $subdirstr . "*$flowcell" . "*_$_" . "_*.gz";
	$flestr = `$cmd`;
	chomp $flestr;
	$infiles{$_} = $flestr;
}

# my $output_path = '/usr/local/maizediv/illumina/qual_control/' . $flowcell;
# my $output_path = '/cbsufsrv4/data1/maizediv/illumina/qual_control/' . $flowcell . '/';
my $output_path = "./$ARGV[3]/";    # call it after enzyme
#$output_path = $qseq ? $output_path . '/prefilter/' : $output_path . '/postfilter/';  # no longer relevant with Casava 1.8

print STDOUT "\n\nGreetings Panzean! Let's test the quality of Illumina flow cell $flowcell by checking for barcodes & cut sites...\n\n";
print STDOUT "The following files will be processed:\n\n";
foreach my $key (keys %infiles)
{
        if($infiles{$key} eq "") { next; }
        print STDOUT "Lane $key: $infiles{$key}";
	if($maizelanes{$key} ne "")
	{
		print STDOUT " (maize lane) ";
	}
	else
	{
		print STDOUT " (non-maize lane) ";
	}
	print STDOUT "\n";
}
print STDOUT "\n";

# exit;   # just for test

my ($cut_site_remnant, $cut_site_remnant1, $cut_site_remnant2, $common_adapter, $common_adapter1, $common_adapter2);

if ($enzyme eq "ApeKI") {
  $cut_site_remnant =  'C[AT]GC';
  $cut_site_remnant1 = 'CAGC';
  $cut_site_remnant2 = 'CTGC';
  $common_adapter = 'C[AT]GAGATCGGAAGAGCGGTTCAGCAGG';  # first 27 bases
  $common_adapter1 =   'CAGAGATCGGAAGAGCGGTTCAGCAGG';  # first 27 bases
  $common_adapter2 =   'CTGAGATCGGAAGAGCGGTTCAGCAGG';  # first 27 bases
  #                        AGATCGGAAGAGCGGTTCAGCAGG  = common adapter minus overhang
} elsif ($enzyme eq "PstI") {
  $cut_site_remnant =  'TGCAG';
  $cut_site_remnant1 = 'TGCAG';
  $cut_site_remnant2 = 'none';
  $common_adapter =    'TGCAAGATCGGAAGAGCGGTTCAGCAGG';  # first 28 bases
  $common_adapter1 =   'TGCAAGATCGGAAGAGCGGTTCAGCAGG';  # first 28 bases
  #                         AGATCGGAAGAGCGGTTCAGCAGG  = common adapter minus overhang
  $common_adapter2 =   'none';  # first 27 bases
} elsif ($enzyme eq "EcoT22I") {
	$cut_site_remnant =  'TGCAT';
	$cut_site_remnant1 = 'TGCAT';
	$cut_site_remnant2 = 'none';
	$common_adapter =    'TGCAAGATCGGAAGAGCGGTTCAGCAGG';  # first 28 bases
	$common_adapter1 =   'TGCAAGATCGGAAGAGCGGTTCAGCAGG';  # first 28 bases
	#                         AGATCGGAAGAGCGGTTCAGCAGG  = common adapter minus overhang
	$common_adapter2 =   'none';  # first 27 bases
}

# read in the barcodes (read from a file for now -- will read from DB later?)
my %barcodes;          # key1 = lane, key2 = barcode, value = sample
my %barcode_platemap;  # key1 = lane, key2 = barcode, value = plate__row__column
my %barcode_counts;    # key1 = lane, key2 = barcode, value = count
my %barcode_n;         # key = lane, value = n_barcodes in that lane that are not "empty" or "blank"  (case insensitive)
my %plates_by_lane;             # key = lane, value = plate
my ($tot_n_barcodes, $sum_sqrs, $sum_sqrs_lane, $mean, $sd, $coeff_var);
my @input_line;
open IN, "<$key_file" or die "ERROR: Cannot open multiplex key file $key_file: $!\n\n";
<IN>;  # skip header line
while (<IN>) {
  s/\r$//;  # convert DOS to unix if necessary
  chomp;
  @input_line = split "\t";
  my ($fcell, $lane, $barcode, $sample, $plate, $row, $col) = @input_line[0,1,2,3,4,5,6];
  die "ERROR: Flow cell mismatch in barcode key file: $fcell (expect $flowcell)\n\n" if $fcell ne $flowcell;
  if (exists $infiles{$lane} ) {   # only work with the lanes that are in the infile list
    $barcodes{$lane}{$barcode} = $sample;
    $barcode_platemap{$lane}{$barcode} = join "\t", $plate, $row, $col;
    $plates_by_lane{$lane} = $plate;
    $barcode_counts{$lane}{$barcode} = 0;
    unless ($sample =~ /empty/i || $sample =~ /blank/i) {
      ++$barcode_n{$lane};
      ++$tot_n_barcodes;
    }
  }
}
print STDOUT "\nFinished reading barcodes for $tot_n_barcodes non-blank samples from the key file...\n";
close IN;


# Output files:
my $outfile;  # a fasta file (one per lane) containing each trimmed read (barcode removed) that passes our criteria (barcode & cut site, no Ns in 64 bp, no adapter dimers, etc)
              #   - these LARGE files will be sampled from for BLAST to a set of reference reads in part 2 = nextgen_qual_control2.pl
              #   - the files should be deleted thereafter to save disk space

my $rptfile = $output_path . $flowcell . "_QC1.txt";  # pass_counts by lane
open RPT, ">$rptfile" or die "ERROR: Cannot open output file \'$rptfile\': $!\n\n";
print RPT  join("\t",  "Lane",
                       "Plate",
                       "Pass_Count", 
                       "Read_Count", 
                       "Pass_Rate", 
                       "Pass_Filter_Count", 
                       "Pass_Filter_Rate", 
                       "Adapter_dimers", 
                       "n_samples",
                       "Mean_count",
                       "SD_count",
                       "CV_count_(%)"),
           "\n";

my $rptfile2 = $output_path . $flowcell . "_QC2.txt";  # pass_counts by barcode within lane

my $dimer_file = $output_path . $flowcell . "_adapter_dimers.txt";  
open DIMERS, ">$dimer_file" or die "ERROR: Cannot open reverse adapter file \'$dimer_file\': $!\n\n";

my $lane;
my @line;
my $N_char;
my ($barcode, $seqlen, $dimer);
my ($machine,$run,$fc,$lane_num,$tile,$x,$y,$seq,$qual,$pair_member,$filtered,$cntrl,$pass_filter);
my $barcode_matched;  # boolean

my ($read_count, $pass_count, $pass_filter_count, $n_short_seqs, $n_dimers) = (0, 0, 0, 0, 0, 0);
my ($pass_count_lane, $read_count_lane, $pass_filter_count_lane, $pass_filter_rate_lane, $pass_filter_rate, $n_short_seqs_lane, $n_dimers_lane);
foreach $lane (sort {$a <=> $b} keys %infiles) {
  $outfile =  $output_path . $flowcell . "_" . $lane . "_QC.fasta";
  if($maizelanes{$lane} ne "")
  {
        open OUT, ">$outfile" or die "ERROR: Cannot open output file \'$outfile\': $!\n\n";
  }
  else
  {
	print STDOUT "Lane $lane is not maize - no fasta file will be created.\n";
  }

  ($pass_count_lane, $read_count_lane, $pass_filter_count_lane, $n_short_seqs_lane, $n_dimers_lane) = (0, 0, 0, 0, 0, 0);
  if ($gz) {
    open IN, "gunzip -c $infiles{$lane} |" or die "ERROR: Cannot open compressed (*.gz) input file: $!\n\n\n";
  }
  else {
    open IN, "<$infiles{$lane}" or die "ERROR: Cannot open input file: $!\n\n\n";
  }
  print STDOUT "\n\nReading data from the input file:  $infiles{$lane}...\n";
  while (<IN>) {
    $barcode_matched = 0;
    chomp;
    if ($qseq) {
      $N_char = '.';
      if (/^@/) {
        die "\n\n***ERROR: looks like a fastq file (not qseq)***\n\n";
      }
      @line = split "\t";
      ($machine,$run,$lane_num,$tile,$x,$y,$seq,$qual,$pass_filter) = @line[0,1,2,3,4,5,8,9,10];
      $pass_filter_count += $pass_filter;
      $pass_filter_count_lane += $pass_filter;
    }
    else {  # fastq
      $N_char = 'N';
      my $line1 = $_;
#      unless ($line1 =~ s<^@(\S+)#0/1$><$1>)  # old style fastq 
      unless ($line1 =~ s<^@(\S+)\s(\S+)$><$1:$2>) # Cassava 1.8 fastq. Replace space with colon 
      {
        die "\n\n***ERROR: looks like a qseq file (not fastq)***\n\n";
      }
      ($machine,$run,$fc,$lane_num,$tile,$x,$y,$pair_member,$filtered,$cntrl) = split ":", $line1;
      warn "WARNING: Flowcell ($fc) does not match expected ($flowcell)\n" if $fc ne $flowcell;
      $pass_filter = $filtered eq 'N'? 1 : 0;  # N means "not filtered" which means that it passed filter
      $pass_filter_count += $pass_filter;
      $pass_filter_count_lane += $pass_filter;
      chomp($seq = <IN>);
      my $line3 = <IN>;
      chomp($qual = <IN>);
    }

    my $cutsite_start = index($seq, $cut_site_remnant1);
    if ($cutsite_start > 3 && $cutsite_start < 11) {
      $barcode = substr($seq, 0, $cutsite_start);
      if (exists $barcodes{$lane}{$barcode}) {
        $barcode_matched = 1;
        $seq =~ s/^$barcode//;  # trim the barcode
      }
    }
    else {
      $cutsite_start = index($seq, $cut_site_remnant2);
      if ($cutsite_start > 3 && $cutsite_start < 11) {
        $barcode = substr($seq, 0, $cutsite_start);
        if (exists $barcodes{$lane}{$barcode}) {
          $barcode_matched = 1;
          $seq =~ s/^$barcode//;  # trim the barcode
        }
      }
      else {   # check for adapter/adapter dimers
        my $common_adapter_start = index($seq, $common_adapter1);
        if ($common_adapter_start > 3 && $common_adapter_start < 11) {
          $barcode = substr($seq, 0, $common_adapter_start);
          if (exists $barcodes{$lane}{$barcode}) {
            ++$n_dimers_lane;
            ++$n_dimers;
            $seq =~ s/^$barcode//;  # trim the barcode
            print DIMERS ">", join(":", $machine, $run, $flowcell, $lane_num, $tile, $x, $y, $barcodes{$lane}{$barcode}, $pass_filter), "\n";
            print DIMERS $barcode, "__", $seq, "\n";
          }
        }
        else {
          $common_adapter_start = index($seq, $common_adapter2);
          if ($common_adapter_start > 3 && $common_adapter_start < 11) {
            $barcode = substr($seq, 0, $common_adapter_start);
            if (exists $barcodes{$lane}{$barcode}) {
              ++$n_dimers_lane;
              ++$n_dimers;
              $seq =~ s/^$barcode//;  # trim the barcode
              print DIMERS ">", join(":", $machine, $run, $flowcell, $lane_num, $tile, $x, $y, $barcodes{$lane}{$barcode}, $pass_filter), "\n";
              print DIMERS $barcode, "__", $seq, "\n";
            }
          }
        }
      }
    }
    if ($barcode_matched) {
      if (index($seq,$N_char) == -1 || index($seq,$N_char) > 63) {   # sequences with N's don't pass
        ++$barcode_counts{$lane}{$barcode};
        unless ($barcodes{$lane}{$barcode} =~ /empty/i || $barcodes{$lane}{$barcode} =~ /blank/i) {
         if($maizelanes{$lane} ne "")
         {
          print OUT ">", join(":", $machine, $run, $flowcell, $lane_num, $tile, $x, $y, $barcodes{$lane}{$barcode}, $pass_filter), "\n";
          print OUT "$seq\n";
         }
          ++$pass_count;
          ++$pass_count_lane;
        }
      }
    }
    ++$read_count;
    ++$read_count_lane;
  }

  $sum_sqrs_lane = 0;
  foreach my $brcd (keys %{ $barcode_counts{$lane} } ) {
    unless ($barcodes{$lane}{$brcd} =~ /empty/i || $barcodes{$lane}{$brcd} =~ /blank/i) {
      $sum_sqrs_lane += $barcode_counts{$lane}{$brcd}**2;
      $sum_sqrs += $barcode_counts{$lane}{$brcd}**2;
    }
  }

  close IN;
  if($maizelanes{$lane} ne "") { close OUT; }
  $mean = $pass_count_lane/$barcode_n{$lane};
  $sd = sqrt( ($sum_sqrs_lane/$barcode_n{$lane})-($mean**2) );
  $coeff_var = $mean ? sprintf("%.2f",100*$sd/$mean) : 0;
#  if ($qseq) {  #### Cassava 1.8 fastq now contains both unfiltered (passed filter) and filtered reads
    $pass_filter_rate_lane = $read_count_lane ? sprintf("%.4f", $pass_filter_count_lane/$read_count_lane) : 0;
#  }
#  else {
#    $pass_filter_rate_lane = "NA";
#  }
  my $pass_rate_lane = $read_count_lane ? $pass_count_lane/$read_count_lane : 0;
  print RPT join "\t", $lane, $plates_by_lane{$lane}, $pass_count_lane, $read_count_lane, sprintf("%.4f", $pass_rate_lane),
                       $pass_filter_count_lane, $pass_filter_rate_lane, $n_dimers_lane, $barcode_n{$lane},
                       sprintf("%.0f", $mean), 
                       sprintf("%.0f", $sd), 
                       $coeff_var;
  print RPT "%\n";  
  print STDOUT "...finished processing $ARGV[1] file $infiles{$lane} for plate $plates_by_lane{$lane}\n"
           .     "pass filter count:  $pass_filter_count_lane\n"
           .     "adapter dimers:     $n_dimers_lane\n"
           .     "barcode_CV:         $coeff_var%\n" 
           .     "pass count:         $pass_count_lane\n" 
           .     "read count:         $read_count_lane\n";
}


$mean = $pass_count/$tot_n_barcodes;  
$sd = sqrt( ($sum_sqrs/$tot_n_barcodes) - ($mean**2) );
$coeff_var = $mean ? sprintf("%.2f",100*$sd/$mean) : 0;
#  if ($qseq) {  #### Cassava 1.8 fastq now contains both unfiltered (passed filter) and filtered reads
  $pass_filter_rate = $read_count ? sprintf("%.4f", $pass_filter_count/$read_count) : 0;
#}
#else {
#  $pass_filter_rate = "NA";
#}
my $pass_rate = $read_count ? $pass_count/$read_count : 0;
print RPT join "\t", 'ALL', 'ALL', $pass_count, $read_count, sprintf("%.4f", $pass_rate), 
                            $pass_filter_count, $pass_filter_rate, $n_dimers, $tot_n_barcodes,
                            sprintf("%.0f", $mean), 
                            sprintf("%.0f",  $sd),
                            $coeff_var;
print RPT "%\n";  
close RPT;

open RPT, ">$rptfile2" or die "ERROR: Cannot open output file \'$rptfile2\': $!\n\n";
print RPT join("\t", "Lane_Barcode", "Lane", "Barcode", "Sample", "Pass_Count", "Plate", "Row", "Col"), "\n";
foreach my $ln (sort {$a<=>$b} keys %barcode_counts) {
  foreach my $brcd (sort keys %{ $barcode_counts{$ln} } ) {
    print RPT $ln, "_", $brcd, "\t", join("\t", $ln, $brcd, $barcodes{$ln}{$brcd}, $barcode_counts{$ln}{$brcd}, $barcode_platemap{$ln}{$brcd}), "\n";
  }
}

print STDOUT "\n\nFinished!\n\n" 
           . "  In total, $pass_filter_count reads passed filter\n"
           . "  and       $n_dimers adaptor dimers were detected\n"
           . "  and       $pass_count reads matched the barcode and cut site and had no N's within the 64 base read\n"
           . "  out of    $read_count reads examined (in $ARGV[1] format)\n\nSee the following files for output:\n"
           . "\t$outfile\n"
           . "\t$rptfile\n"
           . "\t$rptfile2\n"
           . "\t$dimer_file\n";

print STDOUT "\n\n\nFare thee well, Panzean!!\n\n\n";

GBS_qual_control2.pl

#!/usr/bin/perl -w
use strict;
#use Bio::SearchIO;
use POSIX qw(floor);
# use File::Temp "tempfile";

use Cwd 'abs_path';
use File::Basename;
my $bindirectory = dirname(abs_path($0)) . "/";
require $bindirectory . "functions.pl";

# Look for the configuration file called QC.cfg located in the directory where the script
# is called from...
my %cfgparams = read_config_params();

my $blastdb = $cfgparams{"BLASTDB"};
my $blastprog = $cfgparams{"BLASTPROG"};

# This will hold zea lanes specified in $maize_lanes_list
my %maizelanes = (1 => "", 2 => "", 3 => "", 4 => "", 5 => "", 6 => "", 7 => "", 8 => "", "ALL" => "");

my $flowcell;
my $maize_lanes_list;
if ($#ARGV == 1) {
  $flowcell =  $ARGV[0];
  $maize_lanes_list = $ARGV[1];
}
else {
  die "\n\nUSAGE: GBS_qual_control2.pl FLOWCELL maize_lanes_list\n\n\n";
}

# read in the maize lanes (one lane per line)
open IN, "<$maize_lanes_list" or die "ERROR: Cannot open maize lanes file $maize_lanes_list: $!\n\n";
my $line;
while($line=<IN>)
{
	chomp $line;
	$maizelanes{$line} = 1;
}
close IN;


print STDOUT "\n\nGreetings Panzean! Let's test the quality of Illumina GAII flow cell $flowcell via BLAST to the reference genome...\n\n";


# read in the pass_counts = n_reads per lane matching the bar code and cut site, no rev_adapt starting < 24 bases (read from a file for now -- will read from DB later?)
my %passcounts;   # key1 = lane, value = pass_count
my @input_line;
my ($lane, $passcount);
my (%platename,%readcount,%passfilter,%dimers,%n_DNAs,%cv_barcode_count);   # key = lane
my $passcount_file = $flowcell . "_QC1.txt";
open IN, "<$passcount_file" or die "ERROR: Cannot open pass_count file $passcount_file: $!\n\n";
<IN>;  # skip header line
while (<IN>) {
  s/\r$//;  # convert DOS to unix if necessary
  chomp;
  @input_line = split "\t";
  $lane = $input_line[0];
  if($maizelanes{$lane} eq "") { next; }  # Only consider lanes specified on command line
  ($platename{$lane},$passcounts{$lane},$readcount{$lane},$passfilter{$lane},$dimers{$lane},$n_DNAs{$lane},$cv_barcode_count{$lane}) = @input_line[1,2,3,5,7,8,11] unless $lane eq 'ALL';
}
close IN;

my $numofzealanes = (scalar keys %passcounts);
if($numofzealanes == 0)
{
	print STDOUT "No maize lanes to process - exiting...\n";
	goto NOZEALANES;
} 
else
{
	print STDOUT "The following maize lanes will be subject to BLAST test:\n";
	foreach $lane (sort {$a<=>$b} keys %passcounts)
	{
        	print STDOUT "$lane\n";
	}
	# exit;  # this is just for test
}


my $QC_stats_file = $flowcell . "_QC_stats.txt";
open RES, ">$QC_stats_file" or die "ERROR: cannot open QC_stats file $QC_stats_file: $!\n\n";
print RES join("\t", qw(Flowcell Machine Run Lane Plate nSamples Reads PassFilter Dimers BarcodedEtc %barcoded CV BlastSample %1+hits AvgHitLenAll AvgHitLenHits MedianHitLen EstGoodReads EstGoodReads/DNA)), "\n";


# go through each lane's fasta file, write a random sample of reads to a new fasta file, perform BLAST and write results
my $rand_num;
my %random_reads;  # key = rand_num, value = 1 (arbitrary)
foreach $lane (sort {$a<=>$b} keys %passcounts)  {
  print STDOUT "Sampling reads for BLASTing from lane $lane (1000 random reads)...\n";
  %random_reads = ();
  if ($passcounts{$lane} > 5000) {
    # get 1000 random reads (with good barcodes and cut sites) per lane
    for my $read (0 .. 999) {
      $rand_num = int(rand($passcounts{$lane})) + 1;
      redo if exists $random_reads{$rand_num};
      $random_reads{$rand_num} = 1;    
    }

    # write these reads to a fasta for blast
    my $in_fasta_file = $flowcell . "_" . $lane . "_QC.fasta";   # temp fasta file filtered for reads having bardcode and cut site, etc
    open IN, "<$in_fasta_file" or die "ERROR: Cannot open fasta file $in_fasta_file: $!\n\n"; 
    my $out_fasta_file = $flowcell . "_" . $lane . "_sample.fasta";  # fasta file containing the sampled reads to be blasted
    open OUT, ">$out_fasta_file" or die "ERROR: Cannot open output file \'$out_fasta_file\': $!\n\n";
    my ($readcount, $writecount) = (0, 0);
    my ($name,$read);
    while (<IN>) {
      $name = $_;
      $read = <IN>;
      ++$readcount;
      if (exists $random_reads{$readcount})  {
        print OUT $name;
        print OUT $read;
        ++$writecount;
      }
    }
    close IN;
    close OUT;
    print STDOUT "\t...$writecount reads were sampled from lane $lane for BLASTing out of $readcount reads with barcodes and cut sites\n\n";


    # perform the blast of the sample from this lane
    print STDOUT "\tNow performing BLAST for sample from lane $lane...\n";
    my $blast_results_file = $flowcell . '_' . $lane . '_sample_blastresults.txt';
#    open OUT, ">$blast_results_file" or die "ERROR: Cannot open output file \'$blast_results_file\': $!\n\n";
#    close OUT;

    my $commandline  = "$blastprog -p blastn -d $blastdb -i $out_fasta_file -e 0.00001 -K 1 -m 8 -o $blast_results_file";
    unless ( system ($commandline) ) {
      print STDOUT "\t...BLAST was sucessful\n\n";
    } else {
      die "\n\n***ERROR: the blastall command failed.  Program aborted.\n\n\n";
    }
    
  
    # parse the table of blast results
    print STDOUT "\tparsing top hits from BLAST results...\n";
    open IN, "<$blast_results_file" or die "\n\nERROR: cannot open blast results file $blast_results_file: $!\n\n\n";
    my $summary_file = $flowcell . '_' . $lane . "_blast_summary.txt";

    open OUT, ">$summary_file" or die "ERROR: Cannot open BLAST summary file \'$summary_file\': $!\n\n";
    print OUT join "\t", "Query", "Hit?", "Top_hit_align_len", "Top_hit_len", "%ident", "n_mismatches", "n_gap_openings", "e_value", "bit_score\n";

    my @hit_lens = ();
    my ($hitlen, $current_read, $ident, $align_len, $n_mismatch, $n_gap_openings, $q_start, $q_end, $hit_start, $hit_end, $e_val, $bit_score);
    my ($tot_n_hits, $prop_that_hit, $avg_hit_len, $median_hit_len) = (0, 0, 0, 0);
    my $prev_read = "nada";
    while (<IN>) {
      chomp;
      @input_line = split "\t";

      #            0         (1)           2          3                4            5             6       7         8       9       10       11 
      # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
      
      $current_read = $input_line[0];
      if ($current_read ne $prev_read) {
        ($ident, $align_len, $n_mismatch, $n_gap_openings, $q_start, $q_end, $hit_start, $hit_end, $e_val, $bit_score) = @input_line[2..11];
        ++$prop_that_hit;
        $hitlen = abs($hit_end - $hit_start) + 1;
        $avg_hit_len += $hitlen;
        push @hit_lens, $hitlen;
        print OUT join "\t", $current_read, 1, $align_len, $hitlen, $ident, $n_mismatch, $n_gap_openings, $e_val, "$bit_score\n";
        $prev_read = $current_read;
      }
      ++$tot_n_hits;
    }
    close IN;
    close OUT;
    print STDOUT "\t...finished parsing $tot_n_hits BLAST results for the $prop_that_hit top hits\n\n";


    # calculate and print the summary QC statistics
    my ($machine,$run,$fc,$ln,$tile,$x,$y,$sample,$pass_filt) = split ":", $current_read; 
    my $n_hits = $prop_that_hit;
    my $avg_hit_len_one_plus = $avg_hit_len/$n_hits;
    $avg_hit_len = $avg_hit_len / $writecount;
    $prop_that_hit = $prop_that_hit / $writecount;

    my $size = @hit_lens;
    print STDERR "\n\nWARNING: size of array hit_lens not equal to n_hits\n\n" unless $n_hits == $size;
    @hit_lens = sort {$a <=> $b} @hit_lens;
    $median_hit_len = $hit_lens[floor($n_hits/2)-1];

    print RES  join( "\t",
      $flowcell, $machine, $run, $lane, $platename{$lane}, $n_DNAs{$lane}, $readcount{$lane}, $passfilter{$lane}, $dimers{$lane}, 
      $passcounts{$lane}, sprintf("%.2f",100*$passcounts{$lane}/$readcount{$lane}).'%', $cv_barcode_count{$lane}, 
      $writecount, sprintf("%.2f",100*$prop_that_hit).'%', sprintf("%.2f", $avg_hit_len), sprintf("%.2f", $avg_hit_len_one_plus),
      $median_hit_len, sprintf("%.0f",$prop_that_hit*$passcounts{$lane}), sprintf("%.0f",$prop_that_hit*$passcounts{$lane}/$n_DNAs{$lane}) ), "\n";
  }
  else {
    print STDOUT "\t\tLane $lane skipped (not BLASTed) because fewer than 5000 good reads\n";
    print RES  join("\t",
      $flowcell, 'machine', 'run', $lane, $platename{$lane}, $n_DNAs{$lane}, $readcount{$lane}, $passfilter{$lane}, $dimers{$lane}, 
      $passcounts{$lane}, sprintf("%.2f",100*$passcounts{$lane}/$readcount{$lane}).'%', $cv_barcode_count{$lane}, 
      0, 'NA', 'NA', 'NA', 'NA', 0, 0), "\n";
  }
}
close RES;


print STDOUT "\n\nFinished!\n\n" 
           . "For results summary, see:\n\t$QC_stats_file\n";

NOZEALANES:

print STDOUT "\n\n\nFare thee well, Panzean!!\n\n\n";


#sub doblast {
#  my ($seqname,$seq) = @_[0,1];
#  my ($numhits,$hitlen,$nident,$ngaps,$fractident);
#
##  For some reason, these temp files don't seem to get deleted (cron job?  This is not what the perl documentation says)
##  my ( $seq_fh, $tmpqueryfilename )=tempfile("blastqueryXXXXXX", SUFFIX => '.tmp');
##  print $seq_fh $seqname, $seq;
##  close $seq_fh;
##  my ( $res_fh, $tmpblastresultfile )=tempfile("blastresultXXXXXX", SUFFIX => '.tmp');
#
#  my $tmpqueryfilename = 'blastquery.tmp';
#  open SEQ, ">$tmpqueryfilename";
#  print SEQ $seqname, $seq;
#  close SEQ;
# 
#  my $tmpblastresultfile = 'blastresult.tmp';
#  open RES, ">$tmpblastresultfile";
#  close RES;
#
#  my $commandline  = "blastall -p blastn -d ../count091102b.fas -i $tmpqueryfilename -e 0.00001 -K 1 -m 7 -o $tmpblastresultfile";
#  my $runblast = system ($commandline);
#  unless ( $runblast =~/^\[blastall\].*ERROR/) {
#    my $searchio = new Bio::SearchIO( -format => 'blastxml',
# 	                        			      -file   => $tmpblastresultfile );
#    
#    if ( my $result = $searchio->next_result ) {
#      $numhits = $result->num_hits;
#      if ( my $hit = $result->next_hit ) {
#        if (my $hsp = $hit->next_hsp) {
#          $hitlen = $hsp->hsp_length;   # Note: this is the length of the alignment INCLUDING gaps
#          $nident = $hsp->num_identical;
#          $ngaps = $hsp->gaps;
#          $fractident = $hsp->frac_identical;
#        }
#        else {
#          ($hitlen,$nident,$ngaps,$fractident) = (-9,-9,-9,-9);
#        }
#      }  
#      else {
#        ($hitlen,$nident,$ngaps,$fractident) = (0,0,0,0);
#      }
#    }
#    else {
#      ($numhits,$hitlen,$nident,$ngaps,$fractident) = (-99,-99,-99,-99,-99);
#    }
#
#  } else {
#    print STDERR "WARNING: No response from server for read $seqname\n\n";
#    ($numhits,$hitlen,$nident,$ngaps,$fractident) = (-999,-999,-999,-999,-999);
#  }
#  ($numhits,$hitlen,$nident,$ngaps,$fractident);
#}

get_lanes_for_enzyme.pl

#!/usr/bin/perl

# Read the key file ane produce a list of lanes treated with each enzyme

@enzymes =("ApeKI", "EcoT22I", "PstI");
%lanes = {};

# Parse the header line of the key file in search for indices corresponding
# to enzyme and flowcell ID. Just in case the order of columns ever changes....
$line=<>;
chomp $line;
@aux = split "\t", $line;
for($i=0;$i<=$#aux;$i++)
{
	if($aux[$i] eq "Lane") { $index_lane = $i; }
	if($aux[$i] eq "Enzyme") { $index_enzyme = $i; }
}

while($line=<>)
{
	chomp $line;
	@aux = split "\t", $line;
	$lanes{$aux[$index_enzyme]}{$aux[$index_lane]}++;
}

# print out the result...
foreach $enzyme (keys %lanes)
{
	if($lanes{$enzyme} eq "") { next; }
	print "$enzyme ";
	foreach $lane (sort {$a <=> $b} keys %{$lanes{$enzyme}})
	{
		if($lanes{$enzyme}{$lane} eq "") { next; }
		print "$lane ";
	}
	print "\n";
}

get_zea_lanes.pl

#!/usr/bin/perl

# Read the key file and produce a list of maize lanes 
# to be processed with QC2 (BLAST). Currently: genus zea and anzyme ApeKI.

%lanes = {};

# Parse the header line of the key file in search for indices corresponding
# to enzyme and flowcell ID. Just in case the order of columns ever changes....
$line=<>;
chomp $line;
@aux = split "\t", $line;
for($i=0;$i<=$#aux;$i++)
{
	if($aux[$i] eq "Lane") { $index_lane = $i; }
        if($aux[$i] eq "Genus") { $index_genus = $i; }
        if($aux[$i] eq "Enzyme") { $index_enzyme = $i; }
}

while($line=<>)
{
	chomp $line;
	@aux = split "\t", $line;
	if(lc($aux[$index_genus]) eq "zea" && $aux[$index_enzyme] eq "ApeKI") { $lanes{$aux[$index_lane]}++; }
}

# print out the result...
foreach $lane (keys %lanes)
{
	if($lanes{$lane} eq "") { next; }
	print "$lane\n";
}

parseAdapterDimers.pl

#!/usr/bin/perl -w
use strict;

my $flowcell;
if ($ARGV[0]) {
  $flowcell =  $ARGV[0];
}
else {
  die "\n\nUSAGE: parseAdapterDimers.pl FLOWCELL\n\n\n";
}

print STDOUT "\n\nGreetings Panzean! Let's parse the adapter dimer file output from GBS_qual_control1.pl for flowcell $flowcell...\n\n";

my $in_file = $flowcell . "_adapter_dimers.txt";
open IN, "<$in_file" or die "ERROR: Cannot open input file \'$in_file\': $!\n\n"; 
my $dimercount = 0;
my ($name, $read);
my ($machine, $run, $fc, $lane_num, $tile, $x, $y, $sample, $pass_filter);
my ($barcode, $dimer);
my %dimersByLaneBarcode;
my %samplesByLaneBarcode;
while (<IN>) {
	$name = $_;
	$name =~ s/^>//;
  ($machine, $run, $fc, $lane_num, $tile, $x, $y, $sample, $pass_filter) = split(":",$name);
	$read = <IN>;
	($barcode, $dimer) = split("__",$read);
	++$dimersByLaneBarcode{$lane_num}{$barcode};
	$samplesByLaneBarcode{$lane_num}{$barcode} = $sample;
	++$dimercount;
}
close IN;

my $out_file = $flowcell . "_adapter_dimers_parsed.txt";
open OUT, ">$out_file" or die "ERROR: Cannot open output file \'$out_file\': $!\n\n";
print OUT join("\t", qw[Flowcell Lane Barcode Sample nAdapterDimers]), "\n";
my $lanecount = 0;
foreach my $lane (sort {$a<=>$b} keys %dimersByLaneBarcode) {
	++$lanecount;
	foreach my $bc (sort keys %{ $dimersByLaneBarcode{$lane} }) {
		print OUT join("\t", $fc, $lane, $bc, $samplesByLaneBarcode{$lane}{$bc}, $dimersByLaneBarcode{$lane}{$bc}), "\n";
	}
}
close OUT;

print STDOUT "\t...$dimercount dimers were found in the adapter dimer file, which covered $lanecount lanes\n\n";
print STDOUT "\n\nFinished!\n\n" 
           . "For the parsed results, see:\n\t$out_file\n";
print STDOUT "\n\n\nFare thee well, Panzean!!\n\n\n";

QC.cfg

FLOWCELLROOT=/data1/maizediv/illumina/GBSFlowCells/
QUALROOT=/data1/bukowski/qual_control_tmp/
BINROOT=/data1/bukowski/panzea/panzea_filemgm/
BLASTDB=/data1/maizediv/illumina/qual_control/QC2_blast_db/Zea_ApeKI_GBS_QC_db_091102.fas
BLASTPROG=/data1/bukowski/panzea/blast-2.2.24/bin/blastall

run_QC.pl

#!/usr/bin/perl

use Cwd 'abs_path'; 
use File::Basename;
my $bindirectory = dirname(abs_path($0)) . "/";
require $bindirectory . "functions.pl";

# Given the flowcell name, reads format, flag on whether the reads are in a subdirectory under flowcell dir,
#  and assuming the key file present
# in the flowcell directory, run the whole QC pipeline....

# Look for the configuration file called QC.cfg located in the directory where the script
# is called from...
my %cfgparams = read_config_params();

my $flowcell = $ARGV[0];
my $format = $ARGV[1];   # fasq or qseq
my $insubdir = $ARGV[2]; # 1 if files are in subdir under flowcell dir, 0 otherwise

# Read off the parameters.....
my $flowcellroot = $cfgparams{"FLOWCELLROOT"};
my $qualroot = $cfgparams{"QUALROOT"};
my $binroot = $cfgparams{"BINROOT"};

my $key_file = $flowcellroot . $flowcell . "/" . $flowcell . "_key.txt";
my $datadir = $flowcellroot . $flowcell;
my $qualdir = $qualroot . $flowcell;

# Create qualdir if it does not yet exist
unless( -d $qualdir)  
{
	mkdir $qualdir or die "Cannot create directory $qualdir\n";
}


# generate enzyme-lane mapping file (all_lanes) and
# maize lane list file (maize_lanes digested with ApeKI - subject to QC2)
chdir($qualdir);
system($binroot . "get_lanes_for_enzyme.pl < $key_file > all_lanes");
system($binroot . "get_zea_lanes.pl < $key_file > maize_lanes");

# Make enzyme directories and run the pipelines
my @enzymes = `awk '{print \$1}' all_lanes`;
foreach $enz (@enzymes)
{
	chdir($qualdir);
	chomp $enz;
	mkdir $enz;
	$cmd = $binroot ."GBS_qual_control1.pl $flowcell $format gz $enz ./all_lanes ./maize_lanes $insubdir >& QC1_$enz.log";
	system($cmd);
	chdir($enz);
	$cmd = $binroot ."parseAdapterDimers.pl $flowcell >> ../QC1_$enz.log 2>&1";
	system($cmd);
	$cmd = $binroot ."GBS_qual_control2.pl $flowcell ../maize_lanes >& ../QC2_$enz.log";
	system($cmd);
	system("rm ${flowcell}_*_QC.fasta");
}
chdir($qualdir);

Page tree

QC Scripts

functions.pl

GBS_qual_control1.pl

GBS_qual_control2.pl

get_lanes_for_enzyme.pl

get_zea_lanes.pl

parseAdapterDimers.pl

QC.cfg

run_QC.pl