functions.pl Code Block |
---|
# General-purpose functions called by other scripts
# Return a hash of config parameters from the file QC.cfg
# in the format
#
# PARAMETER=value
#
#
sub read_config_params()
{
my $bindir = dirname(abs_path($0)) . "/";
my $cfgfile = $bindir . "QC.cfg";
my %cfgparams;
my $incfg;
open($incfg,$cfgfile) or die "Cannot open $cfgfile\n";
while($line=<$incfg>)
{
chomp $line;
($key,$value) = split "=",$line;
$cfgparams{$key} = $value;
}
close $incfg;
return %cfgparams;
}
1;
|
GBS_qual_control1.pl Code Block |
---|
#!/usr/bin/perl -w
use strict;
use Cwd 'abs_path';
use File::Basename;
my $bindirectory = dirname(abs_path($0)) . "/";
require $bindirectory . "functions.pl";
# Get machine-specific configuration parameters
my %cfgparams = read_config_params();
my ($flowcell, $qseq, $gz, $enzyme, $lane_file, $zea_lane_file, $s_lane, $insubdir, $prefix);
my $usage = 'GBS_qual_control1.pl FLOWCELL qseq|fastq txt|gz ApeKI|PstI|EcoT22I lane_file maize_lane_file insubdir';
if ($ARGV[0] && $ARGV[1] && $ARGV[2] && $ARGV[3] && $ARGV[4] && $ARGV[5]) {
$flowcell = $ARGV[0];
$lane_file = $ARGV[4]; # file with one space-delimited line per (present) enzyme, e.g., ApeKI 1 4 6
$zea_lane_file = $ARGV[5];# one maize lane per line
$insubdir = $ARGV[6];
if ($ARGV[1] eq "qseq") {
$qseq = 1;
} elsif ($ARGV[1] eq "fastq") {
$qseq = 0;
} else {
die "USAGE: $usage\n\n\n";
}
if ($ARGV[2] eq "gz") {
$gz = 1;
} elsif ($ARGV[2] eq "txt") {
$gz = 0;
} else {
die "USAGE: $usage\n\n\n";
}
if ($ARGV[3] eq "ApeKI") {
$enzyme = "ApeKI";
} elsif ($ARGV[3] eq "PstI") {
$enzyme = "PstI";
} elsif ($ARGV[3] eq "EcoT22I") {
$enzyme = "EcoT22I";
} else {
die "Unknown enzyme: $ARGV[3]\n\nUSAGE: $usage\n\n\n";
}
} else {
die "USAGE: $usage\n\n\n";
}
# grab the lanes for this enzyme
my ($txtline, $junk);
my @Lanes = ();
open IN, "<$lane_file" or die "ERROR: Cannot open lanes file $lane_file: $!\n\n";
while($txtline = <IN>)
{
chomp $txtline;
if($txtline =~ m/$enzyme/)
{
($junk, @Lanes) = split " ", $txtline;
}
}
close IN;
if($#Lanes < 0)
{
print STDOUT "No lanes to process for this enzyme...\n";
exit;
}
# Grab the maize lanes
my %maizelanes = (1 => "", 2 => "", 3 => "", 4 => "", 5 => "", 6 => "", 7 => "", 8 => "", "ALL" => "");
open IN, "<$zea_lane_file" or die "ERROR: Cannot open lanes file $zea_lane_file: $!\n\n";
while($txtline = <IN>)
{
chomp $txtline;
$maizelanes{$txtline} = 1;
}
close IN;
# In the future, do some consistency checking on the lanes provided - should be all different and between 1 and 8
#my $key_file = "/usr/local/maizediv/illumina/qual_control/" . $flowcell . "/" . $flowcell . "_key.txt";
#my $key_file = "/cbsufsrv4/data1/maizediv/illumina/PerkinElmer/" . $flowcell . "/" . $flowcell . "_key.txt";
# RB change
my $flowcellroot = $cfgparams{"FLOWCELLROOT"};
my $key_file = $flowcellroot . $flowcell . "/" . $flowcell . "_key.txt";
# Data Input files - take them from lane list on command line
my %infiles; # key = lane, value = filename
#$infiles{1} = 'D09FYACXX_1_fastq.gz';
#$infiles{2} = 'B064FABXX_2_qseq.txt.gz';
#$infiles{3} = 'B064FABXX_3_qseq.txt.gz';
#$infiles{4} = 'B064FABXX_4_qseq.txt.gz';
#$infiles{5} = 'B064FABXX_5_qseq.txt.gz';
#$infiles{6} = 'B064FABXX_6_qseq.txt.gz';
#$infiles{7} = 'D09FYACXX_7_fastq.gz';
#$infiles{8} = '10247084_C00R8ABXX_s_8_qseq.txt.gz';
my $subdirstr = "";
if($insubdir)
{
$subdirstr = "fastq/";
if($qseq) { $subdirstr = "qseq/"; }
}
my $flestr;
my $cmd;
foreach (@Lanes)
{
$cmd = "ls -1 " . $flowcellroot . $flowcell . "/" . $subdirstr . "*$flowcell" . "*_$_" . "_*.gz";
$flestr = `$cmd`;
chomp $flestr;
$infiles{$_} = $flestr;
}
# my $output_path = '/usr/local/maizediv/illumina/qual_control/' . $flowcell;
# my $output_path = '/cbsufsrv4/data1/maizediv/illumina/qual_control/' . $flowcell . '/';
my $output_path = "./$ARGV[3]/"; # call it after enzyme
#$output_path = $qseq ? $output_path . '/prefilter/' : $output_path . '/postfilter/'; # no longer relevant with Casava 1.8
print STDOUT "\n\nGreetings Panzean! Let's test the quality of Illumina flow cell $flowcell by checking for barcodes & cut sites...\n\n";
print STDOUT "The following files will be processed:\n\n";
foreach my $key (keys %infiles)
{
if($infiles{$key} eq "") { next; }
print STDOUT "Lane $key: $infiles{$key}";
if($maizelanes{$key} ne "")
{
print STDOUT " (maize lane) ";
}
else
{
print STDOUT " (non-maize lane) ";
}
print STDOUT "\n";
}
print STDOUT "\n";
# exit; # just for test
my ($cut_site_remnant, $cut_site_remnant1, $cut_site_remnant2, $common_adapter, $common_adapter1, $common_adapter2);
if ($enzyme eq "ApeKI") {
$cut_site_remnant = 'C[AT]GC';
$cut_site_remnant1 = 'CAGC';
$cut_site_remnant2 = 'CTGC';
$common_adapter = 'C[AT]GAGATCGGAAGAGCGGTTCAGCAGG'; # first 27 bases
$common_adapter1 = 'CAGAGATCGGAAGAGCGGTTCAGCAGG'; # first 27 bases
$common_adapter2 = 'CTGAGATCGGAAGAGCGGTTCAGCAGG'; # first 27 bases
# AGATCGGAAGAGCGGTTCAGCAGG = common adapter minus overhang
} elsif ($enzyme eq "PstI") {
$cut_site_remnant = 'TGCAG';
$cut_site_remnant1 = 'TGCAG';
$cut_site_remnant2 = 'none';
$common_adapter = 'TGCAAGATCGGAAGAGCGGTTCAGCAGG'; # first 28 bases
$common_adapter1 = 'TGCAAGATCGGAAGAGCGGTTCAGCAGG'; # first 28 bases
# AGATCGGAAGAGCGGTTCAGCAGG = common adapter minus overhang
$common_adapter2 = 'none'; # first 27 bases
} elsif ($enzyme eq "EcoT22I") {
$cut_site_remnant = 'TGCAT';
$cut_site_remnant1 = 'TGCAT';
$cut_site_remnant2 = 'none';
$common_adapter = 'TGCAAGATCGGAAGAGCGGTTCAGCAGG'; # first 28 bases
$common_adapter1 = 'TGCAAGATCGGAAGAGCGGTTCAGCAGG'; # first 28 bases
# AGATCGGAAGAGCGGTTCAGCAGG = common adapter minus overhang
$common_adapter2 = 'none'; # first 27 bases
}
# read in the barcodes (read from a file for now -- will read from DB later?)
my %barcodes; # key1 = lane, key2 = barcode, value = sample
my %barcode_platemap; # key1 = lane, key2 = barcode, value = plate__row__column
my %barcode_counts; # key1 = lane, key2 = barcode, value = count
my %barcode_n; # key = lane, value = n_barcodes in that lane that are not "empty" or "blank" (case insensitive)
my %plates_by_lane; # key = lane, value = plate
my ($tot_n_barcodes, $sum_sqrs, $sum_sqrs_lane, $mean, $sd, $coeff_var);
my @input_line;
open IN, "<$key_file" or die "ERROR: Cannot open multiplex key file $key_file: $!\n\n";
<IN>; # skip header line
while (<IN>) {
s/\r$//; # convert DOS to unix if necessary
chomp;
@input_line = split "\t";
my ($fcell, $lane, $barcode, $sample, $plate, $row, $col) = @input_line[0,1,2,3,4,5,6];
die "ERROR: Flow cell mismatch in barcode key file: $fcell (expect $flowcell)\n\n" if $fcell ne $flowcell;
if (exists $infiles{$lane} ) { # only work with the lanes that are in the infile list
$barcodes{$lane}{$barcode} = $sample;
$barcode_platemap{$lane}{$barcode} = join "\t", $plate, $row, $col;
$plates_by_lane{$lane} = $plate;
$barcode_counts{$lane}{$barcode} = 0;
unless ($sample =~ /empty/i || $sample =~ /blank/i) {
++$barcode_n{$lane};
++$tot_n_barcodes;
}
}
}
print STDOUT "\nFinished reading barcodes for $tot_n_barcodes non-blank samples from the key file...\n";
close IN;
# Output files:
my $outfile; # a fasta file (one per lane) containing each trimmed read (barcode removed) that passes our criteria (barcode & cut site, no Ns in 64 bp, no adapter dimers, etc)
# - these LARGE files will be sampled from for BLAST to a set of reference reads in part 2 = nextgen_qual_control2.pl
# - the files should be deleted thereafter to save disk space
my $rptfile = $output_path . $flowcell . "_QC1.txt"; # pass_counts by lane
open RPT, ">$rptfile" or die "ERROR: Cannot open output file \'$rptfile\': $!\n\n";
print RPT join("\t", "Lane",
"Plate",
"Pass_Count",
"Read_Count",
"Pass_Rate",
"Pass_Filter_Count",
"Pass_Filter_Rate",
"Adapter_dimers",
"n_samples",
"Mean_count",
"SD_count",
"CV_count_(%)"),
"\n";
my $rptfile2 = $output_path . $flowcell . "_QC2.txt"; # pass_counts by barcode within lane
my $dimer_file = $output_path . $flowcell . "_adapter_dimers.txt";
open DIMERS, ">$dimer_file" or die "ERROR: Cannot open reverse adapter file \'$dimer_file\': $!\n\n";
my $lane;
my @line;
my $N_char;
my ($barcode, $seqlen, $dimer);
my ($machine,$run,$fc,$lane_num,$tile,$x,$y,$seq,$qual,$pair_member,$filtered,$cntrl,$pass_filter);
my $barcode_matched; # boolean
my ($read_count, $pass_count, $pass_filter_count, $n_short_seqs, $n_dimers) = (0, 0, 0, 0, 0, 0);
my ($pass_count_lane, $read_count_lane, $pass_filter_count_lane, $pass_filter_rate_lane, $pass_filter_rate, $n_short_seqs_lane, $n_dimers_lane);
foreach $lane (sort {$a <=> $b} keys %infiles) {
$outfile = $output_path . $flowcell . "_" . $lane . "_QC.fasta";
if($maizelanes{$lane} ne "")
{
open OUT, ">$outfile" or die "ERROR: Cannot open output file \'$outfile\': $!\n\n";
}
else
{
print STDOUT "Lane $lane is not maize - no fasta file will be created.\n";
}
($pass_count_lane, $read_count_lane, $pass_filter_count_lane, $n_short_seqs_lane, $n_dimers_lane) = (0, 0, 0, 0, 0, 0);
if ($gz) {
open IN, "gunzip -c $infiles{$lane} |" or die "ERROR: Cannot open compressed (*.gz) input file: $!\n\n\n";
}
else {
open IN, "<$infiles{$lane}" or die "ERROR: Cannot open input file: $!\n\n\n";
}
print STDOUT "\n\nReading data from the input file: $infiles{$lane}...\n";
while (<IN>) {
$barcode_matched = 0;
chomp;
if ($qseq) {
$N_char = '.';
if (/^@/) {
die "\n\n***ERROR: looks like a fastq file (not qseq)***\n\n";
}
@line = split "\t";
($machine,$run,$lane_num,$tile,$x,$y,$seq,$qual,$pass_filter) = @line[0,1,2,3,4,5,8,9,10];
$pass_filter_count += $pass_filter;
$pass_filter_count_lane += $pass_filter;
}
else { # fastq
$N_char = 'N';
my $line1 = $_;
# unless ($line1 =~ s<^@(\S+)#0/1$><$1>) # old style fastq
unless ($line1 =~ s<^@(\S+)\s(\S+)$><$1:$2>) # Cassava 1.8 fastq. Replace space with colon
{
die "\n\n***ERROR: looks like a qseq file (not fastq)***\n\n";
}
($machine,$run,$fc,$lane_num,$tile,$x,$y,$pair_member,$filtered,$cntrl) = split ":", $line1;
warn "WARNING: Flowcell ($fc) does not match expected ($flowcell)\n" if $fc ne $flowcell;
$pass_filter = $filtered eq 'N'? 1 : 0; # N means "not filtered" which means that it passed filter
$pass_filter_count += $pass_filter;
$pass_filter_count_lane += $pass_filter;
chomp($seq = <IN>);
my $line3 = <IN>;
chomp($qual = <IN>);
}
my $cutsite_start = index($seq, $cut_site_remnant1);
if ($cutsite_start > 3 && $cutsite_start < 11) {
$barcode = substr($seq, 0, $cutsite_start);
if (exists $barcodes{$lane}{$barcode}) {
$barcode_matched = 1;
$seq =~ s/^$barcode//; # trim the barcode
}
}
else {
$cutsite_start = index($seq, $cut_site_remnant2);
if ($cutsite_start > 3 && $cutsite_start < 11) {
$barcode = substr($seq, 0, $cutsite_start);
if (exists $barcodes{$lane}{$barcode}) {
$barcode_matched = 1;
$seq =~ s/^$barcode//; # trim the barcode
}
}
else { # check for adapter/adapter dimers
my $common_adapter_start = index($seq, $common_adapter1);
if ($common_adapter_start > 3 && $common_adapter_start < 11) {
$barcode = substr($seq, 0, $common_adapter_start);
if (exists $barcodes{$lane}{$barcode}) {
++$n_dimers_lane;
++$n_dimers;
$seq =~ s/^$barcode//; # trim the barcode
print DIMERS ">", join(":", $machine, $run, $flowcell, $lane_num, $tile, $x, $y, $barcodes{$lane}{$barcode}, $pass_filter), "\n";
print DIMERS $barcode, "__", $seq, "\n";
}
}
else {
$common_adapter_start = index($seq, $common_adapter2);
if ($common_adapter_start > 3 && $common_adapter_start < 11) {
$barcode = substr($seq, 0, $common_adapter_start);
if (exists $barcodes{$lane}{$barcode}) {
++$n_dimers_lane;
++$n_dimers;
$seq =~ s/^$barcode//; # trim the barcode
print DIMERS ">", join(":", $machine, $run, $flowcell, $lane_num, $tile, $x, $y, $barcodes{$lane}{$barcode}, $pass_filter), "\n";
print DIMERS $barcode, "__", $seq, "\n";
}
}
}
}
}
if ($barcode_matched) {
if (index($seq,$N_char) == -1 || index($seq,$N_char) > 63) { # sequences with N's don't pass
++$barcode_counts{$lane}{$barcode};
unless ($barcodes{$lane}{$barcode} =~ /empty/i || $barcodes{$lane}{$barcode} =~ /blank/i) {
if($maizelanes{$lane} ne "")
{
print OUT ">", join(":", $machine, $run, $flowcell, $lane_num, $tile, $x, $y, $barcodes{$lane}{$barcode}, $pass_filter), "\n";
print OUT "$seq\n";
}
++$pass_count;
++$pass_count_lane;
}
}
}
++$read_count;
++$read_count_lane;
}
$sum_sqrs_lane = 0;
foreach my $brcd (keys %{ $barcode_counts{$lane} } ) {
unless ($barcodes{$lane}{$brcd} =~ /empty/i || $barcodes{$lane}{$brcd} =~ /blank/i) {
$sum_sqrs_lane += $barcode_counts{$lane}{$brcd}**2;
$sum_sqrs += $barcode_counts{$lane}{$brcd}**2;
}
}
close IN;
if($maizelanes{$lane} ne "") { close OUT; }
$mean = $pass_count_lane/$barcode_n{$lane};
$sd = sqrt( ($sum_sqrs_lane/$barcode_n{$lane})-($mean**2) );
$coeff_var = $mean ? sprintf("%.2f",100*$sd/$mean) : 0;
# if ($qseq) { #### Cassava 1.8 fastq now contains both unfiltered (passed filter) and filtered reads
$pass_filter_rate_lane = $read_count_lane ? sprintf("%.4f", $pass_filter_count_lane/$read_count_lane) : 0;
# }
# else {
# $pass_filter_rate_lane = "NA";
# }
my $pass_rate_lane = $read_count_lane ? $pass_count_lane/$read_count_lane : 0;
print RPT join "\t", $lane, $plates_by_lane{$lane}, $pass_count_lane, $read_count_lane, sprintf("%.4f", $pass_rate_lane),
$pass_filter_count_lane, $pass_filter_rate_lane, $n_dimers_lane, $barcode_n{$lane},
sprintf("%.0f", $mean),
sprintf("%.0f", $sd),
$coeff_var;
print RPT "%\n";
print STDOUT "...finished processing $ARGV[1] file $infiles{$lane} for plate $plates_by_lane{$lane}\n"
. "pass filter count: $pass_filter_count_lane\n"
. "adapter dimers: $n_dimers_lane\n"
. "barcode_CV: $coeff_var%\n"
. "pass count: $pass_count_lane\n"
. "read count: $read_count_lane\n";
}
$mean = $pass_count/$tot_n_barcodes;
$sd = sqrt( ($sum_sqrs/$tot_n_barcodes) - ($mean**2) );
$coeff_var = $mean ? sprintf("%.2f",100*$sd/$mean) : 0;
# if ($qseq) { #### Cassava 1.8 fastq now contains both unfiltered (passed filter) and filtered reads
$pass_filter_rate = $read_count ? sprintf("%.4f", $pass_filter_count/$read_count) : 0;
#}
#else {
# $pass_filter_rate = "NA";
#}
my $pass_rate = $read_count ? $pass_count/$read_count : 0;
print RPT join "\t", 'ALL', 'ALL', $pass_count, $read_count, sprintf("%.4f", $pass_rate),
$pass_filter_count, $pass_filter_rate, $n_dimers, $tot_n_barcodes,
sprintf("%.0f", $mean),
sprintf("%.0f", $sd),
$coeff_var;
print RPT "%\n";
close RPT;
open RPT, ">$rptfile2" or die "ERROR: Cannot open output file \'$rptfile2\': $!\n\n";
print RPT join("\t", "Lane_Barcode", "Lane", "Barcode", "Sample", "Pass_Count", "Plate", "Row", "Col"), "\n";
foreach my $ln (sort {$a<=>$b} keys %barcode_counts) {
foreach my $brcd (sort keys %{ $barcode_counts{$ln} } ) {
print RPT $ln, "_", $brcd, "\t", join("\t", $ln, $brcd, $barcodes{$ln}{$brcd}, $barcode_counts{$ln}{$brcd}, $barcode_platemap{$ln}{$brcd}), "\n";
}
}
print STDOUT "\n\nFinished!\n\n"
. " In total, $pass_filter_count reads passed filter\n"
. " and $n_dimers adaptor dimers were detected\n"
. " and $pass_count reads matched the barcode and cut site and had no N's within the 64 base read\n"
. " out of $read_count reads examined (in $ARGV[1] format)\n\nSee the following files for output:\n"
. "\t$outfile\n"
. "\t$rptfile\n"
. "\t$rptfile2\n"
. "\t$dimer_file\n";
print STDOUT "\n\n\nFare thee well, Panzean!!\n\n\n";
|
GBS_qual_control2.pl Code Block |
---|
#!/usr/bin/perl -w
use strict;
#use Bio::SearchIO;
use POSIX qw(floor);
# use File::Temp "tempfile";
use Cwd 'abs_path';
use File::Basename;
my $bindirectory = dirname(abs_path($0)) . "/";
require $bindirectory . "functions.pl";
# Look for the configuration file called QC.cfg located in the directory where the script
# is called from...
my %cfgparams = read_config_params();
my $blastdb = $cfgparams{"BLASTDB"};
my $blastprog = $cfgparams{"BLASTPROG"};
# This will hold zea lanes specified in $maize_lanes_list
my %maizelanes = (1 => "", 2 => "", 3 => "", 4 => "", 5 => "", 6 => "", 7 => "", 8 => "", "ALL" => "");
my $flowcell;
my $maize_lanes_list;
if ($#ARGV == 1) {
$flowcell = $ARGV[0];
$maize_lanes_list = $ARGV[1];
}
else {
die "\n\nUSAGE: GBS_qual_control2.pl FLOWCELL maize_lanes_list\n\n\n";
}
# read in the maize lanes (one lane per line)
open IN, "<$maize_lanes_list" or die "ERROR: Cannot open maize lanes file $maize_lanes_list: $!\n\n";
my $line;
while($line=<IN>)
{
chomp $line;
$maizelanes{$line} = 1;
}
close IN;
print STDOUT "\n\nGreetings Panzean! Let's test the quality of Illumina GAII flow cell $flowcell via BLAST to the reference genome...\n\n";
# read in the pass_counts = n_reads per lane matching the bar code and cut site, no rev_adapt starting < 24 bases (read from a file for now -- will read from DB later?)
my %passcounts; # key1 = lane, value = pass_count
my @input_line;
my ($lane, $passcount);
my (%platename,%readcount,%passfilter,%dimers,%n_DNAs,%cv_barcode_count); # key = lane
my $passcount_file = $flowcell . "_QC1.txt";
open IN, "<$passcount_file" or die "ERROR: Cannot open pass_count file $passcount_file: $!\n\n";
<IN>; # skip header line
while (<IN>) {
s/\r$//; # convert DOS to unix if necessary
chomp;
@input_line = split "\t";
$lane = $input_line[0];
if($maizelanes{$lane} eq "") { next; } # Only consider lanes specified on command line
($platename{$lane},$passcounts{$lane},$readcount{$lane},$passfilter{$lane},$dimers{$lane},$n_DNAs{$lane},$cv_barcode_count{$lane}) = @input_line[1,2,3,5,7,8,11] unless $lane eq 'ALL';
}
close IN;
my $numofzealanes = (scalar keys %passcounts);
if($numofzealanes == 0)
{
print STDOUT "No maize lanes to process - exiting...\n";
goto NOZEALANES;
}
else
{
print STDOUT "The following maize lanes will be subject to BLAST test:\n";
foreach $lane (sort {$a<=>$b} keys %passcounts)
{
print STDOUT "$lane\n";
}
# exit; # this is just for test
}
my $QC_stats_file = $flowcell . "_QC_stats.txt";
open RES, ">$QC_stats_file" or die "ERROR: cannot open QC_stats file $QC_stats_file: $!\n\n";
print RES join("\t", qw(Flowcell Machine Run Lane Plate nSamples Reads PassFilter Dimers BarcodedEtc %barcoded CV BlastSample %1+hits AvgHitLenAll AvgHitLenHits MedianHitLen EstGoodReads EstGoodReads/DNA)), "\n";
# go through each lane's fasta file, write a random sample of reads to a new fasta file, perform BLAST and write results
my $rand_num;
my %random_reads; # key = rand_num, value = 1 (arbitrary)
foreach $lane (sort {$a<=>$b} keys %passcounts) {
print STDOUT "Sampling reads for BLASTing from lane $lane (1000 random reads)...\n";
%random_reads = ();
if ($passcounts{$lane} > 5000) {
# get 1000 random reads (with good barcodes and cut sites) per lane
for my $read (0 .. 999) {
$rand_num = int(rand($passcounts{$lane})) + 1;
redo if exists $random_reads{$rand_num};
$random_reads{$rand_num} = 1;
}
# write these reads to a fasta for blast
my $in_fasta_file = $flowcell . "_" . $lane . "_QC.fasta"; # temp fasta file filtered for reads having bardcode and cut site, etc
open IN, "<$in_fasta_file" or die "ERROR: Cannot open fasta file $in_fasta_file: $!\n\n";
my $out_fasta_file = $flowcell . "_" . $lane . "_sample.fasta"; # fasta file containing the sampled reads to be blasted
open OUT, ">$out_fasta_file" or die "ERROR: Cannot open output file \'$out_fasta_file\': $!\n\n";
my ($readcount, $writecount) = (0, 0);
my ($name,$read);
while (<IN>) {
$name = $_;
$read = <IN>;
++$readcount;
if (exists $random_reads{$readcount}) {
print OUT $name;
print OUT $read;
++$writecount;
}
}
close IN;
close OUT;
print STDOUT "\t...$writecount reads were sampled from lane $lane for BLASTing out of $readcount reads with barcodes and cut sites\n\n";
# perform the blast of the sample from this lane
print STDOUT "\tNow performing BLAST for sample from lane $lane...\n";
my $blast_results_file = $flowcell . '_' . $lane . '_sample_blastresults.txt';
# open OUT, ">$blast_results_file" or die "ERROR: Cannot open output file \'$blast_results_file\': $!\n\n";
# close OUT;
my $commandline = "$blastprog -p blastn -d $blastdb -i $out_fasta_file -e 0.00001 -K 1 -m 8 -o $blast_results_file";
unless ( system ($commandline) ) {
print STDOUT "\t...BLAST was sucessful\n\n";
} else {
die "\n\n***ERROR: the blastall command failed. Program aborted.\n\n\n";
}
# parse the table of blast results
print STDOUT "\tparsing top hits from BLAST results...\n";
open IN, "<$blast_results_file" or die "\n\nERROR: cannot open blast results file $blast_results_file: $!\n\n\n";
my $summary_file = $flowcell . '_' . $lane . "_blast_summary.txt";
open OUT, ">$summary_file" or die "ERROR: Cannot open BLAST summary file \'$summary_file\': $!\n\n";
print OUT join "\t", "Query", "Hit?", "Top_hit_align_len", "Top_hit_len", "%ident", "n_mismatches", "n_gap_openings", "e_value", "bit_score\n";
my @hit_lens = ();
my ($hitlen, $current_read, $ident, $align_len, $n_mismatch, $n_gap_openings, $q_start, $q_end, $hit_start, $hit_end, $e_val, $bit_score);
my ($tot_n_hits, $prop_that_hit, $avg_hit_len, $median_hit_len) = (0, 0, 0, 0);
my $prev_read = "nada";
while (<IN>) {
chomp;
@input_line = split "\t";
# 0 (1) 2 3 4 5 6 7 8 9 10 11
# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score
$current_read = $input_line[0];
if ($current_read ne $prev_read) {
($ident, $align_len, $n_mismatch, $n_gap_openings, $q_start, $q_end, $hit_start, $hit_end, $e_val, $bit_score) = @input_line[2..11];
++$prop_that_hit;
$hitlen = abs($hit_end - $hit_start) + 1;
$avg_hit_len += $hitlen;
push @hit_lens, $hitlen;
print OUT join "\t", $current_read, 1, $align_len, $hitlen, $ident, $n_mismatch, $n_gap_openings, $e_val, "$bit_score\n";
$prev_read = $current_read;
}
++$tot_n_hits;
}
close IN;
close OUT;
print STDOUT "\t...finished parsing $tot_n_hits BLAST results for the $prop_that_hit top hits\n\n";
# calculate and print the summary QC statistics
my ($machine,$run,$fc,$ln,$tile,$x,$y,$sample,$pass_filt) = split ":", $current_read;
my $n_hits = $prop_that_hit;
my $avg_hit_len_one_plus = $avg_hit_len/$n_hits;
$avg_hit_len = $avg_hit_len / $writecount;
$prop_that_hit = $prop_that_hit / $writecount;
my $size = @hit_lens;
print STDERR "\n\nWARNING: size of array hit_lens not equal to n_hits\n\n" unless $n_hits == $size;
@hit_lens = sort {$a <=> $b} @hit_lens;
$median_hit_len = $hit_lens[floor($n_hits/2)-1];
print RES join( "\t",
$flowcell, $machine, $run, $lane, $platename{$lane}, $n_DNAs{$lane}, $readcount{$lane}, $passfilter{$lane}, $dimers{$lane},
$passcounts{$lane}, sprintf("%.2f",100*$passcounts{$lane}/$readcount{$lane}).'%', $cv_barcode_count{$lane},
$writecount, sprintf("%.2f",100*$prop_that_hit).'%', sprintf("%.2f", $avg_hit_len), sprintf("%.2f", $avg_hit_len_one_plus),
$median_hit_len, sprintf("%.0f",$prop_that_hit*$passcounts{$lane}), sprintf("%.0f",$prop_that_hit*$passcounts{$lane}/$n_DNAs{$lane}) ), "\n";
}
else {
print STDOUT "\t\tLane $lane skipped (not BLASTed) because fewer than 5000 good reads\n";
print RES join("\t",
$flowcell, 'machine', 'run', $lane, $platename{$lane}, $n_DNAs{$lane}, $readcount{$lane}, $passfilter{$lane}, $dimers{$lane},
$passcounts{$lane}, sprintf("%.2f",100*$passcounts{$lane}/$readcount{$lane}).'%', $cv_barcode_count{$lane},
0, 'NA', 'NA', 'NA', 'NA', 0, 0), "\n";
}
}
close RES;
print STDOUT "\n\nFinished!\n\n"
. "For results summary, see:\n\t$QC_stats_file\n";
NOZEALANES:
print STDOUT "\n\n\nFare thee well, Panzean!!\n\n\n";
#sub doblast {
# my ($seqname,$seq) = @_[0,1];
# my ($numhits,$hitlen,$nident,$ngaps,$fractident);
#
## For some reason, these temp files don't seem to get deleted (cron job? This is not what the perl documentation says)
## my ( $seq_fh, $tmpqueryfilename )=tempfile("blastqueryXXXXXX", SUFFIX => '.tmp');
## print $seq_fh $seqname, $seq;
## close $seq_fh;
## my ( $res_fh, $tmpblastresultfile )=tempfile("blastresultXXXXXX", SUFFIX => '.tmp');
#
# my $tmpqueryfilename = 'blastquery.tmp';
# open SEQ, ">$tmpqueryfilename";
# print SEQ $seqname, $seq;
# close SEQ;
#
# my $tmpblastresultfile = 'blastresult.tmp';
# open RES, ">$tmpblastresultfile";
# close RES;
#
# my $commandline = "blastall -p blastn -d ../count091102b.fas -i $tmpqueryfilename -e 0.00001 -K 1 -m 7 -o $tmpblastresultfile";
# my $runblast = system ($commandline);
# unless ( $runblast =~/^\[blastall\].*ERROR/) {
# my $searchio = new Bio::SearchIO( -format => 'blastxml',
# -file => $tmpblastresultfile );
#
# if ( my $result = $searchio->next_result ) {
# $numhits = $result->num_hits;
# if ( my $hit = $result->next_hit ) {
# if (my $hsp = $hit->next_hsp) {
# $hitlen = $hsp->hsp_length; # Note: this is the length of the alignment INCLUDING gaps
# $nident = $hsp->num_identical;
# $ngaps = $hsp->gaps;
# $fractident = $hsp->frac_identical;
# }
# else {
# ($hitlen,$nident,$ngaps,$fractident) = (-9,-9,-9,-9);
# }
# }
# else {
# ($hitlen,$nident,$ngaps,$fractident) = (0,0,0,0);
# }
# }
# else {
# ($numhits,$hitlen,$nident,$ngaps,$fractident) = (-99,-99,-99,-99,-99);
# }
#
# } else {
# print STDERR "WARNING: No response from server for read $seqname\n\n";
# ($numhits,$hitlen,$nident,$ngaps,$fractident) = (-999,-999,-999,-999,-999);
# }
# ($numhits,$hitlen,$nident,$ngaps,$fractident);
#}
|
get_lanes_for_enzyme.pl Code Block |
---|
#!/usr/bin/perl
# Read the key file ane produce a list of lanes treated with each enzyme
@enzymes =("ApeKI", "EcoT22I", "PstI");
%lanes = {};
# Parse the header line of the key file in search for indices corresponding
# to enzyme and flowcell ID. Just in case the order of columns ever changes....
$line=<>;
chomp $line;
@aux = split "\t", $line;
for($i=0;$i<=$#aux;$i++)
{
if($aux[$i] eq "Lane") { $index_lane = $i; }
if($aux[$i] eq "Enzyme") { $index_enzyme = $i; }
}
while($line=<>)
{
chomp $line;
@aux = split "\t", $line;
$lanes{$aux[$index_enzyme]}{$aux[$index_lane]}++;
}
# print out the result...
foreach $enzyme (keys %lanes)
{
if($lanes{$enzyme} eq "") { next; }
print "$enzyme ";
foreach $lane (sort {$a <=> $b} keys %{$lanes{$enzyme}})
{
if($lanes{$enzyme}{$lane} eq "") { next; }
print "$lane ";
}
print "\n";
}
|
get_zea_lanes.pl Code Block |
---|
#!/usr/bin/perl
# Read the key file and produce a list of maize lanes
# to be processed with QC2 (BLAST). Currently: genus zea and anzyme ApeKI.
%lanes = {};
# Parse the header line of the key file in search for indices corresponding
# to enzyme and flowcell ID. Just in case the order of columns ever changes....
$line=<>;
chomp $line;
@aux = split "\t", $line;
for($i=0;$i<=$#aux;$i++)
{
if($aux[$i] eq "Lane") { $index_lane = $i; }
if($aux[$i] eq "Genus") { $index_genus = $i; }
if($aux[$i] eq "Enzyme") { $index_enzyme = $i; }
}
while($line=<>)
{
chomp $line;
@aux = split "\t", $line;
if(lc($aux[$index_genus]) eq "zea" && $aux[$index_enzyme] eq "ApeKI") { $lanes{$aux[$index_lane]}++; }
}
# print out the result...
foreach $lane (keys %lanes)
{
if($lanes{$lane} eq "") { next; }
print "$lane\n";
}
|
parseAdapterDimers.pl Code Block |
---|
#!/usr/bin/perl -w
use strict;
my $flowcell;
if ($ARGV[0]) {
$flowcell = $ARGV[0];
}
else {
die "\n\nUSAGE: parseAdapterDimers.pl FLOWCELL\n\n\n";
}
print STDOUT "\n\nGreetings Panzean! Let's parse the adapter dimer file output from GBS_qual_control1.pl for flowcell $flowcell...\n\n";
my $in_file = $flowcell . "_adapter_dimers.txt";
open IN, "<$in_file" or die "ERROR: Cannot open input file \'$in_file\': $!\n\n";
my $dimercount = 0;
my ($name, $read);
my ($machine, $run, $fc, $lane_num, $tile, $x, $y, $sample, $pass_filter);
my ($barcode, $dimer);
my %dimersByLaneBarcode;
my %samplesByLaneBarcode;
while (<IN>) {
$name = $_;
$name =~ s/^>//;
($machine, $run, $fc, $lane_num, $tile, $x, $y, $sample, $pass_filter) = split(":",$name);
$read = <IN>;
($barcode, $dimer) = split("__",$read);
++$dimersByLaneBarcode{$lane_num}{$barcode};
$samplesByLaneBarcode{$lane_num}{$barcode} = $sample;
++$dimercount;
}
close IN;
my $out_file = $flowcell . "_adapter_dimers_parsed.txt";
open OUT, ">$out_file" or die "ERROR: Cannot open output file \'$out_file\': $!\n\n";
print OUT join("\t", qw[Flowcell Lane Barcode Sample nAdapterDimers]), "\n";
my $lanecount = 0;
foreach my $lane (sort {$a<=>$b} keys %dimersByLaneBarcode) {
++$lanecount;
foreach my $bc (sort keys %{ $dimersByLaneBarcode{$lane} }) {
print OUT join("\t", $fc, $lane, $bc, $samplesByLaneBarcode{$lane}{$bc}, $dimersByLaneBarcode{$lane}{$bc}), "\n";
}
}
close OUT;
print STDOUT "\t...$dimercount dimers were found in the adapter dimer file, which covered $lanecount lanes\n\n";
print STDOUT "\n\nFinished!\n\n"
. "For the parsed results, see:\n\t$out_file\n";
print STDOUT "\n\n\nFare thee well, Panzean!!\n\n\n";
|
QC.cfg Code Block |
---|
FLOWCELLROOT=/data1/maizediv/illumina/GBSFlowCells/
QUALROOT=/data1/bukowski/qual_control_tmp/
BINROOT=/data1/bukowski/panzea/panzea_filemgm/
BLASTDB=/data1/maizediv/illumina/qual_control/QC2_blast_db/Zea_ApeKI_GBS_QC_db_091102.fas
BLASTPROG=/data1/bukowski/panzea/blast-2.2.24/bin/blastall
|
run_QC.pl Code Block |
---|
#!/usr/bin/perl
use Cwd 'abs_path';
use File::Basename;
my $bindirectory = dirname(abs_path($0)) . "/";
require $bindirectory . "functions.pl";
# Given the flowcell name, reads format, flag on whether the reads are in a subdirectory under flowcell dir,
# and assuming the key file present
# in the flowcell directory, run the whole QC pipeline....
# Look for the configuration file called QC.cfg located in the directory where the script
# is called from...
my %cfgparams = read_config_params();
my $flowcell = $ARGV[0];
my $format = $ARGV[1]; # fasq or qseq
my $insubdir = $ARGV[2]; # 1 if files are in subdir under flowcell dir, 0 otherwise
# Read off the parameters.....
my $flowcellroot = $cfgparams{"FLOWCELLROOT"};
my $qualroot = $cfgparams{"QUALROOT"};
my $binroot = $cfgparams{"BINROOT"};
my $key_file = $flowcellroot . $flowcell . "/" . $flowcell . "_key.txt";
my $datadir = $flowcellroot . $flowcell;
my $qualdir = $qualroot . $flowcell;
# Create qualdir if it does not yet exist
unless( -d $qualdir)
{
mkdir $qualdir or die "Cannot create directory $qualdir\n";
}
# generate enzyme-lane mapping file (all_lanes) and
# maize lane list file (maize_lanes digested with ApeKI - subject to QC2)
chdir($qualdir);
system($binroot . "get_lanes_for_enzyme.pl < $key_file > all_lanes");
system($binroot . "get_zea_lanes.pl < $key_file > maize_lanes");
# Make enzyme directories and run the pipelines
my @enzymes = `awk '{print \$1}' all_lanes`;
foreach $enz (@enzymes)
{
chdir($qualdir);
chomp $enz;
mkdir $enz;
$cmd = $binroot ."GBS_qual_control1.pl $flowcell $format gz $enz ./all_lanes ./maize_lanes $insubdir >& QC1_$enz.log";
system($cmd);
chdir($enz);
$cmd = $binroot ."parseAdapterDimers.pl $flowcell >> ../QC1_$enz.log 2>&1";
system($cmd);
$cmd = $binroot ."GBS_qual_control2.pl $flowcell ../maize_lanes >& ../QC2_$enz.log";
system($cmd);
system("rm ${flowcell}_*_QC.fasta");
}
chdir($qualdir);
|
|