functions.pl
# General-purpose functions called by other scripts # Return a hash of config parameters from the file QC.cfg # in the format # # PARAMETER=value # # sub read_config_params() { my $bindir = dirname(abs_path($0)) . "/"; my $cfgfile = $bindir . "QC.cfg"; my %cfgparams; my $incfg; open($incfg,$cfgfile) or die "Cannot open $cfgfile\n"; while($line=<$incfg>) { chomp $line; ($key,$value) = split "=",$line; $cfgparams{$key} = $value; } close $incfg; return %cfgparams; } 1;
GBS_qual_control1.pl
#!/usr/bin/perl -w use strict; use Cwd 'abs_path'; use File::Basename; my $bindirectory = dirname(abs_path($0)) . "/"; require $bindirectory . "functions.pl"; # Get machine-specific configuration parameters my %cfgparams = read_config_params(); my ($flowcell, $qseq, $gz, $enzyme, $lane_file, $zea_lane_file, $s_lane, $insubdir, $prefix); my $usage = 'GBS_qual_control1.pl FLOWCELL qseq|fastq txt|gz ApeKI|PstI|EcoT22I lane_file maize_lane_file insubdir'; if ($ARGV[0] && $ARGV[1] && $ARGV[2] && $ARGV[3] && $ARGV[4] && $ARGV[5]) { $flowcell = $ARGV[0]; $lane_file = $ARGV[4]; # file with one space-delimited line per (present) enzyme, e.g., ApeKI 1 4 6 $zea_lane_file = $ARGV[5];# one maize lane per line $insubdir = $ARGV[6]; if ($ARGV[1] eq "qseq") { $qseq = 1; } elsif ($ARGV[1] eq "fastq") { $qseq = 0; } else { die "USAGE: $usage\n\n\n"; } if ($ARGV[2] eq "gz") { $gz = 1; } elsif ($ARGV[2] eq "txt") { $gz = 0; } else { die "USAGE: $usage\n\n\n"; } if ($ARGV[3] eq "ApeKI") { $enzyme = "ApeKI"; } elsif ($ARGV[3] eq "PstI") { $enzyme = "PstI"; } elsif ($ARGV[3] eq "EcoT22I") { $enzyme = "EcoT22I"; } else { die "Unknown enzyme: $ARGV[3]\n\nUSAGE: $usage\n\n\n"; } } else { die "USAGE: $usage\n\n\n"; } # grab the lanes for this enzyme my ($txtline, $junk); my @Lanes = (); open IN, "<$lane_file" or die "ERROR: Cannot open lanes file $lane_file: $!\n\n"; while($txtline = <IN>) { chomp $txtline; if($txtline =~ m/$enzyme/) { ($junk, @Lanes) = split " ", $txtline; } } close IN; if($#Lanes < 0) { print STDOUT "No lanes to process for this enzyme...\n"; exit; } # Grab the maize lanes my %maizelanes = (1 => "", 2 => "", 3 => "", 4 => "", 5 => "", 6 => "", 7 => "", 8 => "", "ALL" => ""); open IN, "<$zea_lane_file" or die "ERROR: Cannot open lanes file $zea_lane_file: $!\n\n"; while($txtline = <IN>) { chomp $txtline; $maizelanes{$txtline} = 1; } close IN; # In the future, do some consistency checking on the lanes provided - should be all different and between 1 and 8 #my $key_file = "/usr/local/maizediv/illumina/qual_control/" . $flowcell . "/" . $flowcell . "_key.txt"; #my $key_file = "/cbsufsrv4/data1/maizediv/illumina/PerkinElmer/" . $flowcell . "/" . $flowcell . "_key.txt"; # RB change my $flowcellroot = $cfgparams{"FLOWCELLROOT"}; my $key_file = $flowcellroot . $flowcell . "/" . $flowcell . "_key.txt"; # Data Input files - take them from lane list on command line my %infiles; # key = lane, value = filename #$infiles{1} = 'D09FYACXX_1_fastq.gz'; #$infiles{2} = 'B064FABXX_2_qseq.txt.gz'; #$infiles{3} = 'B064FABXX_3_qseq.txt.gz'; #$infiles{4} = 'B064FABXX_4_qseq.txt.gz'; #$infiles{5} = 'B064FABXX_5_qseq.txt.gz'; #$infiles{6} = 'B064FABXX_6_qseq.txt.gz'; #$infiles{7} = 'D09FYACXX_7_fastq.gz'; #$infiles{8} = '10247084_C00R8ABXX_s_8_qseq.txt.gz'; my $subdirstr = ""; if($insubdir) { $subdirstr = "fastq/"; if($qseq) { $subdirstr = "qseq/"; } } my $flestr; my $cmd; foreach (@Lanes) { $cmd = "ls -1 " . $flowcellroot . $flowcell . "/" . $subdirstr . "*$flowcell" . "*_$_" . "_*.gz"; $flestr = `$cmd`; chomp $flestr; $infiles{$_} = $flestr; } # my $output_path = '/usr/local/maizediv/illumina/qual_control/' . $flowcell; # my $output_path = '/cbsufsrv4/data1/maizediv/illumina/qual_control/' . $flowcell . '/'; my $output_path = "./$ARGV[3]/"; # call it after enzyme #$output_path = $qseq ? $output_path . '/prefilter/' : $output_path . '/postfilter/'; # no longer relevant with Casava 1.8 print STDOUT "\n\nGreetings Panzean! Let's test the quality of Illumina flow cell $flowcell by checking for barcodes & cut sites...\n\n"; print STDOUT "The following files will be processed:\n\n"; foreach my $key (keys %infiles) { if($infiles{$key} eq "") { next; } print STDOUT "Lane $key: $infiles{$key}"; if($maizelanes{$key} ne "") { print STDOUT " (maize lane) "; } else { print STDOUT " (non-maize lane) "; } print STDOUT "\n"; } print STDOUT "\n"; # exit; # just for test my ($cut_site_remnant, $cut_site_remnant1, $cut_site_remnant2, $common_adapter, $common_adapter1, $common_adapter2); if ($enzyme eq "ApeKI") { $cut_site_remnant = 'C[AT]GC'; $cut_site_remnant1 = 'CAGC'; $cut_site_remnant2 = 'CTGC'; $common_adapter = 'C[AT]GAGATCGGAAGAGCGGTTCAGCAGG'; # first 27 bases $common_adapter1 = 'CAGAGATCGGAAGAGCGGTTCAGCAGG'; # first 27 bases $common_adapter2 = 'CTGAGATCGGAAGAGCGGTTCAGCAGG'; # first 27 bases # AGATCGGAAGAGCGGTTCAGCAGG = common adapter minus overhang } elsif ($enzyme eq "PstI") { $cut_site_remnant = 'TGCAG'; $cut_site_remnant1 = 'TGCAG'; $cut_site_remnant2 = 'none'; $common_adapter = 'TGCAAGATCGGAAGAGCGGTTCAGCAGG'; # first 28 bases $common_adapter1 = 'TGCAAGATCGGAAGAGCGGTTCAGCAGG'; # first 28 bases # AGATCGGAAGAGCGGTTCAGCAGG = common adapter minus overhang $common_adapter2 = 'none'; # first 27 bases } elsif ($enzyme eq "EcoT22I") { $cut_site_remnant = 'TGCAT'; $cut_site_remnant1 = 'TGCAT'; $cut_site_remnant2 = 'none'; $common_adapter = 'TGCAAGATCGGAAGAGCGGTTCAGCAGG'; # first 28 bases $common_adapter1 = 'TGCAAGATCGGAAGAGCGGTTCAGCAGG'; # first 28 bases # AGATCGGAAGAGCGGTTCAGCAGG = common adapter minus overhang $common_adapter2 = 'none'; # first 27 bases } # read in the barcodes (read from a file for now -- will read from DB later?) my %barcodes; # key1 = lane, key2 = barcode, value = sample my %barcode_platemap; # key1 = lane, key2 = barcode, value = plate__row__column my %barcode_counts; # key1 = lane, key2 = barcode, value = count my %barcode_n; # key = lane, value = n_barcodes in that lane that are not "empty" or "blank" (case insensitive) my %plates_by_lane; # key = lane, value = plate my ($tot_n_barcodes, $sum_sqrs, $sum_sqrs_lane, $mean, $sd, $coeff_var); my @input_line; open IN, "<$key_file" or die "ERROR: Cannot open multiplex key file $key_file: $!\n\n"; <IN>; # skip header line while (<IN>) { s/\r$//; # convert DOS to unix if necessary chomp; @input_line = split "\t"; my ($fcell, $lane, $barcode, $sample, $plate, $row, $col) = @input_line[0,1,2,3,4,5,6]; die "ERROR: Flow cell mismatch in barcode key file: $fcell (expect $flowcell)\n\n" if $fcell ne $flowcell; if (exists $infiles{$lane} ) { # only work with the lanes that are in the infile list $barcodes{$lane}{$barcode} = $sample; $barcode_platemap{$lane}{$barcode} = join "\t", $plate, $row, $col; $plates_by_lane{$lane} = $plate; $barcode_counts{$lane}{$barcode} = 0; unless ($sample =~ /empty/i || $sample =~ /blank/i) { ++$barcode_n{$lane}; ++$tot_n_barcodes; } } } print STDOUT "\nFinished reading barcodes for $tot_n_barcodes non-blank samples from the key file...\n"; close IN; # Output files: my $outfile; # a fasta file (one per lane) containing each trimmed read (barcode removed) that passes our criteria (barcode & cut site, no Ns in 64 bp, no adapter dimers, etc) # - these LARGE files will be sampled from for BLAST to a set of reference reads in part 2 = nextgen_qual_control2.pl # - the files should be deleted thereafter to save disk space my $rptfile = $output_path . $flowcell . "_QC1.txt"; # pass_counts by lane open RPT, ">$rptfile" or die "ERROR: Cannot open output file \'$rptfile\': $!\n\n"; print RPT join("\t", "Lane", "Plate", "Pass_Count", "Read_Count", "Pass_Rate", "Pass_Filter_Count", "Pass_Filter_Rate", "Adapter_dimers", "n_samples", "Mean_count", "SD_count", "CV_count_(%)"), "\n"; my $rptfile2 = $output_path . $flowcell . "_QC2.txt"; # pass_counts by barcode within lane my $dimer_file = $output_path . $flowcell . "_adapter_dimers.txt"; open DIMERS, ">$dimer_file" or die "ERROR: Cannot open reverse adapter file \'$dimer_file\': $!\n\n"; my $lane; my @line; my $N_char; my ($barcode, $seqlen, $dimer); my ($machine,$run,$fc,$lane_num,$tile,$x,$y,$seq,$qual,$pair_member,$filtered,$cntrl,$pass_filter); my $barcode_matched; # boolean my ($read_count, $pass_count, $pass_filter_count, $n_short_seqs, $n_dimers) = (0, 0, 0, 0, 0, 0); my ($pass_count_lane, $read_count_lane, $pass_filter_count_lane, $pass_filter_rate_lane, $pass_filter_rate, $n_short_seqs_lane, $n_dimers_lane); foreach $lane (sort {$a <=> $b} keys %infiles) { $outfile = $output_path . $flowcell . "_" . $lane . "_QC.fasta"; if($maizelanes{$lane} ne "") { open OUT, ">$outfile" or die "ERROR: Cannot open output file \'$outfile\': $!\n\n"; } else { print STDOUT "Lane $lane is not maize - no fasta file will be created.\n"; } ($pass_count_lane, $read_count_lane, $pass_filter_count_lane, $n_short_seqs_lane, $n_dimers_lane) = (0, 0, 0, 0, 0, 0); if ($gz) { open IN, "gunzip -c $infiles{$lane} |" or die "ERROR: Cannot open compressed (*.gz) input file: $!\n\n\n"; } else { open IN, "<$infiles{$lane}" or die "ERROR: Cannot open input file: $!\n\n\n"; } print STDOUT "\n\nReading data from the input file: $infiles{$lane}...\n"; while (<IN>) { $barcode_matched = 0; chomp; if ($qseq) { $N_char = '.'; if (/^@/) { die "\n\n***ERROR: looks like a fastq file (not qseq)***\n\n"; } @line = split "\t"; ($machine,$run,$lane_num,$tile,$x,$y,$seq,$qual,$pass_filter) = @line[0,1,2,3,4,5,8,9,10]; $pass_filter_count += $pass_filter; $pass_filter_count_lane += $pass_filter; } else { # fastq $N_char = 'N'; my $line1 = $_; # unless ($line1 =~ s<^@(\S+)#0/1$><$1>) # old style fastq unless ($line1 =~ s<^@(\S+)\s(\S+)$><$1:$2>) # Cassava 1.8 fastq. Replace space with colon { die "\n\n***ERROR: looks like a qseq file (not fastq)***\n\n"; } ($machine,$run,$fc,$lane_num,$tile,$x,$y,$pair_member,$filtered,$cntrl) = split ":", $line1; warn "WARNING: Flowcell ($fc) does not match expected ($flowcell)\n" if $fc ne $flowcell; $pass_filter = $filtered eq 'N'? 1 : 0; # N means "not filtered" which means that it passed filter $pass_filter_count += $pass_filter; $pass_filter_count_lane += $pass_filter; chomp($seq = <IN>); my $line3 = <IN>; chomp($qual = <IN>); } my $cutsite_start = index($seq, $cut_site_remnant1); if ($cutsite_start > 3 && $cutsite_start < 11) { $barcode = substr($seq, 0, $cutsite_start); if (exists $barcodes{$lane}{$barcode}) { $barcode_matched = 1; $seq =~ s/^$barcode//; # trim the barcode } } else { $cutsite_start = index($seq, $cut_site_remnant2); if ($cutsite_start > 3 && $cutsite_start < 11) { $barcode = substr($seq, 0, $cutsite_start); if (exists $barcodes{$lane}{$barcode}) { $barcode_matched = 1; $seq =~ s/^$barcode//; # trim the barcode } } else { # check for adapter/adapter dimers my $common_adapter_start = index($seq, $common_adapter1); if ($common_adapter_start > 3 && $common_adapter_start < 11) { $barcode = substr($seq, 0, $common_adapter_start); if (exists $barcodes{$lane}{$barcode}) { ++$n_dimers_lane; ++$n_dimers; $seq =~ s/^$barcode//; # trim the barcode print DIMERS ">", join(":", $machine, $run, $flowcell, $lane_num, $tile, $x, $y, $barcodes{$lane}{$barcode}, $pass_filter), "\n"; print DIMERS $barcode, "__", $seq, "\n"; } } else { $common_adapter_start = index($seq, $common_adapter2); if ($common_adapter_start > 3 && $common_adapter_start < 11) { $barcode = substr($seq, 0, $common_adapter_start); if (exists $barcodes{$lane}{$barcode}) { ++$n_dimers_lane; ++$n_dimers; $seq =~ s/^$barcode//; # trim the barcode print DIMERS ">", join(":", $machine, $run, $flowcell, $lane_num, $tile, $x, $y, $barcodes{$lane}{$barcode}, $pass_filter), "\n"; print DIMERS $barcode, "__", $seq, "\n"; } } } } } if ($barcode_matched) { if (index($seq,$N_char) == -1 || index($seq,$N_char) > 63) { # sequences with N's don't pass ++$barcode_counts{$lane}{$barcode}; unless ($barcodes{$lane}{$barcode} =~ /empty/i || $barcodes{$lane}{$barcode} =~ /blank/i) { if($maizelanes{$lane} ne "") { print OUT ">", join(":", $machine, $run, $flowcell, $lane_num, $tile, $x, $y, $barcodes{$lane}{$barcode}, $pass_filter), "\n"; print OUT "$seq\n"; } ++$pass_count; ++$pass_count_lane; } } } ++$read_count; ++$read_count_lane; } $sum_sqrs_lane = 0; foreach my $brcd (keys %{ $barcode_counts{$lane} } ) { unless ($barcodes{$lane}{$brcd} =~ /empty/i || $barcodes{$lane}{$brcd} =~ /blank/i) { $sum_sqrs_lane += $barcode_counts{$lane}{$brcd}**2; $sum_sqrs += $barcode_counts{$lane}{$brcd}**2; } } close IN; if($maizelanes{$lane} ne "") { close OUT; } $mean = $pass_count_lane/$barcode_n{$lane}; $sd = sqrt( ($sum_sqrs_lane/$barcode_n{$lane})-($mean**2) ); $coeff_var = $mean ? sprintf("%.2f",100*$sd/$mean) : 0; # if ($qseq) { #### Cassava 1.8 fastq now contains both unfiltered (passed filter) and filtered reads $pass_filter_rate_lane = $read_count_lane ? sprintf("%.4f", $pass_filter_count_lane/$read_count_lane) : 0; # } # else { # $pass_filter_rate_lane = "NA"; # } my $pass_rate_lane = $read_count_lane ? $pass_count_lane/$read_count_lane : 0; print RPT join "\t", $lane, $plates_by_lane{$lane}, $pass_count_lane, $read_count_lane, sprintf("%.4f", $pass_rate_lane), $pass_filter_count_lane, $pass_filter_rate_lane, $n_dimers_lane, $barcode_n{$lane}, sprintf("%.0f", $mean), sprintf("%.0f", $sd), $coeff_var; print RPT "%\n"; print STDOUT "...finished processing $ARGV[1] file $infiles{$lane} for plate $plates_by_lane{$lane}\n" . "pass filter count: $pass_filter_count_lane\n" . "adapter dimers: $n_dimers_lane\n" . "barcode_CV: $coeff_var%\n" . "pass count: $pass_count_lane\n" . "read count: $read_count_lane\n"; } $mean = $pass_count/$tot_n_barcodes; $sd = sqrt( ($sum_sqrs/$tot_n_barcodes) - ($mean**2) ); $coeff_var = $mean ? sprintf("%.2f",100*$sd/$mean) : 0; # if ($qseq) { #### Cassava 1.8 fastq now contains both unfiltered (passed filter) and filtered reads $pass_filter_rate = $read_count ? sprintf("%.4f", $pass_filter_count/$read_count) : 0; #} #else { # $pass_filter_rate = "NA"; #} my $pass_rate = $read_count ? $pass_count/$read_count : 0; print RPT join "\t", 'ALL', 'ALL', $pass_count, $read_count, sprintf("%.4f", $pass_rate), $pass_filter_count, $pass_filter_rate, $n_dimers, $tot_n_barcodes, sprintf("%.0f", $mean), sprintf("%.0f", $sd), $coeff_var; print RPT "%\n"; close RPT; open RPT, ">$rptfile2" or die "ERROR: Cannot open output file \'$rptfile2\': $!\n\n"; print RPT join("\t", "Lane_Barcode", "Lane", "Barcode", "Sample", "Pass_Count", "Plate", "Row", "Col"), "\n"; foreach my $ln (sort {$a<=>$b} keys %barcode_counts) { foreach my $brcd (sort keys %{ $barcode_counts{$ln} } ) { print RPT $ln, "_", $brcd, "\t", join("\t", $ln, $brcd, $barcodes{$ln}{$brcd}, $barcode_counts{$ln}{$brcd}, $barcode_platemap{$ln}{$brcd}), "\n"; } } print STDOUT "\n\nFinished!\n\n" . " In total, $pass_filter_count reads passed filter\n" . " and $n_dimers adaptor dimers were detected\n" . " and $pass_count reads matched the barcode and cut site and had no N's within the 64 base read\n" . " out of $read_count reads examined (in $ARGV[1] format)\n\nSee the following files for output:\n" . "\t$outfile\n" . "\t$rptfile\n" . "\t$rptfile2\n" . "\t$dimer_file\n"; print STDOUT "\n\n\nFare thee well, Panzean!!\n\n\n";
GBS_qual_control2.pl
#!/usr/bin/perl -w use strict; #use Bio::SearchIO; use POSIX qw(floor); # use File::Temp "tempfile"; use Cwd 'abs_path'; use File::Basename; my $bindirectory = dirname(abs_path($0)) . "/"; require $bindirectory . "functions.pl"; # Look for the configuration file called QC.cfg located in the directory where the script # is called from... my %cfgparams = read_config_params(); my $blastdb = $cfgparams{"BLASTDB"}; my $blastprog = $cfgparams{"BLASTPROG"}; # This will hold zea lanes specified in $maize_lanes_list my %maizelanes = (1 => "", 2 => "", 3 => "", 4 => "", 5 => "", 6 => "", 7 => "", 8 => "", "ALL" => ""); my $flowcell; my $maize_lanes_list; if ($#ARGV == 1) { $flowcell = $ARGV[0]; $maize_lanes_list = $ARGV[1]; } else { die "\n\nUSAGE: GBS_qual_control2.pl FLOWCELL maize_lanes_list\n\n\n"; } # read in the maize lanes (one lane per line) open IN, "<$maize_lanes_list" or die "ERROR: Cannot open maize lanes file $maize_lanes_list: $!\n\n"; my $line; while($line=<IN>) { chomp $line; $maizelanes{$line} = 1; } close IN; print STDOUT "\n\nGreetings Panzean! Let's test the quality of Illumina GAII flow cell $flowcell via BLAST to the reference genome...\n\n"; # read in the pass_counts = n_reads per lane matching the bar code and cut site, no rev_adapt starting < 24 bases (read from a file for now -- will read from DB later?) my %passcounts; # key1 = lane, value = pass_count my @input_line; my ($lane, $passcount); my (%platename,%readcount,%passfilter,%dimers,%n_DNAs,%cv_barcode_count); # key = lane my $passcount_file = $flowcell . "_QC1.txt"; open IN, "<$passcount_file" or die "ERROR: Cannot open pass_count file $passcount_file: $!\n\n"; <IN>; # skip header line while (<IN>) { s/\r$//; # convert DOS to unix if necessary chomp; @input_line = split "\t"; $lane = $input_line[0]; if($maizelanes{$lane} eq "") { next; } # Only consider lanes specified on command line ($platename{$lane},$passcounts{$lane},$readcount{$lane},$passfilter{$lane},$dimers{$lane},$n_DNAs{$lane},$cv_barcode_count{$lane}) = @input_line[1,2,3,5,7,8,11] unless $lane eq 'ALL'; } close IN; my $numofzealanes = (scalar keys %passcounts); if($numofzealanes == 0) { print STDOUT "No maize lanes to process - exiting...\n"; goto NOZEALANES; } else { print STDOUT "The following maize lanes will be subject to BLAST test:\n"; foreach $lane (sort {$a<=>$b} keys %passcounts) { print STDOUT "$lane\n"; } # exit; # this is just for test } my $QC_stats_file = $flowcell . "_QC_stats.txt"; open RES, ">$QC_stats_file" or die "ERROR: cannot open QC_stats file $QC_stats_file: $!\n\n"; print RES join("\t", qw(Flowcell Machine Run Lane Plate nSamples Reads PassFilter Dimers BarcodedEtc %barcoded CV BlastSample %1+hits AvgHitLenAll AvgHitLenHits MedianHitLen EstGoodReads EstGoodReads/DNA)), "\n"; # go through each lane's fasta file, write a random sample of reads to a new fasta file, perform BLAST and write results my $rand_num; my %random_reads; # key = rand_num, value = 1 (arbitrary) foreach $lane (sort {$a<=>$b} keys %passcounts) { print STDOUT "Sampling reads for BLASTing from lane $lane (1000 random reads)...\n"; %random_reads = (); if ($passcounts{$lane} > 5000) { # get 1000 random reads (with good barcodes and cut sites) per lane for my $read (0 .. 999) { $rand_num = int(rand($passcounts{$lane})) + 1; redo if exists $random_reads{$rand_num}; $random_reads{$rand_num} = 1; } # write these reads to a fasta for blast my $in_fasta_file = $flowcell . "_" . $lane . "_QC.fasta"; # temp fasta file filtered for reads having bardcode and cut site, etc open IN, "<$in_fasta_file" or die "ERROR: Cannot open fasta file $in_fasta_file: $!\n\n"; my $out_fasta_file = $flowcell . "_" . $lane . "_sample.fasta"; # fasta file containing the sampled reads to be blasted open OUT, ">$out_fasta_file" or die "ERROR: Cannot open output file \'$out_fasta_file\': $!\n\n"; my ($readcount, $writecount) = (0, 0); my ($name,$read); while (<IN>) { $name = $_; $read = <IN>; ++$readcount; if (exists $random_reads{$readcount}) { print OUT $name; print OUT $read; ++$writecount; } } close IN; close OUT; print STDOUT "\t...$writecount reads were sampled from lane $lane for BLASTing out of $readcount reads with barcodes and cut sites\n\n"; # perform the blast of the sample from this lane print STDOUT "\tNow performing BLAST for sample from lane $lane...\n"; my $blast_results_file = $flowcell . '_' . $lane . '_sample_blastresults.txt'; # open OUT, ">$blast_results_file" or die "ERROR: Cannot open output file \'$blast_results_file\': $!\n\n"; # close OUT; my $commandline = "$blastprog -p blastn -d $blastdb -i $out_fasta_file -e 0.00001 -K 1 -m 8 -o $blast_results_file"; unless ( system ($commandline) ) { print STDOUT "\t...BLAST was sucessful\n\n"; } else { die "\n\n***ERROR: the blastall command failed. Program aborted.\n\n\n"; } # parse the table of blast results print STDOUT "\tparsing top hits from BLAST results...\n"; open IN, "<$blast_results_file" or die "\n\nERROR: cannot open blast results file $blast_results_file: $!\n\n\n"; my $summary_file = $flowcell . '_' . $lane . "_blast_summary.txt"; open OUT, ">$summary_file" or die "ERROR: Cannot open BLAST summary file \'$summary_file\': $!\n\n"; print OUT join "\t", "Query", "Hit?", "Top_hit_align_len", "Top_hit_len", "%ident", "n_mismatches", "n_gap_openings", "e_value", "bit_score\n"; my @hit_lens = (); my ($hitlen, $current_read, $ident, $align_len, $n_mismatch, $n_gap_openings, $q_start, $q_end, $hit_start, $hit_end, $e_val, $bit_score); my ($tot_n_hits, $prop_that_hit, $avg_hit_len, $median_hit_len) = (0, 0, 0, 0); my $prev_read = "nada"; while (<IN>) { chomp; @input_line = split "\t"; # 0 (1) 2 3 4 5 6 7 8 9 10 11 # Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score $current_read = $input_line[0]; if ($current_read ne $prev_read) { ($ident, $align_len, $n_mismatch, $n_gap_openings, $q_start, $q_end, $hit_start, $hit_end, $e_val, $bit_score) = @input_line[2..11]; ++$prop_that_hit; $hitlen = abs($hit_end - $hit_start) + 1; $avg_hit_len += $hitlen; push @hit_lens, $hitlen; print OUT join "\t", $current_read, 1, $align_len, $hitlen, $ident, $n_mismatch, $n_gap_openings, $e_val, "$bit_score\n"; $prev_read = $current_read; } ++$tot_n_hits; } close IN; close OUT; print STDOUT "\t...finished parsing $tot_n_hits BLAST results for the $prop_that_hit top hits\n\n"; # calculate and print the summary QC statistics my ($machine,$run,$fc,$ln,$tile,$x,$y,$sample,$pass_filt) = split ":", $current_read; my $n_hits = $prop_that_hit; my $avg_hit_len_one_plus = $avg_hit_len/$n_hits; $avg_hit_len = $avg_hit_len / $writecount; $prop_that_hit = $prop_that_hit / $writecount; my $size = @hit_lens; print STDERR "\n\nWARNING: size of array hit_lens not equal to n_hits\n\n" unless $n_hits == $size; @hit_lens = sort {$a <=> $b} @hit_lens; $median_hit_len = $hit_lens[floor($n_hits/2)-1]; print RES join( "\t", $flowcell, $machine, $run, $lane, $platename{$lane}, $n_DNAs{$lane}, $readcount{$lane}, $passfilter{$lane}, $dimers{$lane}, $passcounts{$lane}, sprintf("%.2f",100*$passcounts{$lane}/$readcount{$lane}).'%', $cv_barcode_count{$lane}, $writecount, sprintf("%.2f",100*$prop_that_hit).'%', sprintf("%.2f", $avg_hit_len), sprintf("%.2f", $avg_hit_len_one_plus), $median_hit_len, sprintf("%.0f",$prop_that_hit*$passcounts{$lane}), sprintf("%.0f",$prop_that_hit*$passcounts{$lane}/$n_DNAs{$lane}) ), "\n"; } else { print STDOUT "\t\tLane $lane skipped (not BLASTed) because fewer than 5000 good reads\n"; print RES join("\t", $flowcell, 'machine', 'run', $lane, $platename{$lane}, $n_DNAs{$lane}, $readcount{$lane}, $passfilter{$lane}, $dimers{$lane}, $passcounts{$lane}, sprintf("%.2f",100*$passcounts{$lane}/$readcount{$lane}).'%', $cv_barcode_count{$lane}, 0, 'NA', 'NA', 'NA', 'NA', 0, 0), "\n"; } } close RES; print STDOUT "\n\nFinished!\n\n" . "For results summary, see:\n\t$QC_stats_file\n"; NOZEALANES: print STDOUT "\n\n\nFare thee well, Panzean!!\n\n\n"; #sub doblast { # my ($seqname,$seq) = @_[0,1]; # my ($numhits,$hitlen,$nident,$ngaps,$fractident); # ## For some reason, these temp files don't seem to get deleted (cron job? This is not what the perl documentation says) ## my ( $seq_fh, $tmpqueryfilename )=tempfile("blastqueryXXXXXX", SUFFIX => '.tmp'); ## print $seq_fh $seqname, $seq; ## close $seq_fh; ## my ( $res_fh, $tmpblastresultfile )=tempfile("blastresultXXXXXX", SUFFIX => '.tmp'); # # my $tmpqueryfilename = 'blastquery.tmp'; # open SEQ, ">$tmpqueryfilename"; # print SEQ $seqname, $seq; # close SEQ; # # my $tmpblastresultfile = 'blastresult.tmp'; # open RES, ">$tmpblastresultfile"; # close RES; # # my $commandline = "blastall -p blastn -d ../count091102b.fas -i $tmpqueryfilename -e 0.00001 -K 1 -m 7 -o $tmpblastresultfile"; # my $runblast = system ($commandline); # unless ( $runblast =~/^\[blastall\].*ERROR/) { # my $searchio = new Bio::SearchIO( -format => 'blastxml', # -file => $tmpblastresultfile ); # # if ( my $result = $searchio->next_result ) { # $numhits = $result->num_hits; # if ( my $hit = $result->next_hit ) { # if (my $hsp = $hit->next_hsp) { # $hitlen = $hsp->hsp_length; # Note: this is the length of the alignment INCLUDING gaps # $nident = $hsp->num_identical; # $ngaps = $hsp->gaps; # $fractident = $hsp->frac_identical; # } # else { # ($hitlen,$nident,$ngaps,$fractident) = (-9,-9,-9,-9); # } # } # else { # ($hitlen,$nident,$ngaps,$fractident) = (0,0,0,0); # } # } # else { # ($numhits,$hitlen,$nident,$ngaps,$fractident) = (-99,-99,-99,-99,-99); # } # # } else { # print STDERR "WARNING: No response from server for read $seqname\n\n"; # ($numhits,$hitlen,$nident,$ngaps,$fractident) = (-999,-999,-999,-999,-999); # } # ($numhits,$hitlen,$nident,$ngaps,$fractident); #}
get_lanes_for_enzyme.pl
#!/usr/bin/perl # Read the key file ane produce a list of lanes treated with each enzyme @enzymes =("ApeKI", "EcoT22I", "PstI"); %lanes = {}; # Parse the header line of the key file in search for indices corresponding # to enzyme and flowcell ID. Just in case the order of columns ever changes.... $line=<>; chomp $line; @aux = split "\t", $line; for($i=0;$i<=$#aux;$i++) { if($aux[$i] eq "Lane") { $index_lane = $i; } if($aux[$i] eq "Enzyme") { $index_enzyme = $i; } } while($line=<>) { chomp $line; @aux = split "\t", $line; $lanes{$aux[$index_enzyme]}{$aux[$index_lane]}++; } # print out the result... foreach $enzyme (keys %lanes) { if($lanes{$enzyme} eq "") { next; } print "$enzyme "; foreach $lane (sort {$a <=> $b} keys %{$lanes{$enzyme}}) { if($lanes{$enzyme}{$lane} eq "") { next; } print "$lane "; } print "\n"; }
get_zea_lanes.pl
#!/usr/bin/perl # Read the key file and produce a list of maize lanes # to be processed with QC2 (BLAST). Currently: genus zea and anzyme ApeKI. %lanes = {}; # Parse the header line of the key file in search for indices corresponding # to enzyme and flowcell ID. Just in case the order of columns ever changes.... $line=<>; chomp $line; @aux = split "\t", $line; for($i=0;$i<=$#aux;$i++) { if($aux[$i] eq "Lane") { $index_lane = $i; } if($aux[$i] eq "Genus") { $index_genus = $i; } if($aux[$i] eq "Enzyme") { $index_enzyme = $i; } } while($line=<>) { chomp $line; @aux = split "\t", $line; if(lc($aux[$index_genus]) eq "zea" && $aux[$index_enzyme] eq "ApeKI") { $lanes{$aux[$index_lane]}++; } } # print out the result... foreach $lane (keys %lanes) { if($lanes{$lane} eq "") { next; } print "$lane\n"; }
parseAdapterDimers.pl
#!/usr/bin/perl -w use strict; my $flowcell; if ($ARGV[0]) { $flowcell = $ARGV[0]; } else { die "\n\nUSAGE: parseAdapterDimers.pl FLOWCELL\n\n\n"; } print STDOUT "\n\nGreetings Panzean! Let's parse the adapter dimer file output from GBS_qual_control1.pl for flowcell $flowcell...\n\n"; my $in_file = $flowcell . "_adapter_dimers.txt"; open IN, "<$in_file" or die "ERROR: Cannot open input file \'$in_file\': $!\n\n"; my $dimercount = 0; my ($name, $read); my ($machine, $run, $fc, $lane_num, $tile, $x, $y, $sample, $pass_filter); my ($barcode, $dimer); my %dimersByLaneBarcode; my %samplesByLaneBarcode; while (<IN>) { $name = $_; $name =~ s/^>//; ($machine, $run, $fc, $lane_num, $tile, $x, $y, $sample, $pass_filter) = split(":",$name); $read = <IN>; ($barcode, $dimer) = split("__",$read); ++$dimersByLaneBarcode{$lane_num}{$barcode}; $samplesByLaneBarcode{$lane_num}{$barcode} = $sample; ++$dimercount; } close IN; my $out_file = $flowcell . "_adapter_dimers_parsed.txt"; open OUT, ">$out_file" or die "ERROR: Cannot open output file \'$out_file\': $!\n\n"; print OUT join("\t", qw[Flowcell Lane Barcode Sample nAdapterDimers]), "\n"; my $lanecount = 0; foreach my $lane (sort {$a<=>$b} keys %dimersByLaneBarcode) { ++$lanecount; foreach my $bc (sort keys %{ $dimersByLaneBarcode{$lane} }) { print OUT join("\t", $fc, $lane, $bc, $samplesByLaneBarcode{$lane}{$bc}, $dimersByLaneBarcode{$lane}{$bc}), "\n"; } } close OUT; print STDOUT "\t...$dimercount dimers were found in the adapter dimer file, which covered $lanecount lanes\n\n"; print STDOUT "\n\nFinished!\n\n" . "For the parsed results, see:\n\t$out_file\n"; print STDOUT "\n\n\nFare thee well, Panzean!!\n\n\n";
QC.cfg
FLOWCELLROOT=/data1/maizediv/illumina/GBSFlowCells/ QUALROOT=/data1/bukowski/qual_control_tmp/ BINROOT=/data1/bukowski/panzea/panzea_filemgm/ BLASTDB=/data1/maizediv/illumina/qual_control/QC2_blast_db/Zea_ApeKI_GBS_QC_db_091102.fas BLASTPROG=/data1/bukowski/panzea/blast-2.2.24/bin/blastall
run_QC.pl
#!/usr/bin/perl use Cwd 'abs_path'; use File::Basename; my $bindirectory = dirname(abs_path($0)) . "/"; require $bindirectory . "functions.pl"; # Given the flowcell name, reads format, flag on whether the reads are in a subdirectory under flowcell dir, # and assuming the key file present # in the flowcell directory, run the whole QC pipeline.... # Look for the configuration file called QC.cfg located in the directory where the script # is called from... my %cfgparams = read_config_params(); my $flowcell = $ARGV[0]; my $format = $ARGV[1]; # fasq or qseq my $insubdir = $ARGV[2]; # 1 if files are in subdir under flowcell dir, 0 otherwise # Read off the parameters..... my $flowcellroot = $cfgparams{"FLOWCELLROOT"}; my $qualroot = $cfgparams{"QUALROOT"}; my $binroot = $cfgparams{"BINROOT"}; my $key_file = $flowcellroot . $flowcell . "/" . $flowcell . "_key.txt"; my $datadir = $flowcellroot . $flowcell; my $qualdir = $qualroot . $flowcell; # Create qualdir if it does not yet exist unless( -d $qualdir) { mkdir $qualdir or die "Cannot create directory $qualdir\n"; } # generate enzyme-lane mapping file (all_lanes) and # maize lane list file (maize_lanes digested with ApeKI - subject to QC2) chdir($qualdir); system($binroot . "get_lanes_for_enzyme.pl < $key_file > all_lanes"); system($binroot . "get_zea_lanes.pl < $key_file > maize_lanes"); # Make enzyme directories and run the pipelines my @enzymes = `awk '{print \$1}' all_lanes`; foreach $enz (@enzymes) { chdir($qualdir); chomp $enz; mkdir $enz; $cmd = $binroot ."GBS_qual_control1.pl $flowcell $format gz $enz ./all_lanes ./maize_lanes $insubdir >& QC1_$enz.log"; system($cmd); chdir($enz); $cmd = $binroot ."parseAdapterDimers.pl $flowcell >> ../QC1_$enz.log 2>&1"; system($cmd); $cmd = $binroot ."GBS_qual_control2.pl $flowcell ../maize_lanes >& ../QC2_$enz.log"; system($cmd); system("rm ${flowcell}_*_QC.fasta"); } chdir($qualdir);