#!/usr/bin/perl -w # The program fastq_qual_stats.pl takes fastq files and calculates summary statistics for the quality scores. # For each base position, the mean quality score is calculated and the quality scores are sorted in 6 bins # from 0-9, 10-19, 20-29, 30-39, 40-49, >=50. The number of bases and % of total bases at that position in each bin is given as output. # Copyright (C) 2012 Minou Nowrousian # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details (http://www.gnu.org/licenses/). use strict; use warnings; use Getopt::Std; my %opts = (o=>33,); getopts('o:', \%opts); my $summary = qq( Summary: fastq_qual_stats.pl Options: -o offset for the quality scores (default: 33, for certain types of Illumina pipelines it can be 64, then this option must be set) Important: Bases and quality sequences must be only ONE line each (i.e. four lines containing the information for each read). \n); die($summary) if ($opts{"o"} =~/\D/); ## only numeric my %positions = (); ## reference to an anonymous array, key is position, ## values are sum of quality scores for all reads, number of counted reads with base at this position, ## number of reads with bases in each of the 6 bins (see above) my @output_stats; ## array for output stats columns are base position, mean quality and no. of bases and % of bases for each bin, ## lines are base position starting with base 1. my $outfile; if ( (@ARGV == 1) && (-e $ARGV[0]) ) { print "\ndetermine quality scores of reads in file $ARGV[0]:\n quality score offset $opts{'o'}\n\n"; $outfile = $ARGV[0] . ".qualstats"; open INPUTFILE, "< $ARGV[0]" or die "Kann Datei nicht oeffnen: $!"; my $anzahl_reads = 0; while () { # parses through lines of inputfile my $line1 = $_; if ($line1 !~ /^@/) { print "first line of read does not start with @ at read $anzahl_reads\n"; } $_ = ; my $line2 = $_; $_ = ; my $line3 = $_; $_ = ; my $line4 = $_; $anzahl_reads += 1; my @qual = &get_qual($line4); my $n = 0; ## counter for the number of bases in the read while (@qual) { my $qual = shift(@qual); $n++; unless (exists $positions{$n}) { ## if this is the first time that this base position is encountered, initialize hash element $positions{$n} = [0, 0, 0, 0, 0, 0, 0, 0]; } $positions{$n}->[0] += $qual; ## first element of array is sum of quality scores $positions{$n}->[1] += 1; ## second element of array is the number of counted bases at this position if ($qual >= 50) { ## elements 2-7 of array are number of reads for the six bins $positions{$n}->[7] += 1; } elsif ($qual >= 40) { $positions{$n}->[6] += 1; } elsif ($qual >= 30) { $positions{$n}->[5] += 1; } elsif ($qual >= 20) { $positions{$n}->[4] += 1; } elsif ($qual >= 10) { $positions{$n}->[3] += 1; } else { $positions{$n}->[2] += 1; } } } ## now prepare output information push(@output_stats, "base position\ttotal number of bases\tmean quality score\tbin 0-9 bases\tbin 0-9 % of bases\tbin 10-19 bases\tbin 10-19 % of bases\tbin 20-29 bases\tbin 20-29 % of bases\tbin 30-39 bases\tbin 30-39 % of bases\tbin 40-49 bases\tbin 40-49 % of bases\tbin >50 bases\tbin >50 % of bases\n"); foreach my $position (sort { $a <=> $b } keys %positions) { my ($qual_sum, $total_bases, $bin1, $bin2, $bin3, $bin4, $bin5, $bin6) = @{$positions{$position}}; my $mean = sprintf "%.1f", $qual_sum / $total_bases; ## sum of quality scores / number of bases my $percent1 = sprintf "%.1f", $bin1 * 100 / $total_bases; my $percent2 = sprintf "%.1f", $bin2 * 100 / $total_bases; my $percent3 = sprintf "%.1f", $bin3 * 100 / $total_bases; my $percent4 = sprintf "%.1f", $bin4 * 100 / $total_bases; my $percent5 = sprintf "%.1f", $bin5 * 100 / $total_bases; my $percent6 = sprintf "%.1f", $bin6 * 100 / $total_bases; push(@output_stats, "$position\t$total_bases\t$mean\t$bin1\t$percent1\t$bin2\t$percent2\t$bin3\t$percent3\t$bin4\t$percent4\t$bin5\t$percent5\t$bin6\t$percent6\n"); } open OUTPUT, "> $outfile" or die "Kann Datei nicht oeffnen: $!"; print OUTPUT @output_stats; close OUTPUT; print "Parsed $anzahl_reads reads in file $ARGV[0]\n"; } else { die($summary); } sub get_qual { ## returns array with numeric quality scores my $line4 = shift(@_); $line4 =~ s/\s+$//; # remove all trailing whitespace, end of line, tabs etc. my @qual; my $laenge = length($line4); $laenge = $laenge - 1; for (my $i=0; $i<=$laenge; $i++) { my $score = substr($line4, $i, 1); my $quality = ord($score) - $opts{"o"}; push(@qual, $quality); } return @qual; }