#!/usr/bin/perl -w

# The program fastq_qual_stats.pl takes fastq files and calculates summary statistics for the quality scores.
# For each base position, the mean quality score is calculated and the quality scores are sorted in 6 bins 
# from 0-9, 10-19, 20-29, 30-39, 40-49, >=50. The number of bases and % of total bases at that position in each bin is given as output.
# Copyright (C) 2012 Minou Nowrousian

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details (http://www.gnu.org/licenses/).


use strict;
use warnings;
use Getopt::Std;

my %opts = (o=>33,);
getopts('o:', \%opts);


my $summary = qq(
Summary:   fastq_qual_stats.pl <options> <file>

Options: -o offset for the quality scores (default: 33, 
            for certain types of Illumina pipelines 
            it can be 64, then this option must be set)

Important: Bases and quality sequences must be only ONE line each (i.e. four lines containing the information for each read).
\n);

die($summary) if ($opts{"o"} =~/\D/);      ## only numeric

my %positions = (); ## reference to an anonymous array, key is position, 
                    ## values are sum of quality scores for all reads, number of counted reads with base at this position,
					## number of reads with bases in each of the 6 bins (see above)
my @output_stats;   ## array for output stats columns are base position, mean quality and no. of bases and % of bases for each bin,
                    ## lines are base position starting with base 1.
my $outfile;

if ( (@ARGV == 1) && (-e $ARGV[0]) ) {

	print "\ndetermine quality scores of reads in file $ARGV[0]:\n  quality score offset $opts{'o'}\n\n";
	$outfile = $ARGV[0] . ".qualstats";
	
	open INPUTFILE, "< $ARGV[0]" or die "Kann Datei nicht oeffnen: $!";
	
	my $anzahl_reads = 0;

	while (<INPUTFILE>) {                         # parses through lines of inputfile

		my $line1 = $_;

		if ($line1 !~ /^@/) {
			print "first line of read does not start with @ at read $anzahl_reads\n";
		}

		$_ = <INPUTFILE>;
		my $line2 = $_;
		$_ = <INPUTFILE>;
		my $line3 = $_;
		$_ = <INPUTFILE>;
		my $line4 = $_;
		
		$anzahl_reads += 1;
	
		my @qual = &get_qual($line4);
		
		my $n = 0;  ## counter for the number of bases in the read
		
		while (@qual) {
			
			my $qual = shift(@qual);
			$n++;
			
			unless (exists $positions{$n}) { ## if this is the first time that this base position is encountered, initialize hash element
				$positions{$n} = [0, 0, 0, 0, 0, 0, 0, 0];
			}
			
			$positions{$n}->[0] += $qual;  ## first element of array is sum of quality scores
			$positions{$n}->[1] += 1;      ## second element of array is the number of counted bases at this position
			if ($qual >= 50) {             ## elements 2-7 of array are number of reads for the six bins
				$positions{$n}->[7] += 1;
			} elsif ($qual >= 40) {
				$positions{$n}->[6] += 1;
			} elsif ($qual >= 30) {
				$positions{$n}->[5] += 1;
			} elsif ($qual >= 20) {
				$positions{$n}->[4] += 1;
			} elsif ($qual >= 10) {
				$positions{$n}->[3] += 1;
			} else {
				$positions{$n}->[2] += 1;
			}
			
		}			
		
	}
	
	## now prepare output information
	
	push(@output_stats, "base position\ttotal number of bases\tmean quality score\tbin 0-9 bases\tbin 0-9 % of bases\tbin 10-19 bases\tbin 10-19 % of bases\tbin 20-29 bases\tbin 20-29 % of bases\tbin 30-39 bases\tbin 30-39 % of bases\tbin 40-49 bases\tbin 40-49 % of bases\tbin >50 bases\tbin >50 % of bases\n");
	
	foreach my $position (sort { $a <=> $b } keys %positions) {
	
		my ($qual_sum, $total_bases, $bin1, $bin2, $bin3, $bin4, $bin5, $bin6) = @{$positions{$position}};
		my $mean = sprintf "%.1f", $qual_sum / $total_bases;  ## sum of quality scores / number of bases
		my $percent1 = sprintf "%.1f", $bin1 * 100 / $total_bases;
		my $percent2 = sprintf "%.1f", $bin2 * 100 / $total_bases;
		my $percent3 = sprintf "%.1f", $bin3 * 100 / $total_bases;
		my $percent4 = sprintf "%.1f", $bin4 * 100 / $total_bases;
		my $percent5 = sprintf "%.1f", $bin5 * 100 / $total_bases;
		my $percent6 = sprintf "%.1f", $bin6 * 100 / $total_bases;
		
		push(@output_stats, "$position\t$total_bases\t$mean\t$bin1\t$percent1\t$bin2\t$percent2\t$bin3\t$percent3\t$bin4\t$percent4\t$bin5\t$percent5\t$bin6\t$percent6\n");
		
	}	
	
	open OUTPUT, "> $outfile" or die "Kann Datei nicht oeffnen: $!";
	print OUTPUT @output_stats;
	close OUTPUT;

	print "Parsed $anzahl_reads reads in file $ARGV[0]\n";
	
} else {

	die($summary);
	
}


sub get_qual {

	## returns array with numeric quality scores

	my $line4 = shift(@_);
	
	$line4 =~ s/\s+$//;                   # remove all trailing whitespace, end of line, tabs etc.
	
	my @qual;
		
	my $laenge = length($line4);
	$laenge = $laenge - 1;
		
	for (my $i=0; $i<=$laenge; $i++) {

		my $score = substr($line4, $i, 1);
		my $quality = ord($score) - $opts{"o"};
		push(@qual, $quality);
			
	}
	
	return @qual;
	
}