#!/usr/bin/perl -w

# The program trim_fastq.pl takes fastq files of single or paired-end reads and trims low-quality bases from the 3' and 5' end.
# It also removes reads with remaining (internal) N after trimming.
# Copyright (C) 2014 Minou Nowrousian

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details (http://www.gnu.org/licenses/).


use strict;
use warnings;
use Getopt::Std;

my %opts = (a=>"x", o=>33, l=>40, q=>10, r=>"trim_fastq");
getopts('a:o:l:q:r:', \%opts);


my $summary = qq(
Summary:   trim_fastq.pl <options> <file(s)>

Options: -a type of analysis: 
             s for single reads
             t for paired-end reads with pairs in two files
             p for paired-end reads with pairs in one file
         -o offset for the quality scores (default: 33, 
            for certain types of Illumina pipelines 
            it can be 64, then this option must be set)
         -l minimum length of read that is kept (default: 40)
         -q minimum base quality that must be reached at 
            the 5' and 3' ends for the trimming to stop (default: 10)
         -r core name for results file(s). For paired-ends, 
            will be extended by _pairs.fastq, for single-ends
            by _singlets.fastq, default trim_fastq.
            Must contain only letters, numbers, - or _

Important: Bases and quality sequences must be only ONE line each (i.e. four lines containing the information for each read).
\n);

die($summary) if ($opts{"a"} =~/[^spt]/);  ## only the options s, p, or t are ok for type of analysis
die($summary) if ($opts{"o"} =~/\D/);      ## only numeric
die($summary) if ($opts{"l"} =~/\D/);
die($summary) if ($opts{"q"} =~/\D/);
die($summary) if ($opts{"r"} =~/[^\w_]/);      ## only allowed letters, numbers, - and _ for file name

my %number_of_reads;  ## hash for read stats (how many reads of which length were kept)
my @output_stats;     ## array for output stats (input and output number of reads)
my $outfile_paired = $opts{"r"} . "_pairs.fastq";
my $outfile_single = $opts{"r"} . "_singlets.fastq";

if ( ($opts{"a"} eq "s") && (@ARGV == 1) && (-e $ARGV[0]) ) {

	print "\ntrimming of single reads in file $ARGV[0]:\n  quality score offset $opts{'o'}\n  minimum length $opts{'l'}\n  minimum quality $opts{'q'} at 3' end\n\n";
	&single_reads;
	
} elsif ( ($opts{"a"} eq "p") && (@ARGV == 1) && (-e $ARGV[0]) ) {
	
	print "\ntrimming of paired-end reads in file $ARGV[0]:\n  quality score offset $opts{'o'}\n  minimum length $opts{'l'}\n  minimum quality $opts{'q'} at 3' end\n\n";
	&paired_reads_one_file;
	
} elsif ( ($opts{"a"} eq "t") && (@ARGV == 2) && (-e $ARGV[0]) && (-e $ARGV[1]) ) {

	print "\ntrimming of paired-end reads in files $ARGV[0] and $ARGV[1]:\n  quality score offset $opts{'o'}\n  minimum length $opts{'l'}\n  minimum quality $opts{'q'} at 3' end\n\n";
	&paired_reads_two_files;
	
} else {

	die($summary);
	
}

push(@output_stats, "\nlength of read\tnumber of reads\n");

foreach my $schluessel (sort { $a <=> $b } keys %number_of_reads) {
	push(@output_stats, "$schluessel\t$number_of_reads{$schluessel}\n");
}

my $outstats = $opts{"r"} . "_output_stats.txt";
open STATS, "> $outstats" or die "Kann Datei nicht oeffnen: $!";
print STATS @output_stats;
close STATS;

sub single_reads {

	my $anzahl_reads = 0;
	my $reads_kept = 0;

	open INPUTFILE, "< $ARGV[0]" or die "Kann Datei nicht oeffnen: $!";
	open OUTPUTSINGLE, "> $outfile_single" or die "Kann Datei nicht oeffnen: $!";


	while (<INPUTFILE>) {                         # parses through lines of inputfile

		my $line1 = $_;

		if ($line1 !~ /^@/) {
			print "first line of read does not start with @ at read $anzahl_reads\n";
		}

		$_ = <INPUTFILE>;
		my $line2 = $_;
		$_ = <INPUTFILE>;
		my $line3 = $_;
		$_ = <INPUTFILE>;
		my $line4 = $_;
		
		$anzahl_reads += 1;
	
		my @read = &trim_read($line1, $line2, $line4);
	
		my $read1_ok = $read[0];
	
		if ($read1_ok ne "badread") {

			my $laenge = pop(@read);
		
			print OUTPUTSINGLE @read;
		
			$number_of_reads{$laenge} += 1;
			
			$reads_kept += 1;
		
		} else {
		}
	}
	
	close INPUTFILE;
	close OUTPUTSINGLE;
	
	@output_stats = "Anzahl der reads im Input-File: $anzahl_reads\n\nAnzahl der reads im cleaned file: $reads_kept\n\n";
	
}


sub paired_reads_two_files {

	my $anzahl_reads = 0;
	my $reads_kept_paired = 0;
	my $reads_kept_single = 0;

	open INPUTFILE1, "< $ARGV[0]" or die "Kann Datei nicht oeffnen: $!";
	open INPUTFILE2, "< $ARGV[1]" or die "Kann Datei nicht oeffnen: $!";
	open OUTPUTPAIRED, "> $outfile_paired" or die "Kann Datei nicht oeffnen: $!";
	open OUTPUTSINGLE, "> $outfile_single" or die "Kann Datei nicht oeffnen: $!";


	while (<INPUTFILE1>) {                         # parses through lines of inputfile

		my $line1 = $_;

		if ($line1 !~ /^@/) {
			print "first line of read does not start with @ at read $anzahl_reads\n";
		}

		$_ = <INPUTFILE1>;
		my $line2 = $_;
		$_ = <INPUTFILE1>;
		my $line3 = $_;
		$_ = <INPUTFILE1>;
		my $line4 = $_;
		
		$_ = <INPUTFILE2>;
		my $line5 = $_;

		if ($line5 !~ /^@/) {
			print "first line of read does not start with @ at read $anzahl_reads\n";
		}

		$_ = <INPUTFILE2>;
		my $line6 = $_;
		$_ = <INPUTFILE2>;
		my $line7 = $_;
		$_ = <INPUTFILE2>;
		my $line8 = $_;

		$anzahl_reads += 2;
	
		my @read1 = &trim_read($line1, $line2, $line4);
		my @read2 = &trim_read($line5, $line6, $line8);
	
		my $read1_ok = $read1[0];
		my $read2_ok = $read2[0];
	
		if ( ($read1_ok ne "badread") && ($read2_ok ne "badread") ) {

			my $laenge1 = pop(@read1);
			my $laenge2 = pop(@read2);
			
			print OUTPUTPAIRED @read1, @read2;
	
			$number_of_reads{$laenge1} += 1;
			$number_of_reads{$laenge2} += 1;
			
			$reads_kept_paired += 2;
			
		} elsif ($read1_ok ne "badread") {

			my $laenge = pop(@read1);
		
			print OUTPUTSINGLE @read1;
		
			$number_of_reads{$laenge} += 1;
			
			$reads_kept_single += 1;
	
		} elsif ($read2_ok ne "badread") {

			my $laenge = pop(@read2);
			print OUTPUTSINGLE @read2;
		
			$number_of_reads{$laenge} += 1;
			
			$reads_kept_single += 1;
		
		} else {
		}
	}
	
	close INPUTFILE1;
	close INPUTFILE2;
	close OUTPUTPAIRED;
	close OUTPUTSINGLE;
	
	@output_stats = "Anzahl der reads im Input-File: $anzahl_reads\n\nAnzahl der reads im Paired file: $reads_kept_paired\n\nAnzahl der reads im single file: $reads_kept_single\n";
	
}

sub paired_reads_one_file {

	my $anzahl_reads = 0;
	my $reads_kept_paired = 0;
	my $reads_kept_single = 0;

	open INPUTFILE, "< $ARGV[0]" or die "Kann Datei nicht oeffnen: $!";
	open OUTPUTPAIRED, "> $outfile_paired" or die "Kann Datei nicht oeffnen: $!";
	open OUTPUTSINGLE, "> $outfile_single" or die "Kann Datei nicht oeffnen: $!";


	while (<INPUTFILE>) {                         # parses through lines of inputfile

		my $line1 = $_;

		if ($line1 !~ /^@/) {
			print "first line of read does not start with @ at read $anzahl_reads\n";
		}

		$_ = <INPUTFILE>;
		my $line2 = $_;
		$_ = <INPUTFILE>;
		my $line3 = $_;
		$_ = <INPUTFILE>;
		my $line4 = $_;
		$_ = <INPUTFILE>;
		my $line5 = $_;

		if ($line5 !~ /^@/) {
			print "first line of read does not start with @ at read $anzahl_reads\n";
		}

		$_ = <INPUTFILE>;
		my $line6 = $_;
		$_ = <INPUTFILE>;
		my $line7 = $_;
		$_ = <INPUTFILE>;
		my $line8 = $_;

		$anzahl_reads += 2;
	
		my @read1 = &trim_read($line1, $line2, $line4);
		my @read2 = &trim_read($line5, $line6, $line8);
	
		my $read1_ok = $read1[0];
		my $read2_ok = $read2[0];
	
		if ( ($read1_ok ne "badread") && ($read2_ok ne "badread") ) {

			my $laenge1 = pop(@read1);
			my $laenge2 = pop(@read2);
			
			print OUTPUTPAIRED @read1, @read2;
	
			$number_of_reads{$laenge1} += 1;
			$number_of_reads{$laenge2} += 1;
			
			$reads_kept_paired += 2;
			
		} elsif ($read1_ok ne "badread") {

			my $laenge = pop(@read1);
		
			print OUTPUTSINGLE @read1;
		
			$number_of_reads{$laenge} += 1;
			
			$reads_kept_single += 1;
	
		} elsif ($read2_ok ne "badread") {

			my $laenge = pop(@read2);
			print OUTPUTSINGLE @read2;
		
			$number_of_reads{$laenge} += 1;
			
			$reads_kept_single += 1;
		
		} else {
		}
	}
	
	close INPUTFILE;
	close OUTPUTPAIRED;
	close OUTPUTSINGLE;
	
	@output_stats = "Anzahl der reads im Input-File: $anzahl_reads\n\nAnzahl der reads im Paired file: $reads_kept_paired\n\nAnzahl der reads im single file: $reads_kept_single\n";
	
}


sub trim_read {

	## returns array with the four lines for the fastq definition and the last element the length of the remaining read
	## or "badread" if read too short or still contains (internal) "N" after trimming

	my ($line1, $line2, $line4) = @_;
	
	my @trimmed = "badread";

	$line1 =~ s/\s+$//;                   # remove all trailing whitespace, end of line, tabs etc.
	$line2 =~ s/\s+$//;
	$line4 =~ s/\s+$//;
		
	my $laenge = length($line4);
	my $score = substr($line4, 0, 1);
	my $quality = ord($score) - $opts{"o"};

	## remove bad quality bases from 5' end
	until ( ($quality >= $opts{"q"}) || ($laenge < $opts{"l"}) ) {

		$line4 =~ s/^.//;  ## remove first quality score
		$score = substr($line4, 0, 1);
		$quality = ord($score) - $opts{"o"};
		$laenge = length($line4);
			
	}
		
	if ($laenge >= $opts{"l"}) {
	
		$line2 = substr($line2, - $laenge, $laenge);  ## start counting from end of string, i.e. remove bases with bad 5' quality scores

		$score = substr($line4, -1, 1);
		$quality = ord($score) - $opts{"o"};
		
		## remove bad quality bases from 3' end
		until ( ($quality >= $opts{"q"}) || ($laenge < $opts{"l"}) ) {

			$line4 =~ s/.$//;  ## remove last quality score
			$score = substr($line4, -1, 1);
			$quality = ord($score) - $opts{"o"};
			$laenge = length($line4);
			
		}
		
		$line2 = substr($line2, 0, $laenge);

		if ( ($laenge >= $opts{"l"}) && ($line2 !~ /N/i) ) { ## don't use reads with internal N
			$line1 .= "\n";
			$line2 .= "\n";
			$line4 .= "\n";
			@trimmed = ($line1, $line2, "+\n", $line4, $laenge);
		}
				
	}
	
	return @trimmed;
	
}