#!/usr/bin/perl -w # The program trim_fastq.pl takes fastq files of single or paired-end reads and trims low-quality bases from the 3' and 5' end. # It also removes reads with remaining (internal) N after trimming. # Copyright (C) 2014 Minou Nowrousian # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details (http://www.gnu.org/licenses/). use strict; use warnings; use Getopt::Std; my %opts = (a=>"x", o=>33, l=>40, q=>10, r=>"trim_fastq"); getopts('a:o:l:q:r:', \%opts); my $summary = qq( Summary: trim_fastq.pl Options: -a type of analysis: s for single reads t for paired-end reads with pairs in two files p for paired-end reads with pairs in one file -o offset for the quality scores (default: 33, for certain types of Illumina pipelines it can be 64, then this option must be set) -l minimum length of read that is kept (default: 40) -q minimum base quality that must be reached at the 5' and 3' ends for the trimming to stop (default: 10) -r core name for results file(s). For paired-ends, will be extended by _pairs.fastq, for single-ends by _singlets.fastq, default trim_fastq. Must contain only letters, numbers, - or _ Important: Bases and quality sequences must be only ONE line each (i.e. four lines containing the information for each read). \n); die($summary) if ($opts{"a"} =~/[^spt]/); ## only the options s, p, or t are ok for type of analysis die($summary) if ($opts{"o"} =~/\D/); ## only numeric die($summary) if ($opts{"l"} =~/\D/); die($summary) if ($opts{"q"} =~/\D/); die($summary) if ($opts{"r"} =~/[^\w_]/); ## only allowed letters, numbers, - and _ for file name my %number_of_reads; ## hash for read stats (how many reads of which length were kept) my @output_stats; ## array for output stats (input and output number of reads) my $outfile_paired = $opts{"r"} . "_pairs.fastq"; my $outfile_single = $opts{"r"} . "_singlets.fastq"; if ( ($opts{"a"} eq "s") && (@ARGV == 1) && (-e $ARGV[0]) ) { print "\ntrimming of single reads in file $ARGV[0]:\n quality score offset $opts{'o'}\n minimum length $opts{'l'}\n minimum quality $opts{'q'} at 3' end\n\n"; &single_reads; } elsif ( ($opts{"a"} eq "p") && (@ARGV == 1) && (-e $ARGV[0]) ) { print "\ntrimming of paired-end reads in file $ARGV[0]:\n quality score offset $opts{'o'}\n minimum length $opts{'l'}\n minimum quality $opts{'q'} at 3' end\n\n"; &paired_reads_one_file; } elsif ( ($opts{"a"} eq "t") && (@ARGV == 2) && (-e $ARGV[0]) && (-e $ARGV[1]) ) { print "\ntrimming of paired-end reads in files $ARGV[0] and $ARGV[1]:\n quality score offset $opts{'o'}\n minimum length $opts{'l'}\n minimum quality $opts{'q'} at 3' end\n\n"; &paired_reads_two_files; } else { die($summary); } push(@output_stats, "\nlength of read\tnumber of reads\n"); foreach my $schluessel (sort { $a <=> $b } keys %number_of_reads) { push(@output_stats, "$schluessel\t$number_of_reads{$schluessel}\n"); } my $outstats = $opts{"r"} . "_output_stats.txt"; open STATS, "> $outstats" or die "Kann Datei nicht oeffnen: $!"; print STATS @output_stats; close STATS; sub single_reads { my $anzahl_reads = 0; my $reads_kept = 0; open INPUTFILE, "< $ARGV[0]" or die "Kann Datei nicht oeffnen: $!"; open OUTPUTSINGLE, "> $outfile_single" or die "Kann Datei nicht oeffnen: $!"; while () { # parses through lines of inputfile my $line1 = $_; if ($line1 !~ /^@/) { print "first line of read does not start with @ at read $anzahl_reads\n"; } $_ = ; my $line2 = $_; $_ = ; my $line3 = $_; $_ = ; my $line4 = $_; $anzahl_reads += 1; my @read = &trim_read($line1, $line2, $line4); my $read1_ok = $read[0]; if ($read1_ok ne "badread") { my $laenge = pop(@read); print OUTPUTSINGLE @read; $number_of_reads{$laenge} += 1; $reads_kept += 1; } else { } } close INPUTFILE; close OUTPUTSINGLE; @output_stats = "Anzahl der reads im Input-File: $anzahl_reads\n\nAnzahl der reads im cleaned file: $reads_kept\n\n"; } sub paired_reads_two_files { my $anzahl_reads = 0; my $reads_kept_paired = 0; my $reads_kept_single = 0; open INPUTFILE1, "< $ARGV[0]" or die "Kann Datei nicht oeffnen: $!"; open INPUTFILE2, "< $ARGV[1]" or die "Kann Datei nicht oeffnen: $!"; open OUTPUTPAIRED, "> $outfile_paired" or die "Kann Datei nicht oeffnen: $!"; open OUTPUTSINGLE, "> $outfile_single" or die "Kann Datei nicht oeffnen: $!"; while () { # parses through lines of inputfile my $line1 = $_; if ($line1 !~ /^@/) { print "first line of read does not start with @ at read $anzahl_reads\n"; } $_ = ; my $line2 = $_; $_ = ; my $line3 = $_; $_ = ; my $line4 = $_; $_ = ; my $line5 = $_; if ($line5 !~ /^@/) { print "first line of read does not start with @ at read $anzahl_reads\n"; } $_ = ; my $line6 = $_; $_ = ; my $line7 = $_; $_ = ; my $line8 = $_; $anzahl_reads += 2; my @read1 = &trim_read($line1, $line2, $line4); my @read2 = &trim_read($line5, $line6, $line8); my $read1_ok = $read1[0]; my $read2_ok = $read2[0]; if ( ($read1_ok ne "badread") && ($read2_ok ne "badread") ) { my $laenge1 = pop(@read1); my $laenge2 = pop(@read2); print OUTPUTPAIRED @read1, @read2; $number_of_reads{$laenge1} += 1; $number_of_reads{$laenge2} += 1; $reads_kept_paired += 2; } elsif ($read1_ok ne "badread") { my $laenge = pop(@read1); print OUTPUTSINGLE @read1; $number_of_reads{$laenge} += 1; $reads_kept_single += 1; } elsif ($read2_ok ne "badread") { my $laenge = pop(@read2); print OUTPUTSINGLE @read2; $number_of_reads{$laenge} += 1; $reads_kept_single += 1; } else { } } close INPUTFILE1; close INPUTFILE2; close OUTPUTPAIRED; close OUTPUTSINGLE; @output_stats = "Anzahl der reads im Input-File: $anzahl_reads\n\nAnzahl der reads im Paired file: $reads_kept_paired\n\nAnzahl der reads im single file: $reads_kept_single\n"; } sub paired_reads_one_file { my $anzahl_reads = 0; my $reads_kept_paired = 0; my $reads_kept_single = 0; open INPUTFILE, "< $ARGV[0]" or die "Kann Datei nicht oeffnen: $!"; open OUTPUTPAIRED, "> $outfile_paired" or die "Kann Datei nicht oeffnen: $!"; open OUTPUTSINGLE, "> $outfile_single" or die "Kann Datei nicht oeffnen: $!"; while () { # parses through lines of inputfile my $line1 = $_; if ($line1 !~ /^@/) { print "first line of read does not start with @ at read $anzahl_reads\n"; } $_ = ; my $line2 = $_; $_ = ; my $line3 = $_; $_ = ; my $line4 = $_; $_ = ; my $line5 = $_; if ($line5 !~ /^@/) { print "first line of read does not start with @ at read $anzahl_reads\n"; } $_ = ; my $line6 = $_; $_ = ; my $line7 = $_; $_ = ; my $line8 = $_; $anzahl_reads += 2; my @read1 = &trim_read($line1, $line2, $line4); my @read2 = &trim_read($line5, $line6, $line8); my $read1_ok = $read1[0]; my $read2_ok = $read2[0]; if ( ($read1_ok ne "badread") && ($read2_ok ne "badread") ) { my $laenge1 = pop(@read1); my $laenge2 = pop(@read2); print OUTPUTPAIRED @read1, @read2; $number_of_reads{$laenge1} += 1; $number_of_reads{$laenge2} += 1; $reads_kept_paired += 2; } elsif ($read1_ok ne "badread") { my $laenge = pop(@read1); print OUTPUTSINGLE @read1; $number_of_reads{$laenge} += 1; $reads_kept_single += 1; } elsif ($read2_ok ne "badread") { my $laenge = pop(@read2); print OUTPUTSINGLE @read2; $number_of_reads{$laenge} += 1; $reads_kept_single += 1; } else { } } close INPUTFILE; close OUTPUTPAIRED; close OUTPUTSINGLE; @output_stats = "Anzahl der reads im Input-File: $anzahl_reads\n\nAnzahl der reads im Paired file: $reads_kept_paired\n\nAnzahl der reads im single file: $reads_kept_single\n"; } sub trim_read { ## returns array with the four lines for the fastq definition and the last element the length of the remaining read ## or "badread" if read too short or still contains (internal) "N" after trimming my ($line1, $line2, $line4) = @_; my @trimmed = "badread"; $line1 =~ s/\s+$//; # remove all trailing whitespace, end of line, tabs etc. $line2 =~ s/\s+$//; $line4 =~ s/\s+$//; my $laenge = length($line4); my $score = substr($line4, 0, 1); my $quality = ord($score) - $opts{"o"}; ## remove bad quality bases from 5' end until ( ($quality >= $opts{"q"}) || ($laenge < $opts{"l"}) ) { $line4 =~ s/^.//; ## remove first quality score $score = substr($line4, 0, 1); $quality = ord($score) - $opts{"o"}; $laenge = length($line4); } if ($laenge >= $opts{"l"}) { $line2 = substr($line2, - $laenge, $laenge); ## start counting from end of string, i.e. remove bases with bad 5' quality scores $score = substr($line4, -1, 1); $quality = ord($score) - $opts{"o"}; ## remove bad quality bases from 3' end until ( ($quality >= $opts{"q"}) || ($laenge < $opts{"l"}) ) { $line4 =~ s/.$//; ## remove last quality score $score = substr($line4, -1, 1); $quality = ord($score) - $opts{"o"}; $laenge = length($line4); } $line2 = substr($line2, 0, $laenge); if ( ($laenge >= $opts{"l"}) && ($line2 !~ /N/i) ) { ## don't use reads with internal N $line1 .= "\n"; $line2 .= "\n"; $line4 .= "\n"; @trimmed = ($line1, $line2, "+\n", $line4, $laenge); } } return @trimmed; }