#!/usr/bin/perl -w # The program fasta_stats_N50.pl calculates some basic stats (GC content, total length, number of gaps etc.) from a file with multiple nucleic acid sequences in fasta format (e.g. genomic contigs or scaffolds). # Copyright (C) 2012 Minou Nowrousian # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details (http://www.gnu.org/licenses/). use strict; use warnings; unless (@ARGV == 1) { die("program usage: fasta_stats_N50.pl inputfile.fasta\n"); } unless (-e $ARGV[0]) { die("Can't open file $ARGV[0]: $!"); } open INPUTFILE, "< $ARGV[0]" or die "Kann Datei nicht oeffnen: $!"; my $header_found = 0; ## set variable to indicate that fasta header line (begins with >) was found my $header; ## contains the header line of the current fasta sequence my $seq; my $z = 0; ## line counter my $gc = 0; ## total gc content of all reads my $total_length = 0; ## length of all reads my $number_all_gaps = 0; ## number of gaps of one or more undefined bases ("N" or "n") in all sequences my $length_all_gaps = 0; ## number of undefined bases ("N" or "n") in all sequences (add up length of gaps) my @lengths; ## array with the lengths of all fasta sequences my @stats = "fasta header\tlength of sequence\tgc content in percent\tnumber of gaps\tlength of gaps\tpercent of gaps\n"; while () { # parses through lines of inputfile my $line = $_; $z++; $line =~ s/\s+$//; # remove all trailing whitespace, end of line, tabs etc. if ($line =~ /^>/) { if (defined($seq)) { my ($length, $gc_seq, $gaps, $length_gaps, $percent_gaps) = &stats($seq); push(@stats, "$header\t$length\t$gc_seq\t$gaps\t$length_gaps\t$percent_gaps\n"); } $header = $line; $header =~ s/^>//; $header_found = 1; $seq = undef; } elsif ($line =~ /[^acgtnACGTN]/) { print "unexpected character (not a, c, g, t, n, A, C, G, T, N) in line $z, analysis will proceed anyway\n"; $seq .= $line; } else { $seq .= $line; } } ## process data for last sequence if (defined($seq)) { my ($length, $gc_seq, $gaps, $length_gaps, $percent_gaps) = &stats($seq); push(@stats, "$header\t$length\t$gc_seq\t$gaps\t$length_gaps\t$percent_gaps\n"); } ## calculate N50 my @sorted_lengths = sort { $b <=> $a } @lengths; ## sort length of fasta sequences in reverse order (biggest first) my $n = @sorted_lengths; my $N50 = 0; my $addup = 0; my $halfsize = $total_length / 2; until ( ($addup >= $halfsize) || ($n == 0) ) { $N50 = shift(@sorted_lengths); $addup += $N50; $n = @sorted_lengths; } ## now put stats for all sequences in array for output my $number_seqs = @lengths; my @output = ("number of sequences: ", $number_seqs, "\n\n"); my $gc_content = sprintf "%.1f", ($gc/$total_length)*100; push(@output, "GC content in percent\t" . $gc_content . "\n\n"); push(@output, "Total length of all sequences in bases\t" . $total_length . "\n\n"); push(@output, "N50 in bases\t" . $N50 . "\n\n"); push(@output, "number of gaps\t" . $number_all_gaps . "\n\n"); push(@output, "Total length of all gaps in bases\t" . $length_all_gaps . "\n\n"); my $percent_gaps = sprintf "%.1f", $length_all_gaps * 100 / $total_length; push(@output, "Total gap length in percent\t" . $percent_gaps . "\n\n"); ## now add stats for individual sequences to output and save output file push(@output, @stats); my $filename = $ARGV[0] . "_stats.txt"; open NEU, "> $filename" or die "Kann Datei nicht oeffnen: $!"; print NEU @output; close NEU; sub stats { my $seq = shift(@_); my $length = length($seq); $total_length += $length; push(@lengths, $length); ## determine GC content my $gc_seq = 0; while ($seq =~ /[CG]/ig) { $gc_seq += 1; } $gc += $gc_seq; $gc_seq = sprintf "%.1f", $gc_seq * 100 / $length; ## GC content in percent ## determine gaps my $gaps = 0; my $length_gaps = 0; my $percent_gaps = 0; if ($seq =~ /NN*/i) { my @partialseqs = split /NN*/i, $seq; $gaps = @partialseqs - 1; my $restseq = join "", @partialseqs; my $restlength = length($restseq); $length_gaps = $length - $restlength; $percent_gaps = sprintf "%.1f", ($length_gaps/$length)*100; } $number_all_gaps += $gaps; $length_all_gaps += $length_gaps; return ($length, $gc_seq, $gaps, $length_gaps, $percent_gaps); }