#!/usr/bin/perl -w #A Glycine codon usage calculator. I have this version print out a #lot of things along the way. You could easily shorten it. You should add #an extra few lines to calculate percentages of use of the various #glycine codons here, both locally (i.e., within each ORF) and globally #(i.e., taking all the ORF's in the input data together at once). print "\nCalculating Glycine codon usage:\n\n"; print "Please enter DNA data file with no markings:\n"; $dnadataname = ; chomp $dnadataname; unless(open(FILE,$dnadataname)){ print "Trouble opening DNA data file\.\n\n"; exit; } @DNA = ; close FILE; $longstring = join('',@DNA); $longstring =~ s/\n//g; $longstring =~ s/\s//g; print "\nORF's in the original sequence:\n\n"; @nucleotides = split('',$longstring); #As we already saw, this is how you open an output file. Don't forget #the " "'s or > in the open(STOREDORFS, ">$outputfile)! $outputfile = "storedorfs"; unless(open(STOREDORFS, ">$outputfile")){ print "\n Cannot open file \"$outputfile\"\n"; exit; } $location = 0; $ATG = 0; $ORF = 0; while($location < @nucleotides - 2) {$a = $nucleotides[$location].$nucleotides[$location + 1].$nucleotides[$location + 2]; ++$location; if($a =~ /ATG/i) { $b = $location; print "ATG found at position $location\n"; ++$ATG; while($b < @nucleotides - 1){ $c = $nucleotides[$b-1].$nucleotides[$b].$nucleotides[$b+1]; if($c =~ /TAA/i){ ++$ORF; print "TAA found at position $b\.\n"; $b = $b+2; print "ORF number $ORF from position $location to $b\.\n"; $string = substr($longstring, $location-1, $b - $location + 1); print STOREDORFS ">ORF number $ORF:\n$string\n\n"; $b = @nucleotides; }elsif($c =~ /TAG/i){ ++$ORF; print "TAG found at position $b\.\n"; $b = $b+2; print "ORF number $ORF from position $location to $b\.\n\n"; $string = substr($longstring, $location-1, $b - $location + 1); print STOREDORFS ">ORF number $ORF:\n$string\n\n"; $b = @nucleotides; }elsif($c =~ /TGA/i){ ++$ORF; print "TGA found at position $b\.\n"; $b = $b+2; print "ORF number $ORF from position $location to $b\.\n\n"; $string = substr($longstring, $location-1, $b - $location + 1); print STOREDORFS ">ORF number $ORF:\n$string\n\n"; $b = @nucleotides; }$b = $b + 3; } }$location = $location + 2; } print "\nTotal number of ORF\'s found in the original sequence = $ORF\.\n\n"; close STOREDORFS; $odata = "storedorfs"; open(ODATA,$odata); @orfdata = ; print @orfdata,"\n\n"; close ODATA; #The first new piece is housekeeping: the output file "storedorfs" looked good #when we printed it out, but we need to get rid of all the "\n"'s. #@redodata = ''; #Stands for "reduced orf data". A foreach loop creates a variable #(here $line) which runs over all the arrayed strings inside an array (here @odata). #The loop here is just getting rid of "\n"'s and blank lines left in the file. @reodata = ''; #Since these are ORF's we know that the useful lines all begin with #'A', which we can use for filtering purposes. foreach $line (@orfdata){ if($line =~ /A/){ chomp $line; @redodata = (@redodata, $line); } } #Now we go back to calling our cleaned up file '@orfdata'. You could #save the original @orfdata if you want to have a copy of the data #formatted in a special way, for example, some kind of FastA arrangement. @orfdata = @redodata; #You could use either of these to double check that we have a clean @orfdata now. #print $#orfdata,"\n"; #print scalar(@orfdata),"\n\n\n"; #Another double check is to compare this print out to the earlier print #out of the array. It uses another way to loop through a list like an array: in Perl, #the expression $#orfdata means the hghest list number of the strings in @orfdata. #$_ is the standard Perl defalt variable inside a loop. for(0..$#orfdata){ print $orfdata[$_],"\n\n"; } #Next we set some global counters for the four types of glycine codons: GG*. #We will keep track of a lot of things, so we will have local versions of these #counters within each ORF, too. $GGA = 0; $GGC = 0; $GGG = 0; $GGT = 0; $Gly = 0; $o = 0; #This will label the ORF's. foreach $line (@orfdata){ print length($line),"\n\n"; } foreach $string (@orfdata){ ++$o; #Setting local counters. Adding 'my' to a declaration is a way to guarantee #they don't get used outside this loop. my $gga = 0; my $ggc = 0; my $ggg = 0; my $ggt = 0; my $gly = 0; my $p = 0; while($p < length($string) - 2){ my $codon = substr($string, $p, 3); if($codon =~ /GGA/){ ++$gga; ++$gly; ++$GGA; ++$Gly; $q = $p + 1; print $codon,"($q)\n"; }elsif($codon =~ /GGC/){ ++$ggc; ++$gly; ++$GGC; ++$Gly; $q = $p + 1; print $codon,"($q)\n"; }elsif($codon =~ /GGG/){ ++$ggg; ++$gly; ++$GGG; ++$Gly; $q = $p + 1; print $codon,"($q)\n"; }elsif($codon =~ /GGT/){ ++$ggt; ++$gly; ++$GGT; ++$Gly; $q = $p + 1; print $codon,"($q)\n"; }$p = $p + 3; } #We have finished our counts for this ORF. We compute the codon biases within this ORF: #This time we will report the numbers and percentages found for each ORF. The if-else #is to guarantee we don't bother the computer by trying to divide by zero. print "In ORF $o, there are $gly glycine codons.\n"; print "Of these, there are $gga GGA codons;\n"; print "There are $ggc GGC codons;\n"; print "There are $ggg GGG codons;\n"; print "There are $ggt GGT codons.\n\n"; } #Now we can print out the global statistics: print "\nThe total number of Glycine codons within all $o ORF's = $Gly\.\n"; print "\nThere are $GGA GGA codons\.\n"; print "\nThere are $GGC GGC codons\.\n"; print "\nThere are $GGG GGG codons\.\n"; print "\nThere are $GGT GGT codons\.\n\n"; exit;