#!/usr/bin/perl -w
# This program takes as input a file holding a string of DNA, writes the
# ORFs found in that string to the output file "storedorfs", and then reads
# that file back to report glycine codon usage (GGA, GGC, GGG, GGT) for each
# ORF and for all ORFs together.  It is an adjustment of the ORF finder from
# last week, as you should recognize.
#
# Only the first reading frame is scanned: codons are taken at positions
# 1, 4, 7, ... of the sequence.  An ORF runs from an ATG to the first
# in-frame TAA, TAG or TGA that follows it.  Matching ignores case.
# Positions shown to the user are counted from 1; inside the code the usual
# Perl convention (counting from 0) is used.
use strict;
use warnings;

# Name of the file the ORFs are stored in (and read back from).
my $ORF_FILE = 'storedorfs';

# The three stop codons, as a lookup table.
my %IS_STOP_CODON = map { $_ => 1 } qw(TAA TAG TGA);

# The four glycine codons, in the order in which they are reported.
my @GLYCINE_CODONS = qw(GGA GGC GGG GGT);

# Return the 0-based index of the first stop codon found when reading
# $sequence codon by codon starting at 0-based index $start.
# Returns undef when no complete in-frame stop codon follows.
sub _find_stop_codon {
    my ($sequence, $start) = @_;
    my $length = length $sequence;
    for (my $pos = $start; $pos + 2 < $length; $pos += 3) {
        return $pos if $IS_STOP_CODON{ uc(substr($sequence, $pos, 3)) };
    }
    return;
}

# Count the glycine codons in $orf, reading it codon by codon from its
# first character.  Returns a hash reference with one entry per glycine
# codon (GGA, GGC, GGG, GGT); codons that do not occur have the count 0.
sub _count_glycine_codons {
    my ($orf) = @_;
    my %count = map { $_ => 0 } @GLYCINE_CODONS;
    my $length = length $orf;
    for (my $pos = 0; $pos + 2 < $length; $pos += 3) {
        my $codon = uc(substr($orf, $pos, 3));
        ++$count{$codon} if exists $count{$codon};
    }
    return \%count;
}

# Return $part as a percentage of $total with one decimal place.
# A $total of 0 gives "0.0" instead of dying with a division by zero.
sub _percent {
    my ($part, $total) = @_;
    return '0.0' unless $total;
    return sprintf('%.1f', 100 * $part / $total);
}

# The program itself: ask for the DNA file, store the ORFs, then report the
# glycine codon usage.  Returns early (after printing an explanation) when
# a file cannot be opened.
sub main {
    print "\nStoring codon information:\n\n";
    print "Please enter DNA data file with no markings:\n";
    my $dna_file = <STDIN>;
    $dna_file = '' unless defined $dna_file;    # end of input: no name given
    $dna_file =~ s/\A\s+//;
    $dna_file =~ s/\s+\z//;                     # also removes the newline

    # It is a good idea when you have an interactive data entry to have
    # an unless block to explain why the program stopped, if it had to.
    # The three-argument open treats the typed name purely as a file name.
    my $dna_fh;
    unless (open $dna_fh, '<', $dna_file) {
        print "Trouble opening DNA data file.\n\n";
        return;
    }
    my $sequence = join '', <$dna_fh>;
    close $dna_fh;
    $sequence =~ s/\s//g;                       # drop newlines and blanks

    print "\nORF's in the original sequence:\n\n";

    # As we already saw, this is how you open an output file.  Don't forget
    # the '>' mode, otherwise the file is opened for reading!
    my $orf_fh;
    unless (open $orf_fh, '>', $ORF_FILE) {
        print "\n Cannot open file \"$ORF_FILE\"\n";
        return;
    }

    # This next loop is the main piece, and mainly taken from last week.
    my $orf_count = 0;
    my $length    = length $sequence;
    for (my $start = 0; $start + 2 < $length; $start += 3) {
        next unless uc(substr($sequence, $start, 3)) eq 'ATG';

        my $atg_position = $start + 1;
        print "ATG found at position $atg_position\n";

        my $stop = _find_stop_codon($sequence, $start);
        next unless defined $stop;              # ATG without a stop: no ORF

        ++$orf_count;
        my $stop_codon    = uc(substr($sequence, $stop, 3));
        my $stop_position = $stop + 1;
        my $end_position  = $stop + 3;
        print "$stop_codon found at position $stop_position.\n";
        # All three stop codons are now followed by the same blank line
        # (the TAA case used to print one newline less than TAG and TGA).
        print "ORF number $orf_count from position $atg_position to $end_position.\n\n";

        # substr($A, $B, $C) returns the substring of the string $A which
        # begins at position $B (counted from 0) and is $C characters long.
        # Here: from the A of the ATG up to and including the stop codon.
        # It is then printed to the output file as an added record.
        my $orf = substr($sequence, $start, $stop + 3 - $start);
        print {$orf_fh} ">ORF number $orf_count:\n$orf\n\n";
    }

    print "\nTotal number of ORF's found in the original sequence = $orf_count.\n\n";

    # Buffered write errors only show up when the file is closed.
    unless (close $orf_fh) {
        print "\n Cannot write file \"$ORF_FILE\"\n";
        return;
    }

    # Now let us have a look at what we have put in the file.  The output
    # file should remain as a file in the directory in which you have been
    # using Perl.
    my $stored_fh;
    unless (open $stored_fh, '<', $ORF_FILE) {
        print "\n Cannot open file \"$ORF_FILE\"\n";
        return;
    }
    my @orf_lines = <$stored_fh>;
    close $stored_fh;
    print @orf_lines;

    # Global counters:
    my %total      = map { $_ => 0 } @GLYCINE_CODONS;
    my $total_gly  = 0;
    my $orf_number = 0;

    for my $line (@orf_lines) {
        my $orf = $line;
        chomp $orf;
        next if $orf =~ />/;                    # header line of a record
        next unless $orf =~ /\S/;               # blank line ending a record

        ++$orf_number;
        print $orf, "\n\n";
        # Echo the part of the ORF that is made of complete codons.
        print substr($orf, 0, length($orf) - length($orf) % 3);

        my $count = _count_glycine_codons($orf);
        my $gly   = 0;
        for my $codon (@GLYCINE_CODONS) {
            $gly           += $count->{$codon};
            $total{$codon} += $count->{$codon};
        }
        $total_gly += $gly;

        my ($gga, $ggc, $ggg, $ggt) = @{$count}{@GLYCINE_CODONS};
        my ($pgga, $pggc, $pggg, $pggt)
            = map { _percent($_, $gly) } $gga, $ggc, $ggg, $ggt;
        print "\nORF number $orf_number contains $gly glycine codons.\n"
            . "Of these, $gga are GGA ($pgga%);\n"
            . "$ggc are GGC ($pggc%);\n"
            . "$ggg are GGG ($pggg%);\n"
            . "$ggt are GGT ($pggt%).\n\n";
    }

    my ($all_gga, $all_ggc, $all_ggg, $all_ggt) = @total{@GLYCINE_CODONS};
    my ($all_pgga, $all_pggc, $all_pggg, $all_pggt)
        = map { _percent($_, $total_gly) } $all_gga, $all_ggc, $all_ggg, $all_ggt;
    print "The overall codon usage statistics for these codons are:\n";
    print "$total_gly glycine codons, of which\n\n";
    print "$all_gga are GGA ($all_pgga%);\n";
    print "$all_ggc are GGC ($all_pggc%);\n";
    print "$all_ggg are GGG ($all_pggg%);\n";
    print "$all_ggt are GGT ($all_pggt%).\n\n";
    return;
}

# Run the program when this file is executed directly; when it is loaded
# from another file (for example by a test) only the subroutines are defined.
main() unless caller;

1;