#!/usr/local/bin/perl &learn_genetic_code; open(OUT,">./infile.nuc")||die "cannot open outfile\n"; open(CLUSTAL,"CD2_mod.aln")||die "cannot open protein infile\n"; open(RAWDNA,"CD2.cds")||die "cannot open DNA infile\n"; $firstline=.; if ($firstline =~ /Len=/){ my @words = split /Len=/, $firstline; $size = @words[1]; $dnasize = ($size * 3); } while (){ chop; ($species,$seq)=split; $protein_alignment{$species} .= $seq; } $speciescounter=0; while (){ chop; if (/>/){ $count = ++$speciescounter; ($name,$trash)=split; $name=substr($name,1); }else{ $rawdna{$name} .= $_; } } print OUT "$count $dnasize\n"; foreach (sort keys %protein_alignment){ $species=$_; next unless ($species =~ /\w/); $protein=$protein_alignment{$species}; $rawdna=$rawdna{$species}; $rawdna =~ s/[^A-Za-z]//g; $protein =~ s/[^A-Za-z\-]//g; #print("species: ",$species," ",length($protein)," ",length($rawdna),"\n"); $outdna=""; $counter=0; for ($i=0; $i < length($protein); $i++){ if (substr($protein,$i,1) eq "-"){ $outdna .= "---"; }else{ $codon = substr($rawdna,$counter,3); #########put in to account for ESTSCan codons <3bases ############## if (length($codon) eq 2){ print "$codon\n"; $codon = ($codon . 'x'); print "$codon\n"; } elsif (length($codon) eq 1){ print "$codon\n"; $codon = ($codon . 'xx'); print "$codon\n"; } ####################################################################### $outdna .= $codon; $outdna =~ tr/a-z/A-Z/; $outdna =~ tr/X/N/; $counter += 3; #check translation: $aa = substr($protein,$i,1); $codon =~ tr/A-Z/a-z/; if ($aa{$codon} ne $aa){ print "$codon translated as $aa in $species\n"; } } } print OUT "$species\n"; for ($j=0; $j <= length($outdna); $j += 60){ print OUT (substr($outdna,$j,60),"\n"); } } %rawdna = ""; %protein_alignment = ""; close (CLUSTAL)||die "cannot close CLUSTAL\n"; close (RAWDNA)||die "cannot close RAWDNA\n"; close (OUT)||die "cannot close OUT\n"; ####### sub learn_genetic_code{ $long=" ttt F Phe:tct S Ser:tat Y Tyr:tgt C Cys: ttc F Phe:tcc S Ser:tac Y Tyr:tgc C Cys: tta L Leu:tca S Ser:taa * ter:tga * ter: ttg L Leu:tcg S Ser:tag * ter:tgg W Trp: ctt L Leu:cct P Pro:cat H His:cgt R Arg: ctc L Leu:ccc P Pro:cac H His:cgc R Arg: cta L Leu:cca P Pro:caa Q Gln:cga R Arg: ctg L Leu:ccg P Pro:cag Q Gln:cgg R Arg: att I Ile:act T Thr:aat N Asn:agt S Ser: atc I Ile:acc T Thr:aac N Asn:agc S Ser: ata I Ile:aca T Thr:aaa K Lys:aga R Arg: atg M Met:acg T Thr:aag K Lys:agg R Arg: gtt V Val:gct A Ala:gat D Asp:ggt G Gly: gtc V Val:gcc A Ala:gac D Asp:ggc G Gly: gta V Val:gca A Ala:gaa E Glu:gga G Gly: gtg V Val:gcg A Ala:gag E Glu:ggg G Gly: gxx X X:axx X X:txx X X:cxx X X: xxg X X:xxa X X:xxt X X:xxc X X: gax X X:gcx A ALA:gtx V VAL:ggx G GLY: aax X X:acx T THR:agx X X:atx X X: cax X X:ccx X X:cgx R ARG:ctx L LEU: tax X X:tcx S SER:tgx X X:ttx X X: tgy C CYS:gay D ASP:gar E GLU:TTY P PHE: cay H HIS:ath I ILE:aar K LYS:aay N ASN: ccx P PRO:car Q Gln:agr R ARG:agy S SER: tay Y TYR: "; @trios=split(/:/,$long); foreach (@trios){ ($codon,$one,$three)=split; $aa{$codon}=$one; $three{$codon}=$three; } $aa{"---"}="-"; #learn degeneracies of amino acids (one-letter-code) (useful for RSCU): foreach (@codons){$degeneracy{$aa{$_}}++;} }#return ###################################################