Perl: Translating CDS Sequences into Protein

目录 Perl

将CDS翻译成蛋白的脚本cds2aa.pl

#!/usr/bin/perl
use strict;

# Both the input CDS FASTA file and the output peptide file are required;
# with only one argument there would be nowhere to write the translation.
if (scalar @ARGV < 2) {
    die "This program is used to trans cds to pep
    perl $0 <cds file> <pep>\n"
;
}

my ($cds_file, $pep_file) = @ARGV;

# Three-argument open with lexical handles: file names are never interpreted
# as open modes, and a failure is reported instead of silently ignored.
open my $in_fh,  '<', $cds_file or die "Cannot open '$cds_file' for reading: $!\n";
open my $out_fh, '>', $pep_file or die "Cannot open '$pep_file' for writing: $!\n";

{
    # Read one FASTA record per iteration by using '>' as the record
    # separator. 'local' restores the previous separator when the block ends.
    local $/ = ">";

    # Discard everything up to and including the first '>'.
    <$in_fh>;

    while (my $record = <$in_fh>) {
        chomp $record;                      # drop the trailing '>' separator
        next unless $record =~ /\S/;        # skip empty / whitespace-only records

        # First line is the header, the remainder is the (possibly wrapped) sequence.
        my ($id, $dna) = split /\n/, $record, 2;
        $dna = '' unless defined $dna;      # header without any sequence line
        $id  =~ s/\r\z//;                   # tolerate CRLF line endings
        $dna =~ s/\s+//g;                   # join wrapped lines, drop stray whitespace

        # Translate complete codons only; a trailing partial codon is ignored.
        my $protein = '';
        for (my $i = 0; $i < length($dna) - 2; $i += 3) {
            $protein .= codon2aa(substr($dna, $i, 3));
        }

        print {$out_fh} ">$id\n";
        print {$out_fh} "$protein\n";
    }
}

close $in_fh;
# Buffered write errors only surface at close, so check it.
close $out_fh or die "Cannot close '$pep_file': $!\n";

# The codon table is built once, at compile time, and shared with codon2aa
# through a private lexical. The BEGIN block guarantees the table is already
# populated even though this definition appears after the code that calls it.
BEGIN {
    my %genetic_code = (
        'TCA' => 'S',    # Serine
        'TCC' => 'S',    # Serine
        'TCG' => 'S',    # Serine
        'TCT' => 'S',    # Serine

        'TTC' => 'F',    # Phenylalanine
        'TTT' => 'F',    # Phenylalanine
        'TTA' => 'L',    # Leucine
        'TTG' => 'L',    # Leucine

        'TAC' => 'Y',    # Tyrosine
        'TAT' => 'Y',    # Tyrosine
        'TAA' => '',     # Stop
        'TAG' => '',     # Stop

        'TGC' => 'C',    # Cysteine
        'TGT' => 'C',    # Cysteine
        'TGA' => '',     # Stop
        'TGG' => 'W',    # Tryptophan

        'CTA' => 'L',    # Leucine
        'CTC' => 'L',    # Leucine
        'CTG' => 'L',    # Leucine
        'CTT' => 'L',    # Leucine

        'CCA' => 'P',    # Proline
        'CCC' => 'P',    # Proline
        'CCG' => 'P',    # Proline
        'CCT' => 'P',    # Proline

        'CAC' => 'H',    # Histidine
        'CAT' => 'H',    # Histidine
        'CAA' => 'Q',    # Glutamine
        'CAG' => 'Q',    # Glutamine

        'CGA' => 'R',    # Arginine
        'CGC' => 'R',    # Arginine
        'CGG' => 'R',    # Arginine
        'CGT' => 'R',    # Arginine

        'ATA' => 'I',    # Isoleucine
        'ATC' => 'I',    # Isoleucine
        'ATT' => 'I',    # Isoleucine
        'ATG' => 'M',    # Methionine

        'ACA' => 'T',    # Threonine
        'ACC' => 'T',    # Threonine
        'ACG' => 'T',    # Threonine
        'ACT' => 'T',    # Threonine

        'AAC' => 'N',    # Asparagine
        'AAT' => 'N',    # Asparagine
        'AAA' => 'K',    # Lysine
        'AAG' => 'K',    # Lysine

        'AGC' => 'S',    # Serine
        'AGT' => 'S',    # Serine
        'AGA' => 'R',    # Arginine
        'AGG' => 'R',    # Arginine

        'GTA' => 'V',    # Valine
        'GTC' => 'V',    # Valine
        'GTG' => 'V',    # Valine
        'GTT' => 'V',    # Valine

        'GCA' => 'A',    # Alanine
        'GCC' => 'A',    # Alanine
        'GCG' => 'A',    # Alanine
        'GCT' => 'A',    # Alanine

        'GAC' => 'D',    # Aspartic acid
        'GAT' => 'D',    # Aspartic acid
        'GAA' => 'E',    # Glutamic acid
        'GAG' => 'E',    # Glutamic acid

        'GGA' => 'G',    # Glycine
        'GGC' => 'G',    # Glycine
        'GGG' => 'G',    # Glycine
        'GGT' => 'G',    # Glycine
    );

    # codon2aa($codon)
    #
    # Translate a single three-letter DNA codon into its one-letter amino
    # acid code. Matching is case-insensitive (the codon is upper-cased).
    #
    # Returns:
    #   - the one-letter amino acid for a codon present in the table;
    #   - the empty string for the stop codons TAA, TAG and TGA, so stops
    #     are dropped from the output rather than marked;
    #   - "X" for anything not in the table (ambiguous bases, wrong length).
    sub codon2aa {
        my ($codon) = @_;
        $codon = uc $codon;

        return exists $genetic_code{$codon} ? $genetic_code{$codon} : "X";
    }
}

测试的CDS文件test.fa:

>orf00001
ATGGTACAATATAACAATAATTATCCACAAGACAATAAGGAAGAAGCTATGACGGAAAACGAACAACTATTTTGGAATAGAGTACTAGAGCTATCTCGTTCTCAAATAGCACCAGCAGCTTATGAATTTTTTGTTCTAGAGGCTAGACTCCTCAAAATTGAACATCAAACTGCAGTTATTACTTTAGATAACATTGAAATGAAAAAGCTATTCTGGGAACAAAATTTGGGGCCTGTTATCCTAACAGCTGGTTTTGAAATTTTCAATGCTGAAATTACAGCTAACTATGTCTCAAACGATTTACATTTACAAGAAACTAGTTTTTCTAACTACCAGCAATCTAGCAATGAAGTAAATACTTTACCAATTAGAAAAATCGACTCTAATCTTAAAGAGAAATATACTTTTGCTAATTTTGTTCAAGGAGATGAAAATAGATGGGCTGTTTCAGCATCAATTGCTGTAGCTGATAGTCCTGGCACGACTTATAATCCTCTATTTATCTGGGGAGGACCTGGTCTAGGAAAGACGCATCTACTAAATGCTATTGGAAATCAAGTCTTAAGAGATAATCCAAACGCGAGGGTTTTATACATCACTGCTGAGAATTTTATTAATGAATTCGTCAGTCATATTCGTTTAGATTCGATGGAAGAATTAAAAGAAAAGTTTCGCAACTTGGACTTACTCCTGATTGATGATATTCAGTCGCTTGCTAAGAAAACCTTAGGGGGGACCCAAGAGGAGTTCTTCAATACTTTCAATGCTTTACATACAAACGATAAACAAATCGTATTGACCAGTGACCGAAATCCAAATCAATTAAATGATCTAGAAGAACGTCTAGTCACGCGCTTTAGTTGGGGACTCCCAGTAAATATCACACCACCTGATTTTGAAACACGAGTTGCTATTTTAACCAATAAAATTCAAGAATATCCTTATGATTTTCCTCAAGATACCATTGAATACTTAGCAGGAGAATTTGATTCCAACGTACGTGAATTAGAAGGAGCCTTGAAAAATATTAGTCTAGTTGCTGACTTTAAGCATGCTAAAACTATTACAGTAGATATAGCTGCAGAAGCTATCAGAGCACGTAAAAATGACGGTCCTATTGTTACTGTCATTCCTATAGAAGAAATTCAAATACAGGTTGGTAAATTCTATGGCGTAACTGTAAAAGAGATAAAAGCAACTAAAAGAACACAAGATATTGTCCTTGCAAGACAGGTAGCCATGTACTTAGCTCGTGAGATGACAGATAACAGTCTCCCCAAAATAGGTAAAGAATTTGGGGGACGAGATCACTCAACTGTTCTCCACGCTTATAATAAAATAAAAAATATGGTTGCTCAAGATGACAACTTACGAATTGAGATAGAAACTATCAAAAATAAAATCAGGTAG
>orf00002
ATGATTCATTTTTCAATAAATAAAAATTTCTTCTTGCATGCTCTAACGGTAACCAAACGAGCTATTAGTCATAAAAATGCGATTCCAATCCTTTCAACTGTTAAAATAGAAGTGACTAGAGATGCTATCATTTTAACGGGGTCAAATGGACAAATTTCAATTGAAAATACTATTCCTGCTTCAAATGAAAACGCAGGTTTACTAGTAACGAATCCAGGCTCTATTTTGTTAGAAGCTGGTTTCTTTATTAATATTATTTCAAGTTTACCAGATGTAACTTTAGAATTTACAGAGATTGAACAACATCAAATTGTTCTTACTAGTGGAAAATCAGAGATTACTTTGAAAGGTAAGGATGTCGATCAATACCCTCGTCTGCAGGAAATGACAACAGATACTCCATTAACATTAGAAACTAAACTGTTAAAATCAATTATTAATGAAACTGCTTTTGCTGCTAGCCAACAAGAAAGCCGTCCAATCTTAACAGGTGTTCATTTGGTTATCAGTCAAAATAAATACTTTAAGGCTGTTGCGACAGATTCACACCGTATGAGTCAACGCACTTTCCAATTAGAGAAATCGGCTAATAATTTTGATTTGGTTGTTCCAAGTAAATCCCTTCGAGAATTTTCGGCTGTTTTTACAGATGATATTGAAACTGTAGAGGTTTTCTTCTCAGATAGTCAAATGTTATTTAGAAGTGAAAATATCAGCTTCTATACACGTTTGCTTGAAGGAAACTACCCTGATACTGATCGCCTCCTAACTAATCAGTTTGAAACCGAAATTATCTTTAATACAAATGCTTTACGCCATGCTATGGAACGTGCTTATTTAATTTCGAATGCAACTCAGAACGGTACTGTTCGTTTAGAAATTCAAAATGAAACAGTCTCAGCTCATGTAAACTCTCCAGAAGTTGGTAAAGTTAATGAGGAATTGGATACTGTTAGCCTTAAAGGTGATAGTTTAAATATTAGTTTTAATCCAACTTACCTAATTGAATCTTTAAAAGCAGTAAAAAGCGAAACAGTTACGATTCGATTTATTTCTCCAGTACGTCCATTTACTTTGACACCTGGTGAAGATACTGAAGATTTCATACAATTAATAACTCCTGTTCGTACTAACTAA
>orf00003
ATGATGAGTAATATGACTCTATATATAATAGCTAACCCCCATGCTGGTAATAAAAATGCCTCCACTATTGTTGGTCAAATTCAGGAGTTTTATCATACTGAAGATATTTCTGTGTTCTATACAGAACAGAAAGATGATGAAAAAAAACAAGTCATTAATATACTAAGGTCTTTTAAAGAAAGTGATCATCTAATGATTATAGGAGGAGATGGTACCTTATCAAAAGTAATGACTTATCTCCCCAACATATTCCGTGCGCTTATTATCCTGTTGGTTCGGGAAATGATTTTGCCAGAGCTTTGA

测试命令:

perl cds2aa.pl test.fa test.pep.txt

测试结果文件test.pep.txt

>orf00001
MVQYNNNYPQDNKEEAMTENEQLFWNRVLELSRSQIAPAAYEFFVLEARLLKIEHQTAVITLDNIEMKKLFWEQNLGPVILTAGFEIFNAEITANYVSNDLHLQETSFSNYQQSSNEVNTLPIRKIDSNLKEKYTFANFVQGDENRWAVSASIAVADSPGTTYNPLFIWGGPGLGKTHLLNAIGNQVLRDNPNARVLYITAENFINEFVSHIRLDSMEELKEKFRNLDLLLIDDIQSLAKKTLGGTQEEFFNTFNALHTNDKQIVLTSDRNPNQLNDLEERLVTRFSWGLPVNITPPDFETRVAILTNKIQEYPYDFPQDTIEYLAGEFDSNVRELEGALKNISLVADFKHAKTITVDIAAEAIRARKNDGPIVTVIPIEEIQIQVGKFYGVTVKEIKATKRTQDIVLARQVAMYLAREMTDNSLPKIGKEFGGRDHSTVLHAYNKIKNMVAQDDNLRIEIETIKNKIR
>orf00002
MIHFSINKNFFLHALTVTKRAISHKNAIPILSTVKIEVTRDAIILTGSNGQISIENTIPASNENAGLLVTNPGSILLEAGFFINIISSLPDVTLEFTEIEQHQIVLTSGKSEITLKGKDVDQYPRLQEMTTDTPLTLETKLLKSIINETAFAASQQESRPILTGVHLVISQNKYFKAVATDSHRMSQRTFQLEKSANNFDLVVPSKSLREFSAVFTDDIETVEVFFSDSQMLFRSENISFYTRLLEGNYPDTDRLLTNQFETEIIFNTNALRHAMERAYLISNATQNGTVRLEIQNETVSAHVNSPEVGKVNEELDTVSLKGDSLNISFNPTYLIESLKAVKSETVTIRFISPVRPFTLTPGEDTEDFIQLITPVRTN
>orf00003
MMSNMTLYIIANPHAGNKNASTIVGQIQEFYHTEDISVFYTEQKDDEKKQVINILRSFKESDHLMIIGGDGTLSKVMTYLPNIFRALIILLVREMILPEL

暂无评论

发表评论