1  
 
  2  '''
 
  3  Created 2012
 
  4  
 
  5  Contains various helper functions which initialize / translate / preprocess the data
 
  6  
 
  7  
 
  8  @author: Sven Giese''' 
  9  
 
 10  import cPickle as pickle 
 11  import random 
 12  
 
 13  ''' INIT DICTIONARIES ''' 
 14  genetic_code={'GCT':'A', 'GCC':'A', 'GCA':'A', 'GCG':'A',  # maps a DNA codon (3 uppercase bases) to a one-letter amino acid code
 
 15                'CGT':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R', 'AGA':'R', 'AGG':'R',
 
 16                'AAT':'N', 'AAC':'N',
 
 17                'GAT':'D', 'GAC':'D',
 
 18                'TGT':'C', 'TGC':'C',
 
 19                'CAA':'Q', 'CAG':'Q',
 
 20                'GAA':'E', 'GAG':'E',
 
 21                'GGT':'G', 'GGC':'G','GGA':'G', 'GGG':'G',
 
 22                'CAT':'H', 'CAC':'H',
 
 23                'ATT':'I', 'ATC':'I','ATA':'I',
 
 24                'ATG':'M',
 
 25                'TTA':'L', 'TTG':'L', 'CTT':'L', 'CTC':'L', 'CTA':'L', 'CTG':'L',
 
 26                'AAA':'K', 'AAG':'K',
 
 27                'TTT':'F', 'TTC':'F',
 
 28                'CCT':'P', 'CCC':'P','CCA':'P', 'CCG':'P',
 
 29                'TCT':'S', 'TCC':'S', 'TCA':'S', 'TCG':'S', 'AGT':'S', 'AGC':'S',
 
 30                'ACT':'T', 'ACC':'T','ACA':'T', 'ACG':'T',
 
 31                'TGG':'W',
 
 32                'TAT':'Y', 'TAC':'Y',
 
 33                'GTT':'V', 'GTC':'V','GTA':'V', 'GTG':'V',
 
 34                'TAA':'*', 'TGA':'*','TAG':'*','NNN':'n'}  # '*' marks the three stop codons; the all-N codon maps to lowercase 'n'
 35  
 
 36  
 
 37  
 
 39      """
 
 40      Creates a dictionary keyed by every possible AA triplet and records the starting indices
 
 41      of the triplets found in the given amino acid sequence (read in steps of 3, starting at index 1).
 
 42  
 
 43      @type  AAsequence: string
 
 44      @param AAsequence: aminoacid sequence
 
 45      @rtype:   dictionary
 
 46      @return:  A dictionary with starting positions of each triplet in the given AA sequence
 
 47      
 
 48      """ 
 49      
 
 50      liste = ["A","R","N","D","C","E","Q","G","H","I","L","K","M","F","P","S","T","W","Y","V","*"] 
 51      aa_triplets = {} 
 52      
 
 53      
 
 54      for i in range(0,len(liste)): 
 55          for k in range(0,len(liste)): 
 56              for l in range(0,len(liste)): 
 57                  aa_triplets[liste[i]+liste[k]+liste[l]]= [] 
 58                  
 
 59      
 
 60      
 
 61      
 
 62      for i in range(1,len(AAsequence),3): 
 63          if i+3 > len(AAsequence): 
 64              break 
 65          if AAsequence[i:i+3] in aa_triplets: 
 66              aa_triplets[AAsequence[i:i+3]].append(i) 
 67      return(aa_triplets) 
  68  
 
 69  
 
 70  
 
 71  
 
 73      """
 
 74      Function which checks if a given triplet has a Hamming distance of exactly 1
 
 75      to another triplet. Used for generation of possible substitution triplets
 
 76  
 
 77      @type  codon: string
 
 78      @param codon: nucleotide triplet
 
 79      @type  dictentry: string
 
 80      @param dictentry: nucleotide triplet
 
 81      @rtype:   bool
 
 82      @return:  Boolean value. True if exactly one position differs (Hamming distance 1), else False (also False for identical triplets).
 
 83      
 
 84      """ 
 85      counter = 0 
 86      
 
 87      for i in range (0,3): 
 88         
 
 89          if codon[i]== dictentry[i]: 
 90              counter+=1 
 91          else: 
 92              continue 
 93          
 
 94      if counter == 2: 
 95          return (True) 
 96      else: 
 97          return (False) 
  98  
 
100      """
 
101      Function which translates DNA to AA
 
102  
 
103      @type  DNA: list
 
104      @param DNA: nucleotide sequence
 
105      @rtype:   prot,rest
 
106      @return:  Translated aminoacid sequence,untranslated nucleotide sequence
 
107      """ 
108      protein=[] 
109      prot = "" 
110      rest="" 
111      
 
112      DNA = "".join(DNA) 
113      for i in range(0,len(DNA),3): 
114          
 
115          if(i+3 > len(DNA)): 
116              rest +=DNA[i:i+3] 
117          
 
118              break 
119          
 
120          if("N" in DNA[i:i+3]): 
121              a_a = "n" 
122              protein.append(a_a) 
123          else: 
124              
 
125              codon=DNA[i:i+3] 
126              
 
127              a_a=genetic_code[codon] 
128              protein.append(a_a) 
129              
 
130      
 
131      prot = "".join(protein) 
132      return (prot,rest) 
 133  
 
134  ''' DEBUG HELP FUNCTIONS ''' 
135  
 
136  
 
138      """
 
139      Basic pickle function for saving, mainly meant for debugging and for speeding up multiple simulations (ORF lists can be stored and reloaded).
 
140  
 
141      @type  dictionary: dictionary
 
142      @param dictionary: Dictionary containing start and end positions of ORFs.
 
143      @type  outputname: string
 
144      @param outputname: Filename for saving (the suffix ".p" is appended to it).
 
145      
 
146      """ 
147      pickle.dump( dictionary, open(outputname +".p", "wb" ) ) 
148      print("Saved .pickle to: " + outputname +".p") 
 149  
 
151      """
 
152      Basic pickle function for loading, mainly meant for debugging and for speeding up multiple simulations (makes it possible to load ORF lists).
 
153  
 
154  
 
155      @type  inputname: string
 
156      @param inputname: Filename for loading (opened exactly as given; no ".p" suffix is added).
 
157      @rtype:   dictionary
 
158      @return:  Dictionary containing start and end positions of ORFs.
 
159      """ 
160      dictionary= pickle.load( open(inputname )) 
161      print("Loaded "+inputname+" pickle!") 
162      return (dictionary) 
 163