annotate data_stores/fasta_format.py @ 2:269d246ce6d0 draft default tip

Uploaded
author damion
date Fri, 23 Oct 2015 17:53:29 -0400
parents 5c5027485f7d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1 #!/usr/bin/python
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
2 # Simple comparison and conversion tools for big fasta data
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
3
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
4 import sys, os, optparse
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
5
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
6 VERSION = "1.0.0"
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
7
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
class MyParser(optparse.OptionParser):
    """
    OptionParser that prints its epilog exactly as written.

    The stock OptionParser re-wraps the epilog text, which destroys the
    line breaks and indentation of hand-formatted usage examples.
    From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output.
    """

    def format_epilog(self, formatter):
        """Return the epilog text unmodified.

        formatter is accepted for compatibility with the base class
        signature and is not used.  An empty string is returned when the
        parser has no epilog, because format_help() joins the pieces it
        collects with "".join() and would fail on None.
        """
        return self.epilog or ""
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
15
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
16
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def split_len(seq, length):
    """Split seq into consecutive pieces of at most length items.

    Every piece has exactly length items except possibly the last one,
    which holds the remainder.  An empty seq yields an empty list.

    Raises ValueError if length is smaller than 1.  A negative length
    would otherwise produce an empty range and silently return [],
    discarding the whole sequence.
    """
    if length < 1:
        raise ValueError("length must be a positive integer")
    return [seq[start:start + length] for start in range(0, len(seq), length)]
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
19
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
20
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def check_file_path(file_path, message="File "):
    """Return the absolute, normalized path of an existing file.

    file_path -- path given by the user; a relative path is resolved
                 against the current working directory.
    message   -- text placed in front of the bracketed path in the error
                 message.

    If the path does not name an existing file, an error is written to
    stderr and the process exits with status 1.  stderr is used because
    the conversion modes of this tool write their data to stdout.
    """
    # abspath() joins a relative path onto os.getcwd() and normalizes the
    # result, so one lookup covers both absolute and relative input.
    path = os.path.abspath(file_path)
    if not os.path.isfile(path):
        sys.stderr.write(message + "[" + file_path + "] doesn't exist!\n")
        sys.exit(1)

    return path
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
33
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
34
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
class FastaFormat(object):
    """Command-line driver for converting and comparing big fasta files.

    Depending on the options parsed by get_command_line(), __main__()
    prints the version, converts key-value data to fasta, converts fasta
    to key-value data, or reports the differences between two files.
    Conversions read the file named on the command line, or stdin when
    none is given, and write to stdout.
    """

    # Number of sequence characters written per line of fasta output.
    FASTA_LINE_WIDTH = 80
    # Number of leading characters of a line shown in a comparison report.
    SAMPLE_LENGTH = 50
    # A progress message is printed each time this many lines of the
    # first file have been read during a comparison.
    PROGRESS_INTERVAL = 1000000

    def __main__(self):
        """Parse the command line and run the requested operation.

        Returns VERSION when the version option was given, otherwise
        None.  Exits with status 1 when a named file does not exist (via
        check_file_path) or when a comparison is requested with fewer
        than two file paths.
        """
        options, args = self.get_command_line()

        if options.code_version:
            print(VERSION)
            return VERSION

        file_a = check_file_path(args[0]) if len(args) > 0 else False
        file_b = check_file_path(args[1]) if len(args) > 1 else False

        if options.to_fasta:
            self._to_fasta(file_a)

        elif options.to_keyvalue:
            self._to_keyvalue(file_a)

        elif options.compare:
            if len(args) < 2:
                sys.stderr.write("Error: Need two fasta file paths to compare\n")
                sys.exit(1)
            self._compare(file_a, file_b)

    def _to_fasta(self, file_path):
        """Convert key-value lines to regular fasta on stdout.

        Each input line is split at its last tab: the text before it
        becomes the identifier/description line and the text after it is
        written as sequence lines of FASTA_LINE_WIDTH characters.  Input
        is read from file_path, or from stdin when file_path is false.
        """
        source = open(file_path, 'r') if file_path else sys.stdin
        out = sys.stdout
        try:
            for line in source:
                # Remove the line terminator before chunking.  Left in
                # place it counts as a sequence character and produces a
                # blank output line whenever the sequence length is an
                # exact multiple of the line width.
                fields = line.rstrip('\r\n').rsplit('\t', 1)
                out.write(fields[0] + '\n')
                # A line without a tab, or with nothing after it, has no
                # sequence data; only its identifier line is written.
                if len(fields) > 1 and fields[1]:
                    chunks = split_len(fields[1], self.FASTA_LINE_WIDTH)
                    out.write('\n'.join(chunks) + '\n')
        finally:
            if file_path:
                source.close()

    def _to_keyvalue(self, file_path):
        """Convert fasta to one key-value line per record on stdout.

        The sequence lines of a record are merged and separated from the
        record's id/description line by a tab.  Input is read from
        file_path, or from stdin when file_path is false.
        """
        source = open(file_path, 'r') if file_path else sys.stdin
        out = sys.stdout
        record_open = False  # True once a ">" header has been written
        try:
            for line in source:
                if line.startswith('>'):
                    if record_open:
                        out.write('\n')
                    record_open = True
                    out.write(line.strip() + '\t')
                else:
                    out.write(line.strip())
            # Terminate the last record so every record is a full line.
            if record_open:
                out.write('\n')
        finally:
            if file_path:
                source.close()

    def _compare(self, path_a, path_b):
        """Report on stdout the lines that differ between two files.

        The files are walked in step.  Equal lines advance both files;
        otherwise the smaller line (string comparison) is reported and
        only its file advances.  When either file ends, "EOF" is written
        and every line left in the other file is reported.
        """
        sample = self.SAMPLE_LENGTH
        out = sys.stdout
        count_a = 0
        count_b = 0
        a = ''
        b = ''
        # Bit mask selecting the files to read next: 1 = a, 2 = b.
        advance = 3

        with open(path_a, 'r') as file_a:
            with open(path_b, 'r') as file_b:
                while True:
                    read_a = advance & 1
                    if read_a:
                        a = file_a.readline()
                        count_a += 1

                    if advance & 2:
                        b = file_b.readline()
                        count_b += 1

                    # readline() returns '' only at end of file; a blank
                    # line still holds its terminator here.
                    if not a or not b:
                        out.write("EOF\n")
                        break

                    a = a.strip()
                    b = b.strip()

                    if a == b:
                        advance = 3

                    elif a < b:
                        out.write('f1 ln %s: -%s\nvs. %s \n' % (count_a, a[0:sample], b[0:sample]))
                        advance = 1

                    else:
                        out.write('f2 ln %s: +%s\nvs. %s \n' % (count_b, b[0:sample], a[0:sample]))
                        advance = 2

                    # Test read_a so the message is printed once, not on
                    # every pass in which only file b advances.
                    if read_a and count_a % self.PROGRESS_INTERVAL == 0:
                        out.write("At line %s:\n" % count_a)

                # The line held from the file that has not ended was
                # never reported, so report it before the remainder.
                if a:
                    self._report_unmatched('f1', '-', count_a, a)
                for line in file_a:
                    count_a += 1
                    self._report_unmatched('f1', '-', count_a, line)

                if b:
                    self._report_unmatched('f2', '+', count_b, b)
                for line in file_b:
                    count_b += 1
                    self._report_unmatched('f2', '+', count_b, line)

    def _report_unmatched(self, label, sign, line_number, line):
        """Write one report line for a line that has no counterpart."""
        text = line.strip()[0:self.SAMPLE_LENGTH]
        sys.stdout.write('%s ln %s: %s%s\n' % (label, line_number, sign, text))

    def get_command_line(self):
        """
        *************************** Parse Command Line *****************************

        Builds the option parser for the compare, fasta, keyvalue and
        version flags and returns the (options, args) pair produced by
        parse_args(); args holds the positional file paths.
        """
        parser = MyParser(
            description = 'Tool for comparing two fasta files, or transforming fasta data to single-line key-value format, or visa versa.',
            usage = 'fasta_format.py [options]',
            epilog="""
  Note: This tool uses stdin and stdout for transforming fasta data.

  Convert from key-value data to fasta format data:
    fasta_format.py [file] -f --fasta
    cat [file] | fasta_format.py -f --fasta

  Convert from fasta format data to key-value data:
    fasta_format.py [file] -k --keyvalue
    cat [file] | fasta_format.py -k --keyvalue

  Compare two fasta format files:
    fasta_format.py [file1] [file2] -c --compare

  Return version of this code:
    fasta_format.py -v --version

""")

        parser.add_option('-c', '--compare', dest='compare', default=False, action='store_true',
            help='Compare two fasta files')

        parser.add_option('-f', '--fasta', dest='to_fasta', default=False, action='store_true',
            help='Transform key-value file to fasta file format')

        parser.add_option('-k', '--keyvalue', dest='to_keyvalue', default=False, action='store_true',
            help='Transform fasta file format to key-value format')

        parser.add_option('-v', '--version', dest='code_version', default=False, action='store_true',
            help='Return version of this code.')

        return parser.parse_args()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
189
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
190
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
# Run the tool only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    FastaFormat().__main__()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
195