comparison data_stores/fasta_format.py @ 1:5c5027485f7d draft

Uploaded correct file
author damion
date Sun, 09 Aug 2015 16:07:50 -0400
parents
children
comparison
equal deleted inserted replaced
0:d31a1bd74e63 1:5c5027485f7d
1 #!/usr/bin/python
2 # Simple comparison and conversion tools for big fasta data
3
4 import sys, os, optparse
5
6 VERSION = "1.0.0"
7
8 class MyParser(optparse.OptionParser):
9 """
10 Provides a better class for displaying formatted help info.
11 From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output.
12 """
13 def format_epilog(self, formatter):
14 return self.epilog
15
16
17 def split_len(seq, length):
18 return [seq[i:i+length] for i in range(0, len(seq), length)]
19
20
21 def check_file_path(file_path, message = "File "):
22
23 path = os.path.normpath(file_path)
24 # make sure any relative paths are converted to absolute ones
25 if not os.path.isdir(os.path.dirname(path)) or not os.path.isfile(path):
26 # Not an absolute path, so try default folder where script was called:
27 path = os.path.normpath(os.path.join(os.getcwd(), path) )
28 if not os.path.isfile(path):
29 print message + "[" + file_path + "] doesn't exist!"
30 sys.exit(1)
31
32 return path
33
34
35 class FastaFormat(object):
36
37 def __main__(self):
38
39 options, args = self.get_command_line()
40
41 if options.code_version:
42 print VERSION
43 return VERSION
44
45 if len(args) > 0:
46 file_a = check_file_path(args[0])
47 else: file_a = False
48
49 if len(args) > 1:
50 file_b = check_file_path(args[1])
51 else: file_b = False
52
53 if options.to_fasta == True:
54 # Transform from key-value file to regular fasta format: 1 line for identifier and description(s), remaining 80 character lines for fasta data.
55
56 with sys.stdout as outputFile :
57 for line in open(file_a,'r') if file_a else sys.stdin:
58 line_data = line.rsplit('\t',1)
59 if len(line_data) > 1:
60 outputFile.write(line_data[0] + '\n' + '\n'.join(split_len(line_data[1],80)) )
61 else:
62 # Fasta one-liner didn't have any sequence data
63 outputFile.write(line_data[0])
64
65 #outputFile.close() #Otherwise terminal never looks like it closes?
66
67
68 elif options.to_keyvalue == True:
69 # Transform from fasta format to key-value format:
70 # Separates sequence lines are merged and separated from id/description line by a tab.
71 with sys.stdout as outputFile:
72 start = True
73 for line in open(file_a,'r') if file_a else sys.stdin:
74 if line[0] == ">":
75 if start == False:
76 outputFile.write('\n')
77 else:
78 start = False
79 outputFile.write(line.strip() + '\t')
80
81 else:
82 outputFile.write(line.strip())
83
84
85 elif options.compare == True:
86
87 if len(args) < 2:
88 print "Error: Need two fasta file paths to compare"
89 sys.exit(1)
90
91 file_a = open(file_a,'r')
92 file_b = open(file_b,'r')
93
94 p = 3
95 count_a = 0
96 count_b = 0
97 sample_length = 50
98
99 while True:
100
101 if p&1:
102 a = file_a.readline()
103 count_a += 1
104
105 if p&2:
106 b = file_b.readline()
107 count_b += 1
108
109 if not a or not b: # blank line still has "cr\lf" in it so doesn't match here
110 print "EOF"
111 break
112
113 a = a.strip()
114 b = b.strip()
115
116 if a == b:
117 p = 3
118
119 elif a < b:
120 sys.stdout.write('f1 ln %s: -%s\nvs. %s \n' % (count_a, a[0:sample_length] , b[0:sample_length]))
121 p = 1
122
123 else:
124 sys.stdout.write('f2 ln %s: +%s\nvs. %s \n' % (count_b, b[0:sample_length] , a[0:sample_length]))
125 p = 2
126
127
128 if count_a % 1000000 == 0:
129 print "At line %s:" % count_a
130
131 # For testing:
132 #if count_a > 50:
133 #
134 # print "Quick exit at line 500"
135 # sys.exit(1)
136
137
138 for line in file_a.readlines():
139 count_a += 1
140 sys.stdout.write('f1 ln %s: -%s\nvs. %s \n' % (count_a, line[0:sample_length] ))
141
142 for line in file_b.readlines():
143 count_b += 1
144 sys.stdout.write('f2 ln %s: +%s\nvs. %s \n' % (count_b, line[0:sample_length] ))
145
146
147
148
149 def get_command_line(self):
150 """
151 *************************** Parse Command Line *****************************
152
153 """
154 parser = MyParser(
155 description = 'Tool for comparing two fasta files, or transforming fasta data to single-line key-value format, or visa versa.',
156 usage = 'fasta_format.py [options]',
157 epilog="""
158 Note: This tool uses stdin and stdout for transforming fasta data.
159
160 Convert from key-value data to fasta format data:
161 fasta_format.py [file] -f --fasta
162 cat [file] | fasta_format.py -f --fasta
163
164 Convert from fasta format data to key-value data:
165 fasta_format.py [file] -k --keyvalue
166 cat [file] | fasta_format.py -k --keyvalue
167
168 Compare two fasta format files:
169 fasta_format.py [file1] [file2] -c --compare
170
171 Return version of this code:
172 fasta_format.py -v --version
173
174 """)
175
176 parser.add_option('-c', '--compare', dest='compare', default=False, action='store_true',
177 help='Compare two fasta files')
178
179 parser.add_option('-f', '--fasta', dest='to_fasta', default=False, action='store_true',
180 help='Transform key-value file to fasta file format')
181
182 parser.add_option('-k', '--keyvalue', dest='to_keyvalue', default=False, action='store_true',
183 help='Transform fasta file format to key-value format')
184
185 parser.add_option('-v', '--version', dest='code_version', default=False, action='store_true',
186 help='Return version of this code.')
187
188 return parser.parse_args()
189
190
191 if __name__ == '__main__':
192
193 fasta_format = FastaFormat()
194 fasta_format.__main__()
195