Mercurial > repos > damion > versioned_data
comparison data_stores/fasta_format.py @ 1:5c5027485f7d draft
Uploaded correct file
author | damion |
---|---|
date | Sun, 09 Aug 2015 16:07:50 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d31a1bd74e63 | 1:5c5027485f7d |
---|---|
1 #!/usr/bin/python | |
2 # Simple comparison and conversion tools for big fasta data | |
3 | |
4 import sys, os, optparse | |
5 | |
# Version of this tool; printed and returned when -v/--version is given.
VERSION = "1.0.0"
7 | |
class MyParser(optparse.OptionParser):
    """
    Option parser that prints its epilog verbatim.

    The stock OptionParser re-wraps the epilog text through the help
    formatter; this subclass returns it untouched so that multi-line usage
    examples keep their line breaks.
    From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output.
    """
    def format_epilog(self, formatter):
        """
        Return the epilog exactly as supplied, ignoring `formatter`.

        An empty string is returned when no epilog was set, because
        format_help() joins this value with other strings and would fail
        on None.
        """
        return self.epilog or ""
15 | |
16 | |
def split_len(seq, length):
    """
    Cut `seq` into consecutive slices of at most `length` items.

    Returns a list of slices in order; the last one may be shorter.
    An empty `seq` gives an empty list.
    """
    pieces = []
    for start in range(0, len(seq), length):
        stop = start + length
        pieces.append(seq[start:stop])
    return pieces
19 | |
20 | |
def check_file_path(file_path, message="File "):
    """
    Return the absolute path of an existing file, or exit if there is none.

    file_path: path to check; a relative path is resolved against the
        current working directory.
    message: text placed in front of the error report.

    Returns the normalized absolute path. When the path does not name an
    existing file, an error line is printed and sys.exit(1) is called.
    """
    # abspath() joins relative paths onto os.getcwd() and normalizes the
    # result, so every returned path is absolute, including relative paths
    # whose directory part already exists.
    path = os.path.abspath(file_path)
    if not os.path.isfile(path):
        print(message + "[" + file_path + "] doesn't exist!")
        sys.exit(1)

    return path
33 | |
34 | |
class FastaFormat(object):
    """
    Command line tool that converts between fasta and single-line key-value
    formats, and compares two files line by line.
    """

    # Maximum number of sequence characters per line in fasta output.
    FASTA_LINE_WIDTH = 80
    # Number of characters of each reported line shown by the comparison.
    SAMPLE_LENGTH = 50
    # The comparison prints a progress line each time this many lines of the
    # first file have been read.
    PROGRESS_INTERVAL = 1000000

    def __main__(self):
        """
        Parse the command line and run the requested action.

        With -v the version is printed and returned. Otherwise up to two
        positional file paths are validated, then one of the conversions
        (-f, -k) or the comparison (-c) is run, writing to stdout. The
        conversions read stdin when no file path was given. Exits with
        status 1 when -c is given fewer than two file paths.
        """
        options, args = self.get_command_line()

        if options.code_version:
            print(VERSION)
            return VERSION

        file_a = check_file_path(args[0]) if len(args) > 0 else False
        file_b = check_file_path(args[1]) if len(args) > 1 else False

        if options.to_fasta:
            self._convert(file_a, self._keyvalue_to_fasta)

        elif options.to_keyvalue:
            self._convert(file_a, self._fasta_to_keyvalue)

        elif options.compare:
            if len(args) < 2:
                print("Error: Need two fasta file paths to compare")
                sys.exit(1)

            with open(file_a, 'r') as handle_a:
                with open(file_b, 'r') as handle_b:
                    self._compare(handle_a, handle_b, sys.stdout)
            sys.stdout.flush()

    def _convert(self, file_path, transform):
        """Stream file_path (or stdin when it is false) through transform to stdout."""
        if file_path:
            with open(file_path, 'r') as source:
                transform(source, sys.stdout)
        else:
            transform(sys.stdin, sys.stdout)
        # Flush rather than close: closing sys.stdout would make any later
        # write in the same process fail.
        sys.stdout.flush()

    def _keyvalue_to_fasta(self, lines, out):
        """Write each "header<TAB>sequence" line as a header line plus wrapped sequence lines."""
        width = self.FASTA_LINE_WIDTH
        for line in lines:
            # Drop the line terminator before splitting so it is never
            # wrapped as if it were sequence data (which produced blank
            # lines for sequence lengths that are a multiple of the width).
            record = line.rstrip('\r\n').rsplit('\t', 1)
            out.write(record[0] + '\n')
            if len(record) > 1:
                sequence = record[1]
                for start in range(0, len(sequence), width):
                    out.write(sequence[start:start + width] + '\n')

    def _fasta_to_keyvalue(self, lines, out):
        """Write each fasta record as one "header<TAB>sequence" line."""
        first_record = True
        for line in lines:
            if line.startswith(">"):
                # A newline separates records; none is written before the
                # first header or after the last record.
                if not first_record:
                    out.write('\n')
                first_record = False
                out.write(line.strip() + '\t')
            else:
                # Sequence lines are appended directly after the tab.
                out.write(line.strip())

    def _next_line(self, handle, count):
        """Read one line; return (stripped text or None at end of file, updated line count)."""
        raw = handle.readline()
        if not raw:
            return None, count
        return raw.strip(), count + 1

    def _report_rest(self, handle, label, sign, count, pending, out):
        """Report the pending line (if any) and all remaining lines of handle as unmatched."""
        sample = self.SAMPLE_LENGTH
        template = '%s ln %s: %s%s\n'
        if pending is not None:
            # This line was already read and counted by the main loop.
            out.write(template % (label, count, sign, pending[:sample]))
        for raw in iter(handle.readline, ''):
            count += 1
            out.write(template % (label, count, sign, raw.strip()[:sample]))

    def _compare(self, handle_a, handle_b, out):
        """
        Walk both files in step and report lines that do not match.

        Stripped lines are compared as strings. When they differ, the
        smaller one is reported ("f1 ... -" for the first file, "f2 ... +"
        for the second) and only that file advances; this pairs lines up
        again only if both files are sorted the same way
        (NOTE(review): sorted input is assumed, not checked here).
        After either file ends, "EOF" is written, followed by every line of
        the other file that was not matched.
        """
        sample = self.SAMPLE_LENGTH
        count_a = count_b = 0
        text_a = text_b = None
        read_a = read_b = True

        while True:
            advanced_a = read_a
            if read_a:
                text_a, count_a = self._next_line(handle_a, count_a)
            if read_b:
                text_b, count_b = self._next_line(handle_b, count_b)

            if text_a is None or text_b is None:
                break

            if text_a == text_b:
                read_a = read_b = True
            elif text_a < text_b:
                out.write('f1 ln %s: -%s\nvs. %s \n' % (count_a, text_a[:sample], text_b[:sample]))
                read_a, read_b = True, False
            else:
                out.write('f2 ln %s: +%s\nvs. %s \n' % (count_b, text_b[:sample], text_a[:sample]))
                read_a, read_b = False, True

            # Only report progress when the first file actually moved, so
            # the message is not repeated while the second file catches up.
            if advanced_a and count_a % self.PROGRESS_INTERVAL == 0:
                out.write("At line %s:\n" % count_a)

        out.write("EOF\n")
        # At most one side still holds a line that was read but not
        # reported; the side that hit end of file holds None.
        self._report_rest(handle_a, 'f1', '-', count_a, text_a, out)
        self._report_rest(handle_b, 'f2', '+', count_b, text_b, out)

    def get_command_line(self):
        """
        Build the option parser and parse the command line.

        Returns the (options, args) pair from parse_args(): options carries
        the boolean flags compare, to_fasta, to_keyvalue and code_version;
        args holds the positional file paths.
        """
        parser = MyParser(
            description = 'Tool for comparing two fasta files, or transforming fasta data to single-line key-value format, or visa versa.',
            usage = 'fasta_format.py [options]',
            epilog="""
Note: This tool uses stdin and stdout for transforming fasta data.

Convert from key-value data to fasta format data:
fasta_format.py [file] -f --fasta
cat [file] | fasta_format.py -f --fasta

Convert from fasta format data to key-value data:
fasta_format.py [file] -k --keyvalue
cat [file] | fasta_format.py -k --keyvalue

Compare two fasta format files:
fasta_format.py [file1] [file2] -c --compare

Return version of this code:
fasta_format.py -v --version

""")

        parser.add_option('-c', '--compare', dest='compare', default=False, action='store_true',
            help='Compare two fasta files')

        parser.add_option('-f', '--fasta', dest='to_fasta', default=False, action='store_true',
            help='Transform key-value file to fasta file format')

        parser.add_option('-k', '--keyvalue', dest='to_keyvalue', default=False, action='store_true',
            help='Transform fasta file format to key-value format')

        parser.add_option('-v', '--version', dest='code_version', default=False, action='store_true',
            help='Return version of this code.')

        return parser.parse_args()
189 | |
190 | |
if __name__ == '__main__':
    # Run the command line tool only when this file is executed as a script.
    FastaFormat().__main__()
195 |