1
|
1 #!/usr/bin/python
|
|
2 # Simple comparison and conversion tools for big fasta data
|
|
3
|
|
4 import sys, os, optparse
|
|
5
|
|
6 VERSION = "1.0.0"
|
|
7
|
|
8 class MyParser(optparse.OptionParser):
|
|
9 """
|
|
10 Provides a better class for displaying formatted help info.
|
|
11 From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output.
|
|
12 """
|
|
13 def format_epilog(self, formatter):
|
|
14 return self.epilog
|
|
15
|
|
16
|
|
17 def split_len(seq, length):
|
|
18 return [seq[i:i+length] for i in range(0, len(seq), length)]
|
|
19
|
|
20
|
|
21 def check_file_path(file_path, message = "File "):
|
|
22
|
|
23 path = os.path.normpath(file_path)
|
|
24 # make sure any relative paths are converted to absolute ones
|
|
25 if not os.path.isdir(os.path.dirname(path)) or not os.path.isfile(path):
|
|
26 # Not an absolute path, so try default folder where script was called:
|
|
27 path = os.path.normpath(os.path.join(os.getcwd(), path) )
|
|
28 if not os.path.isfile(path):
|
|
29 print message + "[" + file_path + "] doesn't exist!"
|
|
30 sys.exit(1)
|
|
31
|
|
32 return path
|
|
33
|
|
34
|
|
35 class FastaFormat(object):
|
|
36
|
|
37 def __main__(self):
|
|
38
|
|
39 options, args = self.get_command_line()
|
|
40
|
|
41 if options.code_version:
|
|
42 print VERSION
|
|
43 return VERSION
|
|
44
|
|
45 if len(args) > 0:
|
|
46 file_a = check_file_path(args[0])
|
|
47 else: file_a = False
|
|
48
|
|
49 if len(args) > 1:
|
|
50 file_b = check_file_path(args[1])
|
|
51 else: file_b = False
|
|
52
|
|
53 if options.to_fasta == True:
|
|
54 # Transform from key-value file to regular fasta format: 1 line for identifier and description(s), remaining 80 character lines for fasta data.
|
|
55
|
|
56 with sys.stdout as outputFile :
|
|
57 for line in open(file_a,'r') if file_a else sys.stdin:
|
|
58 line_data = line.rsplit('\t',1)
|
|
59 if len(line_data) > 1:
|
|
60 outputFile.write(line_data[0] + '\n' + '\n'.join(split_len(line_data[1],80)) )
|
|
61 else:
|
|
62 # Fasta one-liner didn't have any sequence data
|
|
63 outputFile.write(line_data[0])
|
|
64
|
|
65 #outputFile.close() #Otherwise terminal never looks like it closes?
|
|
66
|
|
67
|
|
68 elif options.to_keyvalue == True:
|
|
69 # Transform from fasta format to key-value format:
|
|
70 # Separates sequence lines are merged and separated from id/description line by a tab.
|
|
71 with sys.stdout as outputFile:
|
|
72 start = True
|
|
73 for line in open(file_a,'r') if file_a else sys.stdin:
|
|
74 if line[0] == ">":
|
|
75 if start == False:
|
|
76 outputFile.write('\n')
|
|
77 else:
|
|
78 start = False
|
|
79 outputFile.write(line.strip() + '\t')
|
|
80
|
|
81 else:
|
|
82 outputFile.write(line.strip())
|
|
83
|
|
84
|
|
85 elif options.compare == True:
|
|
86
|
|
87 if len(args) < 2:
|
|
88 print "Error: Need two fasta file paths to compare"
|
|
89 sys.exit(1)
|
|
90
|
|
91 file_a = open(file_a,'r')
|
|
92 file_b = open(file_b,'r')
|
|
93
|
|
94 p = 3
|
|
95 count_a = 0
|
|
96 count_b = 0
|
|
97 sample_length = 50
|
|
98
|
|
99 while True:
|
|
100
|
|
101 if p&1:
|
|
102 a = file_a.readline()
|
|
103 count_a += 1
|
|
104
|
|
105 if p&2:
|
|
106 b = file_b.readline()
|
|
107 count_b += 1
|
|
108
|
|
109 if not a or not b: # blank line still has "cr\lf" in it so doesn't match here
|
|
110 print "EOF"
|
|
111 break
|
|
112
|
|
113 a = a.strip()
|
|
114 b = b.strip()
|
|
115
|
|
116 if a == b:
|
|
117 p = 3
|
|
118
|
|
119 elif a < b:
|
|
120 sys.stdout.write('f1 ln %s: -%s\nvs. %s \n' % (count_a, a[0:sample_length] , b[0:sample_length]))
|
|
121 p = 1
|
|
122
|
|
123 else:
|
|
124 sys.stdout.write('f2 ln %s: +%s\nvs. %s \n' % (count_b, b[0:sample_length] , a[0:sample_length]))
|
|
125 p = 2
|
|
126
|
|
127
|
|
128 if count_a % 1000000 == 0:
|
|
129 print "At line %s:" % count_a
|
|
130
|
|
131 # For testing:
|
|
132 #if count_a > 50:
|
|
133 #
|
|
134 # print "Quick exit at line 500"
|
|
135 # sys.exit(1)
|
|
136
|
|
137
|
|
138 for line in file_a.readlines():
|
|
139 count_a += 1
|
|
140 sys.stdout.write('f1 ln %s: -%s\nvs. %s \n' % (count_a, line[0:sample_length] ))
|
|
141
|
|
142 for line in file_b.readlines():
|
|
143 count_b += 1
|
|
144 sys.stdout.write('f2 ln %s: +%s\nvs. %s \n' % (count_b, line[0:sample_length] ))
|
|
145
|
|
146
|
|
147
|
|
148
|
|
149 def get_command_line(self):
|
|
150 """
|
|
151 *************************** Parse Command Line *****************************
|
|
152
|
|
153 """
|
|
154 parser = MyParser(
|
|
155 description = 'Tool for comparing two fasta files, or transforming fasta data to single-line key-value format, or visa versa.',
|
|
156 usage = 'fasta_format.py [options]',
|
|
157 epilog="""
|
|
158 Note: This tool uses stdin and stdout for transforming fasta data.
|
|
159
|
|
160 Convert from key-value data to fasta format data:
|
|
161 fasta_format.py [file] -f --fasta
|
|
162 cat [file] | fasta_format.py -f --fasta
|
|
163
|
|
164 Convert from fasta format data to key-value data:
|
|
165 fasta_format.py [file] -k --keyvalue
|
|
166 cat [file] | fasta_format.py -k --keyvalue
|
|
167
|
|
168 Compare two fasta format files:
|
|
169 fasta_format.py [file1] [file2] -c --compare
|
|
170
|
|
171 Return version of this code:
|
|
172 fasta_format.py -v --version
|
|
173
|
|
174 """)
|
|
175
|
|
176 parser.add_option('-c', '--compare', dest='compare', default=False, action='store_true',
|
|
177 help='Compare two fasta files')
|
|
178
|
|
179 parser.add_option('-f', '--fasta', dest='to_fasta', default=False, action='store_true',
|
|
180 help='Transform key-value file to fasta file format')
|
|
181
|
|
182 parser.add_option('-k', '--keyvalue', dest='to_keyvalue', default=False, action='store_true',
|
|
183 help='Transform fasta file format to key-value format')
|
|
184
|
|
185 parser.add_option('-v', '--version', dest='code_version', default=False, action='store_true',
|
|
186 help='Return version of this code.')
|
|
187
|
|
188 return parser.parse_args()
|
|
189
|
|
190
|
|
191 if __name__ == '__main__':
|
|
192
|
|
193 fasta_format = FastaFormat()
|
|
194 fasta_format.__main__()
|
|
195
|