Mercurial > repos > damion > versioned_data
comparison data_stores/fasta_format.py @ 1:5c5027485f7d draft
Uploaded correct file
| author | damion |
|---|---|
| date | Sun, 09 Aug 2015 16:07:50 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:d31a1bd74e63 | 1:5c5027485f7d |
|---|---|
| 1 #!/usr/bin/python | |
| 2 # Simple comparison and conversion tools for big fasta data | |
| 3 | |
| 4 import sys, os, optparse | |
| 5 | |
| 6 VERSION = "1.0.0" | |
| 7 | |
class MyParser(optparse.OptionParser):
    """
    Option parser that prints its epilog verbatim in the help output.

    The stock ``OptionParser`` passes the epilog through the help formatter;
    returning the raw string instead keeps hand-formatted usage examples intact.
    From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output.
    """

    def format_epilog(self, formatter):
        """Return the epilog text unchanged; ``formatter`` is deliberately unused.

        An unset epilog (``None``) is returned as an empty string, because
        ``OptionParser.format_help`` joins this value with the other help
        sections and would fail on ``None``.
        """
        return self.epilog or ""
| 15 | |
| 16 | |
def split_len(seq, length):
    """Split ``seq`` into consecutive slices of at most ``length`` items.

    The final slice holds whatever remains and may be shorter.  An empty
    ``seq`` yields an empty list.

    Raises:
        ValueError: if ``length`` is not positive.  A negative step would
            otherwise produce an empty range and silently drop the whole
            sequence.
    """
    if length <= 0:
        raise ValueError("length must be a positive integer")
    return [seq[i:i + length] for i in range(0, len(seq), length)]
| 19 | |
| 20 | |
def check_file_path(file_path, message="File "):
    """Return the absolute, normalized form of ``file_path`` if it is a file.

    Relative paths are resolved against the current working directory.

    Args:
        file_path: path given by the user, absolute or relative.
        message: prefix for the error message shown when the file is missing.

    Returns:
        The absolute path of the existing file.

    Exits:
        Terminates the process with status 1 when no file exists at the path.
    """
    path = os.path.abspath(file_path)
    if not os.path.isfile(path):
        # Report on stderr: stdout carries the converted data when this tool
        # is used in a pipeline, so diagnostics must not be mixed into it.
        sys.stderr.write(message + "[" + file_path + "] doesn't exist!\n")
        sys.exit(1)

    return path
| 33 | |
| 34 | |
class FastaFormat(object):
    """Command-line tool for converting and comparing fasta data.

    Three actions are available: convert one-line key-value records to fasta
    format, convert fasta format to one-line key-value records, and report the
    differences between two files line by line.
    """

    def __main__(self):
        """Parse the command line and run the selected action.

        Returns:
            ``VERSION`` when the version option is given, otherwise ``None``.

        Exits:
            With status 1 when a named input file does not exist, or when
            compare mode is requested with fewer than two file paths.
        """
        options, args = self.get_command_line()

        if options.code_version:
            sys.stdout.write(VERSION + "\n")
            return VERSION

        # Paths are validated up front; False means "not supplied", in which
        # case the conversions read from stdin.
        file_a = check_file_path(args[0]) if len(args) > 0 else False
        file_b = check_file_path(args[1]) if len(args) > 1 else False

        if options.to_fasta:
            self._convert(self._to_fasta, file_a)

        elif options.to_keyvalue:
            self._convert(self._to_keyvalue, file_a)

        elif options.compare:
            if len(args) < 2:
                sys.stderr.write("Error: Need two fasta file paths to compare\n")
                sys.exit(1)

            with open(file_a, 'r') as handle_a:
                with open(file_b, 'r') as handle_b:
                    self._compare(handle_a, handle_b, sys.stdout)
            sys.stdout.flush()

    def _convert(self, transform, path):
        """Run ``transform(source, sys.stdout)`` on ``path``, or on stdin if no path."""
        source = open(path, 'r') if path else sys.stdin
        try:
            transform(source, sys.stdout)
        finally:
            if source is not sys.stdin:
                source.close()
        # Flush rather than close stdout so the caller can keep using it.
        sys.stdout.flush()

    def _to_fasta(self, source, out, width=80):
        """Write key-value records from ``source`` to ``out`` in fasta format.

        Each input line is split at its last tab: the left part becomes the
        identifier/description line and the right part is wrapped into lines of
        at most ``width`` characters.  A line without a tab, or with an empty
        sequence, produces only the identifier line.  Blank lines are skipped.
        """
        for line in source:
            # Drop the line terminator first so it is never counted as
            # sequence data (which used to produce stray blank lines).
            record = line.rstrip('\r\n')
            if not record:
                continue
            fields = record.rsplit('\t', 1)
            out.write(fields[0] + '\n')
            if len(fields) > 1 and fields[1]:
                out.write('\n'.join(split_len(fields[1], width)) + '\n')

    def _to_keyvalue(self, source, out):
        """Write fasta records from ``source`` to ``out`` as one line per record.

        Lines starting with ">" open a new record and are followed by a tab;
        all following sequence lines are stripped and concatenated after it.
        Every record, including the last one, ends with a newline.
        """
        wrote_any = False
        for line in source:
            text = line.strip()
            if line.startswith(">"):
                if wrote_any:
                    # Terminate the previous record before starting a new one.
                    out.write('\n')
                out.write(text + '\t')
                wrote_any = True
            elif text:
                out.write(text)
                wrote_any = True

        if wrote_any:
            out.write('\n')

    def _compare(self, file_a, file_b, out, sample_length=50):
        """Report the lines that differ between two open files.

        The files are walked in step.  When the current lines differ, the
        lexically smaller one is reported and only that file is advanced, so
        the walk re-synchronizes on the next equal line.  NOTE(review): this
        only yields a meaningful diff if both inputs are sorted -- confirm
        with callers.

        Lines only in ``file_a`` are reported as "f1 ln N: -..." and lines only
        in ``file_b`` as "f2 ln N: +...", each truncated to ``sample_length``
        characters.  After either file ends, "EOF" is written and every
        remaining line of the other file is reported.
        """
        # Bit 1: read the next line of file_a; bit 2: read the next of file_b.
        advance = 3
        count_a = 0
        count_b = 0
        a = ''
        b = ''
        a_done = False
        b_done = False

        while True:
            if advance & 1:
                raw = file_a.readline()
                if raw:
                    count_a += 1
                    a = raw.strip()
                    # Progress marker, emitted once per million lines read.
                    if count_a % 1000000 == 0:
                        out.write("At line %s:\n" % count_a)
                else:
                    a_done = True

            if advance & 2:
                raw = file_b.readline()
                if raw:
                    count_b += 1
                    b = raw.strip()
                else:
                    b_done = True

            # End of file is detected on the raw read; a blank line still
            # carries its terminator and therefore does not end the walk.
            if a_done or b_done:
                break

            if a == b:
                advance = 3
            elif a < b:
                out.write('f1 ln %s: -%s\nvs. %s \n'
                          % (count_a, a[0:sample_length], b[0:sample_length]))
                advance = 1
            else:
                out.write('f2 ln %s: +%s\nvs. %s \n'
                          % (count_b, b[0:sample_length], a[0:sample_length]))
                advance = 2

        out.write("EOF\n")

        # The file that did not end still holds one current line that has not
        # been reported yet, followed by its unread remainder.
        if not a_done:
            out.write('f1 ln %s: -%s\n' % (count_a, a[0:sample_length]))
            for raw in iter(file_a.readline, ''):
                count_a += 1
                out.write('f1 ln %s: -%s\n' % (count_a, raw.strip()[0:sample_length]))

        if not b_done:
            out.write('f2 ln %s: +%s\n' % (count_b, b[0:sample_length]))
            for raw in iter(file_b.readline, ''):
                count_b += 1
                out.write('f2 ln %s: +%s\n' % (count_b, raw.strip()[0:sample_length]))

    def get_command_line(self):
        """Build the option parser and parse ``sys.argv``.

        Returns:
            The ``(options, args)`` pair from ``OptionParser.parse_args``:
            ``options`` carries the boolean flags ``compare``, ``to_fasta``,
            ``to_keyvalue`` and ``code_version``; ``args`` holds the positional
            file paths.
        """
        parser = MyParser(
            description = 'Tool for comparing two fasta files, or transforming fasta data to single-line key-value format, or visa versa.',
            usage = 'fasta_format.py [options]',
            epilog="""
Note: This tool uses stdin and stdout for transforming fasta data.

Convert from key-value data to fasta format data:
    fasta_format.py [file] -f --fasta
    cat [file] | fasta_format.py -f --fasta

Convert from fasta format data to key-value data:
    fasta_format.py [file] -k --keyvalue
    cat [file] | fasta_format.py -k --keyvalue

Compare two fasta format files:
    fasta_format.py [file1] [file2] -c --compare

Return version of this code:
    fasta_format.py -v --version

""")

        parser.add_option('-c', '--compare', dest='compare', default=False, action='store_true',
            help='Compare two fasta files')

        parser.add_option('-f', '--fasta', dest='to_fasta', default=False, action='store_true',
            help='Transform key-value file to fasta file format')

        parser.add_option('-k', '--keyvalue', dest='to_keyvalue', default=False, action='store_true',
            help='Transform fasta file format to key-value format')

        parser.add_option('-v', '--version', dest='code_version', default=False, action='store_true',
            help='Return version of this code.')

        return parser.parse_args()
| 189 | |
| 190 | |
if __name__ == '__main__':
    # Run the command-line tool only when executed as a script, not on import.
    FastaFormat().__main__()
| 195 |
