| 0 | 1 #!/usr/bin/env python | 
"""
Input: tab-delimited file, title column(s), sequence column
Output: fasta
Write one FASTA record per data line, using the selected title column(s)
(joined with "_") as the record title and the selected sequence column as
the sequence.
"""
|  | 7 import sys, os | 
|  | 8 | 
# Refuse to load on interpreters older than Python 2.4.
assert not sys.version_info[:2] < ( 2, 4 )
|  | 10 | 
def stop_err( msg ):
    """Write ``msg`` to stderr and terminate with a failing exit status.

    The message is written verbatim (no newline is appended).

    Raises:
        SystemExit: always, with status 1, so that callers inspecting the
            exit code (and not only stderr) see the failure.
    """
    sys.stderr.write( msg )
    # A bare sys.exit() would exit with status 0, i.e. report success.
    sys.exit( 1 )
|  | 14 | 
def _parse_column( value, label ):
    """Return the 0-based index for the 1-based column number in ``value``.

    Calls ``stop_err`` (which terminates the script) when ``value`` is not
    an integer or is smaller than 1; ``label`` names the column in the
    error message.
    """
    try:
        index = int( value ) - 1
    except ValueError:
        index = -1
    # Column 0 or a negative column would otherwise become a negative
    # list index and silently pick a field counted from the end of the row.
    if index < 0:
        stop_err( "Invalid %s Column: %s." % ( label, value ) )
    return index


def __main__():
    """Convert a tab-delimited file into FASTA records.

    Command-line arguments (read from ``sys.argv``):
        1. infile    -- path of the tab-delimited input file
        2. title_col -- comma-separated, 1-based column numbers; their
                        values are joined with "_" to form the record title
        3. seq_col   -- 1-based column number holding the sequence
        4. outfile   -- path of the FASTA file to write

    Empty lines and lines starting with "#" are ignored.  A data line that
    lacks any of the requested columns is skipped; if any lines were
    skipped, a one-line summary naming the first of them is written to
    stdout.  Missing or malformed column arguments are reported through
    ``stop_err``.
    """
    infile = sys.argv[1]
    title_col = sys.argv[2]
    seq_col = sys.argv[3]
    outfile = sys.argv[4]

    if title_col is None or title_col == 'None' or seq_col is None or seq_col == 'None':
        stop_err( "Columns not specified." )

    # Validate and convert every column once, up front, instead of
    # re-parsing the title columns for each input line.
    seq_index = _parse_column( seq_col, "Sequence" )
    title_indexes = [ _parse_column( col, "Title" ) for col in title_col.split( ',' ) ]

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ""

    # try/finally (rather than "with") keeps the script usable on the
    # Python 2.4 floor asserted at module level while still guaranteeing
    # that both handles are closed on every path.
    out = open( outfile, 'w' )
    try:
        source = open( infile )
        try:
            for i, line in enumerate( source ):
                line = line.rstrip( '\r\n' )
                if not line or line.startswith( '#' ):
                    continue
                fields = line.split( '\t' )
                try:
                    fasta_title = [ fields[j] for j in title_indexes ]
                    fasta_seq = fields[seq_index]
                except IndexError:
                    # The row has fewer columns than requested.
                    skipped_lines += 1
                    if not invalid_line:
                        first_invalid_line = i + 1
                        invalid_line = line
                    continue
                # Drop a leading ">" from the first title field so the
                # emitted header does not start with ">>".
                if fasta_title[0].startswith( ">" ):
                    fasta_title[0] = fasta_title[0][1:]
                out.write( ">%s\n%s\n" % ( "_".join( fasta_title ), fasta_seq ) )
        finally:
            source.close()
    finally:
        out.close()

    if skipped_lines > 0:
        sys.stdout.write( 'Data issue: skipped %d blank or invalid lines starting at #%d: "%s"\n' % ( skipped_lines, first_invalid_line, invalid_line ) )
|  | 67 | 
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()