comparison find_in_reference.py @ 0:e7e56b51d156

Uploaded
author jjohnson
date Wed, 05 Feb 2014 08:12:47 -0500
parents
children e83e0ce8fb68
comparison
equal deleted inserted replaced
-1:000000000000 0:e7e56b51d156
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2013, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14
15 """
16 Takes 2 tabular files as input:
17 1. The file to be filtered
18 2. The reference file
19
20 The string value of the selected column of each input line is searched for
21 as a substring of the selected column values of the reference file.
22
23 The intended purpose is to filter a peptide fasta file in tabular format
24 by whether those peptide sequences are found in a reference fasta file.
25
26 """
27 import sys,re,os.path
28 import tempfile
29 import optparse
30 from optparse import OptionParser
31 import logging
32
33
def _column_index(value):
    """Translate a 1-based column option into a 0-based list index.

    Returns -1 (the last column) when the option is unset, empty, or not a
    positive number.  Raises ValueError when the value is not an integer.
    """
    if not value:
        return -1
    # optparse hands the option over as a string; convert before comparing.
    number = int(value)
    if number > 0:
        return number - 1
    return -1


def _open_or_exit(path, mode, status):
    """Open *path* in *mode*; on failure report to stderr and exit with *status*."""
    try:
        return open(os.path.abspath(path), mode)
    except Exception as e:
        sys.stderr.write("failed: %s\n" % (e,))
        sys.exit(status)


def _scan_reference(reference_path, search_string, refcol, ignore_case,
                    annotation_columns, col_sep, debug):
    """Search the reference file for *search_string*.

    Each reference line is split on tabs and *search_string* is tested as a
    substring of the field at index *refcol*.  When *annotation_columns* is
    None the scan stops at the first match; otherwise every matching line
    contributes one annotation built by joining the requested fields with
    *col_sep*.

    Returns a (found, annotations) tuple.  IndexError propagates when a
    reference line lacks a requested column.
    """
    found = False
    annotations = []
    with open(reference_path, 'r') as ref_file:
        for ref_line in ref_file:
            # Strip the line ending first so that the last field never carries
            # a newline into an annotation.
            fields = ref_line.rstrip('\r\n').split('\t')
            target_string = fields[refcol]
            if ignore_case:
                target_string = target_string.upper()
            if debug:
                sys.stderr.write("in: %s %s %s\n" % (search_string, search_string in target_string, target_string))
            if search_string in target_string:
                found = True
                if annotation_columns is None:
                    break
                annotations.append(col_sep.join([fields[i] for i in annotation_columns]))
    return found, annotations


def __main__():
    """Filter tabular input lines by substring lookup in a reference file.

    Command line options are read from sys.argv.  For every input line the
    value of the selected input column is searched for, as a substring, in
    the selected column of each reference line.

    Without --keep, lines that are NOT found go to --output and lines that
    are found go to --filtered; with --keep the two destinations are swapped.
    When neither --output nor --filtered is given, the "output" stream is
    stdout.  With --annotation_columns, found lines get one extra column
    holding the requested reference fields of every matching reference line.

    A summary "found: N novel: M" is written to stdout at the end.

    Exits with status 2 for a bad input file, missing reference option or a
    non-integer column option, and with status 3 when an output file cannot
    be opened.  An error while processing a single input line is reported on
    stderr and that line is skipped.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option( '-i', '--input', dest='input', help='The input file to filter. (Otherwise read from stdin)' )
    parser.add_option( '-r', '--reference', dest='reference', help='The reference file to filter against' )
    parser.add_option( '-o', '--output', dest='output', help='The output file for input lines filtered by reference')
    parser.add_option( '-f', '--filtered', dest='filtered', help='The output file for input lines not in the output')
    parser.add_option('-c','--input_column', dest='input_column', default=None, help='The column for the value in the input file. (first column = 1, default to last column)')
    parser.add_option('-C','--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)')
    parser.add_option( '-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive' )
    parser.add_option( '-k', '--keep', dest='keep', action="store_true", default=False, help='' )
    parser.add_option( '-a', '--annotation_columns', dest='annotation_columns', default=None, help='If string is found, add these columns from reference' )
    parser.add_option( '-s', '--annotation_separator', dest='annotation_separator', default=';', help='separator character between annotations from different lines' )
    parser.add_option( '-S', '--annotation_col_sep', dest='annotation_col_sep', default=',', help='separator character between annotation column from the same line' )
    parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' )
    (options, args) = parser.parse_args()

    # Validate the cheap things before any output file is created/truncated.
    if options.reference is None:
        sys.stderr.write("failed: reference file is required\n")
        sys.exit(2)
    try:
        incol = _column_index(options.input_column)
        refcol = _column_index(options.reference_column)
        annotation_columns = None
        if options.annotation_columns:
            annotation_columns = [int(x) - 1 for x in options.annotation_columns.split(',')]
    except ValueError as e:
        sys.stderr.write("failed: %s\n" % (e,))
        sys.exit(2)
    annotate = annotation_columns is not None

    # Input file
    if options.input is not None:
        inputFile = _open_or_exit(options.input, 'r', 2)
    else:
        inputFile = sys.stdin

    # Output files
    outFile = None
    filteredFile = None
    if options.filtered is None and options.output is None:
        # write to stdout
        outFile = sys.stdout
    else:
        if options.output is not None:
            outFile = _open_or_exit(options.output, 'w', 3)
        if options.filtered is not None:
            filteredFile = _open_or_exit(options.filtered, 'w', 3)

    # --keep decides which stream receives the lines found in the reference.
    if options.keep:
        found_dest, novel_dest = outFile, filteredFile
    else:
        found_dest, novel_dest = filteredFile, outFile

    num_found = 0
    num_novel = 0
    try:
        for line in inputFile:
            try:
                search_string = line.split('\t')[incol].rstrip('\r\n')
                if options.ignore_case:
                    search_string = search_string.upper()
                if options.debug:
                    sys.stderr.write("search: %s\n" % (search_string,))
                # The reference is re-read for every input line so that it
                # never has to be held in memory.
                found, annotations = _scan_reference(
                    options.reference, search_string, refcol,
                    options.ignore_case, annotation_columns,
                    options.annotation_col_sep, options.debug)
                if found:
                    num_found += 1
                    if annotate:
                        line = '%s\t%s\n' % (line.rstrip('\r\n'), options.annotation_separator.join(annotations))
                    destination = found_dest
                else:
                    num_novel += 1
                    destination = novel_dest
                if destination is not None:
                    destination.write(line)
            except Exception as e:
                # Best effort: report the problem and carry on with the next line.
                sys.stderr.write("failed: Error reading %s - %s\n" % (options.reference, e))
    finally:
        # Close only the handles opened here, never the process' std streams.
        if inputFile is not sys.stdin:
            inputFile.close()
        for handle in (outFile, filteredFile):
            if handle is not None and handle is not sys.stdout:
                handle.close()
    sys.stdout.write("found: %d novel: %d\n" % (num_found, num_novel))
147
# Run the command line tool only when executed as a script, not on import.
if __name__ == "__main__":
    __main__()
149