comparison find_in_reference.py @ 3:2429b413d90a draft default tip

"planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/find_in_reference commit 074e95e1b598ec41f0e18a2798b00cf65e9b399e-dirty"
author jjohnson
date Thu, 12 May 2022 19:30:54 +0000
parents c4fd2ea4f988
children
comparison
equal deleted inserted replaced
2:c4fd2ea4f988 3:2429b413d90a
1 #!/usr/bin/env python 1 #!/usr/bin/env python3
2
3
4 import os.path
5 import sys
6 import optparse
7
8
2 """ 9 """
3 # 10 #
4 #------------------------------------------------------------------------------ 11 #------------------------------------------------------------------------------
5 # University of Minnesota 12 # University of Minnesota
6 # Copyright 2013, Regents of the University of Minnesota 13 # Copyright 2013, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------ 14 #------------------------------------------------------------------------------
8 # Author: 15 # Author:
9 # 16 #
10 # James E Johnson 17 # James E Johnson
11 # 18 #
12 #------------------------------------------------------------------------------ 19 #------------------------------------------------------------------------------
13 """ 20 """
14 21
15 """ 22 """
16 Takes 2 tabular files as input: 23 Takes 2 tabular files as input:
17 1. The file to be filtered 24 1. The file to be filtered
18 2. The reference file 25 2. The reference file
19 26
20 The string value of selected column of the input file is searched for 27 The string value of selected column of the input file is searched for
21 in the string values of the selected column of the reference file. 28 in the string values of the selected column of the reference file.
22 29
23 The intended purpose is to filter a peptide fasta file in tabular format 30 The intended purpose is to filter a peptide fasta file in tabular format
24 by whether those peptide sequences are found in a reference fasta file. 31 by whether those peptide sequences are found in a reference fasta file.
25 32
26 """ 33 """
27 import sys,re,os.path
28 import tempfile
29 import optparse
30 from optparse import OptionParser
31 import logging
32 34
33 35
34 def __main__(): 36 def __main__():
35 #Parse Command Line 37 # Parse Command Line
36 parser = optparse.OptionParser() 38 parser = optparse.OptionParser()
37 parser.add_option( '-i', '--input', dest='input', help='The input file to filter. (Otherwise read from stdin)' ) 39 parser.add_option('-i', '--input', dest='input', help='The input file to filter. (Otherwise read from stdin)')
38 parser.add_option( '-r', '--reference', dest='reference', help='The reference file to filter against' ) 40 parser.add_option('-r', '--reference', dest='reference', help='The reference file to filter against')
39 parser.add_option( '-o', '--output', dest='output', help='The output file for input lines filtered by reference') 41 parser.add_option('-o', '--output', dest='output', help='The output file for input lines filtered by reference')
40 parser.add_option( '-f', '--filtered', dest='filtered', help='The output file for input lines not in the output') 42 parser.add_option('-f', '--filtered', dest='filtered', help='The output file for input lines not in the output')
41 parser.add_option('-c','--input_column', dest='input_column', default=None, help='The column for the value in the input file. (first column = 1, default to last column)') 43 parser.add_option('-c', '--input_column', dest='input_column', type="int", default=None, help='The column for the value in the input file. (first column = 1, default to last column)')
42 parser.add_option('-C','--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)') 44 parser.add_option('-C', '--reference_column', dest='reference_column', type="int", default=None, help='The column for the value in the reference file. (first column = 1, default to last column)')
43 parser.add_option( '-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive' ) 45 parser.add_option('-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive')
44 parser.add_option( '-R', '--reverse_find', dest='reverse_find', action="store_true", default=False, help='find the reference string in the input string' ) 46 parser.add_option('-R', '--reverse_find', dest='reverse_find', action="store_true", default=False, help='find the reference string in the input string')
45 parser.add_option( '-B', '--test_reverse', dest='test_reverse', action="store_true", default=False, help='Also search for reversed input string in reference' ) 47 parser.add_option('-B', '--test_reverse', dest='test_reverse', action="store_true", default=False, help='Also search for reversed input string in reference')
46 parser.add_option( '-D', '--test_dna_reverse_complement', dest='test_reverse_comp', action="store_true", default=False, help='Also search for the DNA reverse complement of input string' ) 48 parser.add_option('-D', '--test_dna_reverse_complement', dest='test_reverse_comp', action="store_true", default=False, help='Also search for the DNA reverse complement of input string')
47 parser.add_option( '-k', '--keep', dest='keep', action="store_true", default=False, help='' ) 49 parser.add_option('-k', '--keep', dest='keep', action="store_true", default=False, help='')
48 parser.add_option( '-a', '--annotation_columns', dest='annotation_columns', default=None, help='If string is found, add these columns from reference' ) 50 parser.add_option('-a', '--annotation_columns', dest='annotation_columns', default=None, help='If string is found, add these columns from reference')
49 parser.add_option( '-s', '--annotation_separator', dest='annotation_separator', default=';', help='separator character between annotations from different lines' ) 51 parser.add_option('-s', '--annotation_separator', dest='annotation_separator', default=';', help='separator character between annotations from different lines')
50 parser.add_option( '-S', '--annotation_col_sep', dest='annotation_col_sep', default=',', help='separator character between annotation column from the same line' ) 52 parser.add_option('-S', '--annotation_col_sep', dest='annotation_col_sep', default=', ', help='separator character between annotation column from the same line')
51 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) 53 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout')
52 (options, args) = parser.parse_args() 54 (options, args) = parser.parse_args()
53 55
54 revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','a':'t','c':'g','g':'c','t':'a','N':'N','n':'n'}[B] for B in x][::-1]) 56 # revcompl = lambda x: ''.join([{'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'N': 'N', 'n': 'n'}[B] for B in x][: : -1])
55 def test_rcomplement(seq, target):
56 if options.test_reverse_comp:
57 try:
58 comp = revcompl(seq)
59 return comp in target
60 except:
61 pass
62 return False
63 57
64 def test_reverse(seq,target): 58 COMP = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'N': 'N', 'n': 'n'}
65 return options.test_reverse and seq and seq[::-1] in target
66
67 # Input files
68 if options.input != None:
69 try:
70 inputPath = os.path.abspath(options.input)
71 inputFile = open(inputPath, 'r')
72 except Exception, e:
73 print >> sys.stderr, "failed: %s" % e
74 exit(2)
75 else:
76 inputFile = sys.stdin
77 # Reference
78 if options.reference == None:
79 print >> sys.stderr, "failed: reference file is required"
80 exit(2)
81 # Output files
82 outFile = None
83 filteredFile = None
84 if options.filtered == None and options.output == None:
85 #write to stdout
86 outFile = sys.stdout
87 else:
88 if options.output != None:
89 try:
90 outPath = os.path.abspath(options.output)
91 outFile = open(outPath, 'w')
92 except Exception, e:
93 print >> sys.stderr, "failed: %s" % e
94 exit(3)
95 if options.filtered != None:
96 try:
97 filteredPath = os.path.abspath(options.filtered)
98 filteredFile = open(filteredPath, 'w')
99 except Exception, e:
100 print >> sys.stderr, "failed: %s" % e
101 exit(3)
102 incol = -1
103 if options.input_column and options.input_column > 0:
104 incol = int(options.input_column)-1
105 refcol = -1
106 if options.reference_column and options.reference_column > 0:
107 refcol = int(options.reference_column)-1
108 if options.annotation_columns:
109 annotate = True
110 annotation_columns = [int(x) - 1 for x in options.annotation_columns.split(',')]
111 else:
112 annotate = False
113 refFile = None
114 num_found = 0
115 num_novel = 0
116 for ln,line in enumerate(inputFile):
117 annotations = []
118 try:
119 found = False
120 search_string = line.split('\t')[incol].rstrip('\r\n')
121 if options.ignore_case:
122 search_string = search_string.upper()
123 if options.debug:
124 print >> sys.stderr, "search: %s" % (search_string)
125 refFile = open(options.reference,'r')
126 for tn,fline in enumerate(refFile):
127 fields = fline.split('\t')
128 target_string = fields[refcol].rstrip('\r\n')
129 if options.ignore_case:
130 target_string = target_string.upper()
131 search = search_string if not options.reverse_find else target_string
132 target = target_string if not options.reverse_find else search_string
133 if options.debug:
134 print >> sys.stderr, "in: %s %s %s" % (search,search in target,target)
135 if search in target or test_reverse(search,target) or test_rcomplement(search,target):
136 found = True
137 if annotate:
138 annotation = options.annotation_col_sep.join([fields[i] for i in annotation_columns])
139 annotations.append(annotation)
140 else:
141 break
142 if found:
143 num_found += 1
144 if annotate:
145 line = '%s\t%s\n' % (line.rstrip('\r\n'),options.annotation_separator.join(annotations))
146 if options.keep == True:
147 if outFile:
148 outFile.write(line)
149 else:
150 if filteredFile:
151 filteredFile.write(line)
152 else:
153 num_novel += 1
154 if options.keep == True:
155 if filteredFile:
156 filteredFile.write(line)
157 else:
158 if outFile:
159 outFile.write(line)
160 except Exception, e:
161 print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference,e)
162 finally:
163 if refFile:
164 refFile.close()
165 print >> sys.stdout, "found: %d novel: %d" % (num_found,num_novel)
166 59
167 if __name__ == "__main__" : __main__() 60 def revcompl(seq):
61 return ''.join([COMP[B] for B in seq][::-1])
168 62
63 def test_rcomplement(seq, target):
64 if options.test_reverse_comp:
65 try:
66 comp = revcompl(seq)
67 return comp in target
68 except Exception:
69 pass
70 return False
71
72 def test_reverse(seq, target):
73 return options.test_reverse and seq and seq[::-1] in target
74
75 # Input files
76 if options.input is not None:
77 try:
78 inputPath = os.path.abspath(options.input)
79 inputFile = open(inputPath, 'r')
80 except Exception as e:
81 print("failed: %s" % e, file=sys.stderr)
82 exit(2)
83 else:
84 inputFile = sys.stdin
85 # Reference
86 if options.reference is None:
87 print("failed: reference file is required", file=sys.stderr)
88 exit(2)
89 # Output files
90 outFile = None
91 filteredFile = None
92 if options.filtered is None and options.output is None:
93 # write to stdout
94 outFile = sys.stdout
95 else:
96 if options.output is not None:
97 try:
98 outPath = os.path.abspath(options.output)
99 outFile = open(outPath, 'w')
100 except Exception as e:
101 print("failed: %s" % e, file=sys.stderr)
102 exit(3)
103 if options.filtered is not None:
104 try:
105 filteredPath = os.path.abspath(options.filtered)
106 filteredFile = open(filteredPath, 'w')
107 except Exception as e:
108 print("failed: %s" % e, file=sys.stderr)
109 exit(3)
110 incol = -1
111 if options.input_column and options.input_column > 0:
112 incol = int(options.input_column)-1
113 refcol = -1
114 if options.reference_column and options.reference_column > 0:
115 refcol = int(options.reference_column)-1
116 if options.annotation_columns:
117 annotate = True
118 annotation_columns = [int(x) - 1 for x in options.annotation_columns.split(', ')]
119 else:
120 annotate = False
121 refFile = None
122 num_found = 0
123 num_novel = 0
124 for ln, line in enumerate(inputFile):
125 annotations = []
126 try:
127 found = False
128 search_string = line.split('\t')[incol].rstrip('\r\n')
129 if options.ignore_case:
130 search_string = search_string.upper()
131 if options.debug:
132 print("search: %s" % (search_string), file=sys.stderr)
133 refFile = open(options.reference, 'r')
134 for tn, fline in enumerate(refFile):
135 fields = fline.split('\t')
136 target_string = fields[refcol].rstrip('\r\n')
137 if options.ignore_case:
138 target_string = target_string.upper()
139 search = search_string if not options.reverse_find else target_string
140 target = target_string if not options.reverse_find else search_string
141 if options.debug:
142 print("in: %s %s %s" % (search, search in target, target), file=sys.stderr)
143 if search in target or test_reverse(search, target) or test_rcomplement(search, target):
144 found = True
145 if annotate:
146 annotation = options.annotation_col_sep.join([fields[i] for i in annotation_columns])
147 annotations.append(annotation)
148 else:
149 break
150 if found:
151 num_found += 1
152 if annotate:
153 line = '%s\t%s\n' % (line.rstrip('\r\n'), options.annotation_separator.join(annotations))
154 if options.keep is True:
155 if outFile:
156 outFile.write(line)
157 else:
158 if filteredFile:
159 filteredFile.write(line)
160 else:
161 num_novel += 1
162 if options.keep is True:
163 if filteredFile:
164 filteredFile.write(line)
165 else:
166 if outFile:
167 outFile.write(line)
168 except Exception as e:
169 print("failed: Error reading %s - %s" % (options.reference, e), file=sys.stderr)
170 finally:
171 if refFile:
172 refFile.close()
173 print("found: %d novel: %d" % (num_found, num_novel), file=sys.stdout)
174
175
176 if __name__ == "__main__":
177 __main__()