annotate query.py @ 20:1dc3f0c61817 draft

Uploaded 20190304
author fabio
date Mon, 04 Mar 2019 09:14:04 -0500
parents 7f712cc0d3d5
children c619ad82600e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
19
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
1 #!/usr/bin/env python
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
2
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
3 import sys, os, optparse, shutil
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
4
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
5 __version__ = "1.0.0"
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
6 VALID_CHARS = '.-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
7 # in the case of collections, exitcodes equal to 0 and 1 are not considered errors
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
8 ERR_EXIT_CODE = 2
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
9 OK_EXIT_CODE = 0
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
10
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
def printLog( logfilepath, message, exitcode=OK_EXIT_CODE, exit=False ):
    """Echo a message on stdout and append it to the log file.

    logfilepath -- path of the log file; it is opened in append mode, so
                   it is created when missing and never truncated
    message     -- text to report; a newline is added after it in the file
    exitcode    -- status handed to sys.exit() when exit is True
    exit        -- when True, terminate the process once the message has
                   been written (sys.exit raises SystemExit)
    """
    # The parenthesised single-argument form is valid in Python 2 and 3,
    # unlike the bare "print message" statement.
    print( message )
    # The log is only ever written here, so plain append mode is enough.
    with open( logfilepath, 'a' ) as out:
        out.write( message + '\n' )
    if exit:
        sys.exit( exitcode )
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
17
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
def querySBT( options, args ):
    """Query the tree of a collection with the sequences of the input files.

    Steps, all reported through printLog():
      1. locate the 'howde' and 'leafnames' entries among the collection
         files (options.treen / options.treep, comma separated);
      2. count the non-empty lines of the leafnames file; the collection is
         rejected when either file is missing or the count is zero;
      3. copy the tree definition to 'howde.txt' and every collection file
         whose name ends with 'detbrief.rrr' to '<name>.bf' in the current
         working directory;
      4. build 'queries.fa' from the tab separated (id, sequence) lines of
         the files listed in options.files;
      5. run 'howdesbt query' and split 'answer.txt' into one
         '<query>_txt' file per query inside options.outputdir.

    options -- parsed command line options (outputdir, outfile, treep,
               treen, files, names, threshold, sort)
    args    -- positional arguments; not used

    The process is terminated with ERR_EXIT_CODE (through printLog) when
    the collection does not hold a valid tree or when the query did not
    produce its result file.
    """
    output_dir_path = options.outputdir
    outlogfile = options.outfile

    # A missing --treep/--treen is handled like an empty collection so that
    # the "no valid tree" error below is reported instead of a crash.
    tree_file_paths = ( options.treep or '' ).split( ',' )
    tree_file_names = ( options.treen or '' ).split( ',' )
    tree_def_filepath = None
    leafnames_filepath = None
    for idx, tree_file_name in enumerate( tree_file_names ):
        if tree_file_name == 'howde':
            tree_def_filepath = tree_file_paths[ idx ]
        elif tree_file_name == 'leafnames':
            leafnames_filepath = tree_file_paths[ idx ]
        if tree_def_filepath is not None and leafnames_filepath is not None:
            break

    if tree_def_filepath is None or leafnames_filepath is None:
        printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True )
        return  # not reached: printLog exits when exit=True

    # number of non-empty lines in the leafnames file
    leafnames_counter = 0
    with open( leafnames_filepath ) as leafnames_file:
        for line in leafnames_file:
            if line.strip():
                leafnames_counter += 1
    if leafnames_counter == 0:
        printLog( outlogfile, 'The selected collection does not contain a valid tree', exitcode=ERR_EXIT_CODE, exit=True )
        return  # not reached: printLog exits when exit=True

    printLog( outlogfile, 'The selected collection contains a valid tree' )
    # work on local copies of the tree definition and of the filters
    shutil.copyfile( tree_def_filepath, 'howde.txt' )
    tree_def_filepath = 'howde.txt'
    for idx, tree_file_name in enumerate( tree_file_names ):
        if tree_file_name.endswith( 'detbrief.rrr' ):
            shutil.copyfile( tree_file_paths[ idx ], tree_file_name + '.bf' )

    printLog( outlogfile, 'Creating batch of queries' )
    # create tmp batch file; the context manager closes it even when one of
    # the input files cannot be read
    batch_file_name = 'queries.fa'
    with open( batch_file_name, 'w' ) as batch_file:
        comma_sep_file_paths = options.files
        # check if options.files contains at least one file path
        if comma_sep_file_paths is not None:
            file_paths = comma_sep_file_paths.split( "," )
            file_names = options.names.split( "," )
            for idx, file_path in enumerate( file_paths ):
                fixed_file_name = ''.join( c for c in file_names[ idx ] if c in VALID_CHARS )
                printLog( outlogfile, '> processing file ' + file_names[ idx ] + ' ( fixed_name=\"' + fixed_file_name + '\" ) ' )
                with open( file_path, 'r' ) as content_file:
                    for line in content_file:
                        line = line.strip()
                        if not line:
                            continue
                        line_split = line.split( "\t" )  # split on tab
                        if len( line_split ) != 2:  # 0:id , 1:seq , otherwise skip line
                            continue
                        original_seq_id, seq_text = line_split
                        # fix seq_id using valid chars only
                        seq_id = ''.join( c for c in original_seq_id if c in VALID_CHARS )
                        printLog( outlogfile, '> sequence ' + original_seq_id + ' ( fixed_name=\"' + seq_id + '\" )' )
                        # write on batch file
                        batch_file.write( '> ' + fixed_file_name + '_' + seq_id + '\n' + seq_text + '\n' )

    # query the tree
    printLog( outlogfile, 'Querying the tree' )
    query_res_file_path = os.path.abspath( 'answer.txt' )
    query_command = ( 'howdesbt query --tree=' + os.path.abspath( tree_def_filepath )
                      + ' ' + os.path.abspath( batch_file_name ) + '=' + str( options.threshold )
                      + ' --out=' + query_res_file_path )
    # The sort flag has to be part of the command line itself; appending it
    # to the value returned by os.system() never reached howdesbt and raised
    # a TypeError (int + str).
    if options.sort != 0:
        query_command += ' --sort'
    query_exitcode = os.system( query_command )

    if query_exitcode > 0:
        # The whole batch is queried at once, so the failure cannot be
        # attributed to a single sequence or input file.
        printLog( outlogfile, '> ERROR: an error has occurred while querying the tree ( exit status ' + str( query_exitcode ) + ' )' )
        return

    if not os.path.exists( query_res_file_path ):
        printLog( outlogfile, 'An error has occurred while querying the tree', exitcode=ERR_EXIT_CODE, exit=True )
        return  # not reached: printLog exits when exit=True

    with open( query_res_file_path ) as query_res_file:
        file_path = ''
        theta_matches = 0
        for line in query_res_file:
            line = line.strip()
            if not line:
                continue
            if line.startswith( '*' ):
                # header line: '*<query name> <number of matches>'
                line_split = line.split( ' ' )
                theta_matches = int( line_split[ 1 ] )
                file_name = line_split[ 0 ].replace( '*', '' )
                file_path = os.path.join( output_dir_path, file_name + '_txt' )
                # opening in append mode and closing right away makes sure
                # the output file exists even if no result line follows
                open( file_path, 'a' ).close()
            elif file_path:
                # result line of the current query; lines found before the
                # first header have no output file and are skipped
                fraction = str( theta_matches ) + '/' + str( leafnames_counter )
                score = format( round( float( theta_matches ) / float( leafnames_counter ) , 6 ), '6f' )
                with open( file_path, 'a' ) as res_file:
                    res_file.write( line + '\t' + fraction + '\t' + score + '\n' )
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
113
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
def __main__():
    """Command line entry point.

    Parses the options, then either prints the script version (--version)
    or creates the output directory when it does not exist and hands the
    parsed options over to querySBT().
    """
    # Parse the command line options
    # The usage text lists the options that are actually defined below.
    usage = ( "Usage: query.py --files comma_sep_file_paths --names comma_seq_file_names "
              "--treep comma_sep_tree_file_paths --treen comma_sep_tree_file_names "
              "--threshold threshold --sort sort_flag "
              "--outputdir output_dir_path --outfile log_file_path" )
    parser = optparse.OptionParser(usage = usage)
    parser.add_option("-v", "--version", action="store_true", dest="version",
                    default=False, help="display version and exit")
    parser.add_option("-f", "--files", type="string",
                    action="store", dest="files", help="comma separated files path")
    parser.add_option("-n", "--names", type="string",
                    action="store", dest="names", help="comma separated names associated to the files specified in --files")
    parser.add_option("-k", "--treep", type="string",
                    action="store", dest="treep", help="paths of files in collection")
    parser.add_option("-m", "--treen", type="string",
                    action="store", dest="treen", help="names of files in collection")
    parser.add_option("-t", "--threshold", type="float", default=0.7,
                    action="store", dest="threshold", help="search threshold")
    parser.add_option("-s", "--sort", type="int", default=1,
                    action="store", dest="sort", help="sort results")
    parser.add_option("-o", "--outputdir", type="string", default="output",
                    action="store", dest="outputdir", help="output directory (collection) path")
    parser.add_option("-r", "--outfile", type="string", default="query.txt",
                    action="store", dest="outfile", help="output log file path")

    (options, args) = parser.parse_args()
    if options.version:
        # parenthesised single-argument form: valid in Python 2 and 3
        print( __version__ )
        return

    # create output dir (collection)
    output_dir_path = options.outputdir
    if not os.path.exists(output_dir_path):
        os.makedirs(output_dir_path)

    querySBT( options, args )
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
147
7f712cc0d3d5 Uploaded 20190304.2
fabio
parents:
diff changeset
# Run the command line entry point only when executed as a script.
if __name__ == "__main__":
    __main__()