0
|
1 #!/usr/bin/env python
|
|
2 # -*- coding: utf-8 -*-
|
|
3 ###
|
|
4 # From a megablast file of results (tabular 25 columns) and taxid(s) user is interested in,
|
|
5 # provide 1 file with 2 columns: taxids acc
|
|
6 ###
|
|
7
|
|
8 ### Libraries to import:
|
|
9 # NOTE: Python 2.7 because needs krona env that MUST use 2.7 (last krona version load 2022 01 21)
|
|
10 # NOTE: to update krona tax in conda env, run:
|
|
11 # ktUpdateTaxonomy.sh
|
|
12 # ktUpdateTaxonomy.sh --accessions (this one NOT PROVIDED IN DOCUMENTATION)
|
|
13 import argparse, os, sys, warnings
|
|
14 # NEEDS to use krona conda environnement if access ktGetTaxIDFromAcc
|
|
15 from os import path
|
|
16
|
|
17 # to be able to report line number in error messages
|
|
18 import inspect
|
|
19 frame = inspect.currentframe()
|
|
20
|
|
21 # debug
|
|
22 b_test_creates_taxid_acc_f_from_megablast_res = False # ok 2022 01 21
|
|
23
|
|
24 prog_tag = '[' + os.path.basename(__file__) + ']'
|
|
25
|
|
26 krona_taxid_acc_f = ''
|
|
27 krona_tab_dir = ''
|
|
28
|
|
29 # taxid found under the taxid searched for
|
|
30 tax_in = []
|
|
31
|
|
32 # boolean to know if we dowload ncbi taxonomy file in current env
|
|
33 b_test_all = False
|
|
34 b_test = False
|
|
35
|
|
36 # min_nr_reads_by_accnr = 1
|
|
37
|
|
38 parser = argparse.ArgumentParser()
|
|
39 parser.add_argument("-r", "--megablast_tabular_f", dest='megablast_f',
|
|
40 help="megablast results tabular file (25 colums), including accession numbers in col 2",
|
|
41 metavar="FILE")
|
|
42 parser.add_argument("-o", "--tax_acc_out_f", dest='tax_acc_out_f',
|
|
43 help="Output text file with accession numbers from krona_taxid_acc_f NOT found under taxid in ncbi taxonomy tree",
|
|
44 metavar="FILE")
|
|
45 #parser.add_argument("-m", "--min_number_off_reads_by_acc_nr", dest='min_nr_reads_by_accnr',
|
|
46 # help="[Optional] minimal number of reads matching an accession number to take it into account (default:1)",
|
|
47 # metavar="INT")
|
|
48 parser.add_argument("-z", "--test_all", dest='b_test_all',
|
|
49 help="[Optional] run all tests",
|
|
50 action='store_true')
|
|
51 parser.set_defaults(b_load_ncbi_tax_f=False)
|
|
52 parser.set_defaults(b_test_all=False)
|
|
53 parser.set_defaults(min_nr_reads_by_accnr=1)
|
|
54
|
|
55 args = parser.parse_args()
|
|
56
|
|
57 # -------------------------------------------
|
|
58 # check arguments
|
|
59 b_test_all = args.b_test_all
|
|
60
|
|
61 if b_test_all:
|
|
62 b_test_creates_taxid_acc_f_from_megablast_res = True
|
|
63 b_test = True
|
|
64 else:
|
|
65 b_test = b_test_creates_taxid_acc_f_from_megablast_res
|
|
66
|
|
67 if ((not b_test)and
|
|
68 ((len(sys.argv) < 5) or (len(sys.argv) > 5))):
|
|
69 print("\n".join(["To use this scripts, install first MEGABLAST_TAB_get_acc_under_taxid_in_out.yaml conda env. Then run:",
|
|
70 "conda activate MEGABLAST_TAB_get_taxid_acc",
|
|
71 "ktUpdateTaxonomy.sh",
|
|
72 "ktUpdateTaxonomy.sh --accessions",
|
|
73 "\n\n",
|
|
74 "Example: "+ ' '.join(['./MEGABLAST_TAB_get_taxid_acc.py',
|
|
75 '-r megablast_out_f_25clmn.tsv',
|
|
76 '-o megablast_out_f_taxid_acc.tsv']),"\n\n" ]))
|
|
77 parser.print_help()
|
|
78 print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
|
|
79 " arguments, exit line "+str(frame.f_lineno))
|
|
80 sys.exit(0)
|
|
81
|
|
82 # print('args:', args)
|
|
83 if(not b_test):
|
|
84 if args.megablast_f is not None:
|
|
85 # get absolute path in case of files
|
|
86 megablast_f = os.path.abspath(args.megablast_f)
|
|
87 else:
|
|
88 sys.exit("[Error] You must provide megablast_f")
|
|
89 if args.tax_acc_out_f is not None:
|
|
90 tax_acc_out_f = os.path.abspath(args.tax_acc_out_f)
|
|
91 # if min_nr_reads_by_accnr is not None:
|
|
92 # min_nr_reads_by_accnr = args.min_nr_reads_by_accnr
|
|
93
|
|
94 # ------------------------------------------------------------------------
|
|
95 # from a megablast tsv output file with 25 columns, return a file with only
|
|
96 # taxid acc
|
|
97 # output file name provided as parameter
|
|
98 # ------------------------------------------------------------------------
|
|
99 krona_taxid_acc_f = ''
|
|
100 def creates_taxid_acc_f_from_megablast_res(megablast_f, tax_acc_out_f):
|
|
101 acc_col_nb_in_megablast_res = str(2)
|
|
102 # krona_taxdb_f = os.path.expanduser('~/miniconda3/envs/krona/opt/krona/taxonomy/') # krona['taxdb'] # "/nfs/data/db/tax_krona/"
|
|
103 krona_taxdb_f = os.path.expanduser('/db/krona/') # krona['taxdb'] # "/nfs/data/db/tax_krona/"
|
|
104 if not os.path.isfile(krona_taxdb_f + 'all.accession2taxid.sorted'):
|
|
105 sys.exit(prog_tag + "[Error] missing "+krona_taxdb_f+" file, please run 'ktUpdateTaxonomy.sh --accessions' in your krona conda environment (and 'ktUpdateTaxonomy.sh' before if you have not done)")
|
|
106
|
|
107 # conda: "../envs/krona.yaml"
|
|
108 cmd = ' '.join(["cut -f", acc_col_nb_in_megablast_res, megablast_f,
|
|
109 '| ktGetTaxIDFromAcc -tax ',krona_taxdb_f,' -p ',
|
|
110 # '| uniq ', # remove many redundant lines # DO NOT USE because need exact number of each acc nr
|
|
111 '> ',tax_acc_out_f]) # return lines:taxid acc
|
|
112 # print("cmd:"+cmd)
|
|
113 os.system(cmd)
|
|
114 print(prog_tag + ' ' + tax_acc_out_f + " file created")
|
|
115
|
|
116 # test creates_taxid_acc_f_from_megablast_res function
|
|
117 # display created file and header
|
|
118 if b_test_creates_taxid_acc_f_from_megablast_res:
|
|
119 megablast_f = "megablast_out_f_25clmn.tsv"
|
|
120 print(prog_tag + " START b_test_creates_taxid_acc_f_from_megablast_res")
|
|
121 print(prog_tag + " loading "+megablast_f+" file")
|
|
122 tax_acc_out_f = 'megablast_out_f_taxid_acc.tsv'
|
|
123 creates_taxid_acc_f_from_megablast_res(megablast_f, tax_acc_out_f)
|
|
124 if os.path.isfile(tax_acc_out_f):
|
|
125 print(prog_tag + " " + tax_acc_out_f + " file created, start with:")
|
|
126 cmd = "head " + tax_acc_out_f
|
|
127 print(os.system(cmd))
|
|
128 else:
|
|
129 sys.exit(prog_tag + "[Error] creates_taxid_acc_f_from_megablast_res has not created file "+tax_acc_out_f)
|
|
130 print("END b_test_creates_taxid_acc_f_from_megablast_res")
|
|
131 sys.exit("Exit program after test")
|
|
132
|
|
133 # --------------------------------------------------------------------------
|
|
134
|
|
135 # --------------------------------------------------------------------------
|
|
136 # MAIN
|
|
137 # --------------------------------------------------------------------------
|
|
138 ##### MAIN
|
|
139 def __main__():
|
|
140 # creates taxid acc file from megablast result
|
|
141 creates_taxid_acc_f_from_megablast_res(megablast_f, tax_acc_out_f)
|
|
142
|
|
143 #### MAIN END
|
|
144 if __name__ == "__main__": __main__()
|
|
145
|