annotate MEGABLAST_TAB_get_taxid_acc.py @ 1:fafa2c8d93d3 draft default tip

Upload wrapper
author p.lucas
date Wed, 13 Dec 2023 10:52:32 +0000
parents e2df7c111004
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
1 #!/usr/bin/env python
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
2 # -*- coding: utf-8 -*-
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
3 ###
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
4 # From a megablast file of results (tabular 25 columns) and taxid(s) user is interested in,
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
5 # provide 1 file with 2 columns: taxids acc
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
6 ###
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
7
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
8 ### Libraries to import:
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
9 # NOTE: Python 2.7 because needs krona env that MUST use 2.7 (last krona version load 2022 01 21)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
10 # NOTE: to update krona tax in conda env, run:
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
11 # ktUpdateTaxonomy.sh
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
12 # ktUpdateTaxonomy.sh --accessions (this one NOT PROVIDED IN DOCUMENTATION)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
13 import argparse, os, sys, warnings
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
14 # NEEDS to use krona conda environnement if access ktGetTaxIDFromAcc
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
15 from os import path
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
16
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
17 # to be able to report line number in error messages
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
18 import inspect
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
19 frame = inspect.currentframe()
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
20
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
21 # debug
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
22 b_test_creates_taxid_acc_f_from_megablast_res = False # ok 2022 01 21
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
23
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
24 prog_tag = '[' + os.path.basename(__file__) + ']'
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
25
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
26 krona_taxid_acc_f = ''
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
27 krona_tab_dir = ''
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
28
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
29 # taxid found under the taxid searched for
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
30 tax_in = []
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
31
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
32 # boolean to know if we dowload ncbi taxonomy file in current env
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
33 b_test_all = False
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
34 b_test = False
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
35
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
36 # min_nr_reads_by_accnr = 1
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
37
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
38 parser = argparse.ArgumentParser()
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
39 parser.add_argument("-r", "--megablast_tabular_f", dest='megablast_f',
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
40 help="megablast results tabular file (25 colums), including accession numbers in col 2",
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
41 metavar="FILE")
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
42 parser.add_argument("-o", "--tax_acc_out_f", dest='tax_acc_out_f',
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
43 help="Output text file with accession numbers from krona_taxid_acc_f NOT found under taxid in ncbi taxonomy tree",
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
44 metavar="FILE")
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
45 #parser.add_argument("-m", "--min_number_off_reads_by_acc_nr", dest='min_nr_reads_by_accnr',
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
46 # help="[Optional] minimal number of reads matching an accession number to take it into account (default:1)",
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
47 # metavar="INT")
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
48 parser.add_argument("-z", "--test_all", dest='b_test_all',
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
49 help="[Optional] run all tests",
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
50 action='store_true')
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
51 parser.set_defaults(b_load_ncbi_tax_f=False)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
52 parser.set_defaults(b_test_all=False)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
53 parser.set_defaults(min_nr_reads_by_accnr=1)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
54
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
55 args = parser.parse_args()
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
56
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
57 # -------------------------------------------
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
58 # check arguments
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
59 b_test_all = args.b_test_all
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
60
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
61 if b_test_all:
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
62 b_test_creates_taxid_acc_f_from_megablast_res = True
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
63 b_test = True
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
64 else:
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
65 b_test = b_test_creates_taxid_acc_f_from_megablast_res
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
66
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
67 if ((not b_test)and
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
68 ((len(sys.argv) < 5) or (len(sys.argv) > 5))):
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
69 print("\n".join(["To use this scripts, install first MEGABLAST_TAB_get_acc_under_taxid_in_out.yaml conda env. Then run:",
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
70 "conda activate MEGABLAST_TAB_get_taxid_acc",
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
71 "ktUpdateTaxonomy.sh",
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
72 "ktUpdateTaxonomy.sh --accessions",
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
73 "\n\n",
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
74 "Example: "+ ' '.join(['./MEGABLAST_TAB_get_taxid_acc.py',
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
75 '-r megablast_out_f_25clmn.tsv',
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
76 '-o megablast_out_f_taxid_acc.tsv']),"\n\n" ]))
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
77 parser.print_help()
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
78 print(prog_tag + "[Error] we found "+str(len(sys.argv)) +
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
79 " arguments, exit line "+str(frame.f_lineno))
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
80 sys.exit(0)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
81
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
82 # print('args:', args)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
83 if(not b_test):
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
84 if args.megablast_f is not None:
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
85 # get absolute path in case of files
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
86 megablast_f = os.path.abspath(args.megablast_f)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
87 else:
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
88 sys.exit("[Error] You must provide megablast_f")
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
89 if args.tax_acc_out_f is not None:
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
90 tax_acc_out_f = os.path.abspath(args.tax_acc_out_f)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
91 # if min_nr_reads_by_accnr is not None:
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
92 # min_nr_reads_by_accnr = args.min_nr_reads_by_accnr
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
93
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
94 # ------------------------------------------------------------------------
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
95 # from a megablast tsv output file with 25 columns, return a file with only
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
96 # taxid acc
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
97 # output file name provided as parameter
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
98 # ------------------------------------------------------------------------
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
99 krona_taxid_acc_f = ''
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
100 def creates_taxid_acc_f_from_megablast_res(megablast_f, tax_acc_out_f):
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
101 acc_col_nb_in_megablast_res = str(2)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
102 # krona_taxdb_f = os.path.expanduser('~/miniconda3/envs/krona/opt/krona/taxonomy/') # krona['taxdb'] # "/nfs/data/db/tax_krona/"
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
103 krona_taxdb_f = os.path.expanduser('/db/krona/') # krona['taxdb'] # "/nfs/data/db/tax_krona/"
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
104 if not os.path.isfile(krona_taxdb_f + 'all.accession2taxid.sorted'):
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
105 sys.exit(prog_tag + "[Error] missing "+krona_taxdb_f+" file, please run 'ktUpdateTaxonomy.sh --accessions' in your krona conda environment (and 'ktUpdateTaxonomy.sh' before if you have not done)")
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
106
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
107 # conda: "../envs/krona.yaml"
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
108 cmd = ' '.join(["cut -f", acc_col_nb_in_megablast_res, megablast_f,
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
109 '| ktGetTaxIDFromAcc -tax ',krona_taxdb_f,' -p ',
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
110 # '| uniq ', # remove many redundant lines # DO NOT USE because need exact number of each acc nr
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
111 '> ',tax_acc_out_f]) # return lines:taxid acc
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
112 # print("cmd:"+cmd)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
113 os.system(cmd)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
114 print(prog_tag + ' ' + tax_acc_out_f + " file created")
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
115
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
116 # test creates_taxid_acc_f_from_megablast_res function
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
117 # display created file and header
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
118 if b_test_creates_taxid_acc_f_from_megablast_res:
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
119 megablast_f = "megablast_out_f_25clmn.tsv"
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
120 print(prog_tag + " START b_test_creates_taxid_acc_f_from_megablast_res")
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
121 print(prog_tag + " loading "+megablast_f+" file")
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
122 tax_acc_out_f = 'megablast_out_f_taxid_acc.tsv'
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
123 creates_taxid_acc_f_from_megablast_res(megablast_f, tax_acc_out_f)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
124 if os.path.isfile(tax_acc_out_f):
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
125 print(prog_tag + " " + tax_acc_out_f + " file created, start with:")
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
126 cmd = "head " + tax_acc_out_f
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
127 print(os.system(cmd))
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
128 else:
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
129 sys.exit(prog_tag + "[Error] creates_taxid_acc_f_from_megablast_res has not created file "+tax_acc_out_f)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
130 print("END b_test_creates_taxid_acc_f_from_megablast_res")
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
131 sys.exit("Exit program after test")
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
132
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
133 # --------------------------------------------------------------------------
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
134
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
135 # --------------------------------------------------------------------------
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
136 # MAIN
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
137 # --------------------------------------------------------------------------
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
138 ##### MAIN
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
139 def __main__():
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
140 # creates taxid acc file from megablast result
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
141 creates_taxid_acc_f_from_megablast_res(megablast_f, tax_acc_out_f)
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
142
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
143 #### MAIN END
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
144 if __name__ == "__main__": __main__()
e2df7c111004 Upload python script
p.lucas
parents:
diff changeset
145