annotate predict.py @ 0:b856d3d95413 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
author iuc
date Mon, 09 Jan 2023 13:27:09 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
1 #!/usr/bin/env python
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
2 # -*- coding: utf-8 -*-
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
3 # Credits: Grigorii Sukhorukov, Macha Nikolski
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
4 import argparse
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
5 import os
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
6 from pathlib import Path
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
7
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
8 import numpy as np
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
9 import pandas as pd
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
10 from Bio import SeqIO
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
11 from models import model_10
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
12 from utils import preprocess as pp
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
13
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
# Process-wide settings read by TensorFlow / CUDA.
# TF_CPP_MIN_LOG_LEVEL controls how much of the native TensorFlow log is shown:
#   0 - everything is printed
#   1 - INFO messages are suppressed
#   2 - INFO and WARNING messages are suppressed
#   3 - nothing is printed
os.environ.update({
    # An empty device list exposes no GPU, so inference runs on CPU.
    "CUDA_VISIBLE_DEVICES": "",
    # Ask XLA to JIT-compile on CPU.
    "TF_XLA_FLAGS": "--tf_xla_cpu_global_jit",
    "TF_CPP_MIN_LOG_LEVEL": "3",
})
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
22
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
23
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def predict_nn(ds_path, nn_weights_path, length, batch_size=256):
    """
    Breaks down contigs into fragments
    and uses pretrained neural networks to give predictions for fragments

    ds_path: path to the fasta file with contigs
    nn_weights_path: folder containing the weights file model_{length}.h5
    length: fragment length; also selects the model and its weights file
    batch_size: batch size passed to model.predict

    Returns a pandas DataFrame with one row per fragment and the columns
    id, length, fragment, pred_vir, pred_other, NN_decision.

    Raises FileNotFoundError if ds_path does not exist,
    ValueError if no sequence was read from ds_path.
    """
    try:
        seqs_ = list(SeqIO.parse(ds_path, "fasta"))
    except FileNotFoundError as err:
        # Keep the original cause attached and report which path was missing.
        raise FileNotFoundError(
            f"test dataset was not found. Change ds variable (path: {ds_path})"
        ) from err

    if not seqs_:
        # NOTE(review): this fires when the fasta file yields no records at all;
        # the message presumably assumes length filtering done by the caller -
        # confirm before rewording it.
        raise ValueError("All sequences were smaller than length of the model")

    out_table = {
        "id": [],
        "length": [],
        "fragment": [],
        "pred_vir": [],
        "pred_other": [],
    }
    test_fragments = []
    test_fragments_rc = []
    for seq in seqs_:
        # Fragment one contig at a time so every fragment can be traced back
        # to its contig id and its index within that contig.
        fragments_, fragments_rc, _ = pp.fragmenting(
            [seq],
            length,
            max_gap=0.8,
            sl_wind_step=int(length / 2),
        )
        test_fragments.extend(fragments_)
        test_fragments_rc.extend(fragments_rc)
        n_fragments = len(fragments_)
        out_table["id"].extend([seq.id] * n_fragments)
        out_table["length"].extend([len(seq.seq)] * n_fragments)
        out_table["fragment"].extend(range(n_fragments))

    test_encoded = pp.one_hot_encode(test_fragments)
    test_encoded_rc = pp.one_hot_encode(test_fragments_rc)
    model = model_10.model(length)
    model.load_weights(Path(nn_weights_path, f"model_{length}.h5"))
    # The model takes the forward and reverse-complement encodings as two inputs.
    prediction = model.predict([test_encoded, test_encoded_rc], batch_size)
    # Last axis of the prediction: index 1 is stored as virus, index 0 as other.
    out_table['pred_vir'].extend(list(prediction[..., 1]))
    out_table['pred_other'].extend(list(prediction[..., 0]))
    print('Exporting predictions to csv file')
    df = pd.DataFrame(out_table)
    df['NN_decision'] = np.where(df['pred_vir'] > df['pred_other'], 'virus', 'other')
    return df
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
70
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
71
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def predict_test(ds_path, length):
    """
    Breaks down contigs into fragments
    and gives 1 as prediction to all fragments
    use only for testing!

    ds_path: path to the fasta file with contigs
    length: fragment length used for fragmenting

    Returns a pandas DataFrame with one row per fragment and the columns
    id, length, fragment, pred_vir (always 1), pred_other (always 0),
    NN_decision (always 'virus').

    Raises FileNotFoundError if ds_path does not exist,
    ValueError if no sequence was read from ds_path.
    """
    try:
        seqs_ = list(SeqIO.parse(ds_path, "fasta"))
    except FileNotFoundError as err:
        # Keep the original cause attached and report which path was missing.
        raise FileNotFoundError(
            f"test dataset was not found. Change ds variable (path: {ds_path})"
        ) from err

    if not seqs_:
        # NOTE(review): this fires when the fasta file yields no records at all;
        # the message presumably assumes length filtering done by the caller -
        # confirm before rewording it.
        raise ValueError("All sequences were smaller than length of the model")

    out_table = {
        "id": [],
        "length": [],
        "fragment": [],
    }
    for seq in seqs_:
        # Only the number of fragments per contig is needed here; the
        # reverse-complement fragments are not used because no model is run.
        fragments_, _, _ = pp.fragmenting(
            [seq],
            length,
            max_gap=0.8,
            sl_wind_step=int(length / 2),
        )
        n_fragments = len(fragments_)
        out_table["id"].extend([seq.id] * n_fragments)
        out_table["length"].extend([len(seq.seq)] * n_fragments)
        out_table["fragment"].extend(range(n_fragments))

    print('Exporting predictions to tsv file')
    df = pd.DataFrame(out_table)
    # Constant placeholder predictions: every fragment is labelled as virus.
    df['pred_vir'] = 1
    df['pred_other'] = 0
    df['NN_decision'] = 'virus'
    return df
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
108
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
109
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def predict_contigs(df):
    """
    Based on the per-fragment decisions in the NN_decision column
    gives a final prediction for the whole contig

    df: DataFrame with one row per fragment and at least the columns
        id, length and NN_decision ('virus' or 'other')

    Returns a new DataFrame with one row per (id, length) pair and the columns
    length, id, '# viral fragments', '# other fragments', decision,
    '# viral / # total', '# viral / # total * length',
    sorted by the last column in descending order.
    A contig is called 'virus' when it has at least as many viral
    fragments as other fragments (ties go to 'virus').
    """
    count_columns = ['virus', 'other']
    counts = (
        df.groupby(["id", "length", 'NN_decision'], sort=False)
        .size()
        .unstack(fill_value=0)
        .reset_index()
        # fill_value=0 keeps the counts integer even when one of the two
        # classes never occurs in the input (a NaN fill would turn the
        # column into floats).
        .reindex(columns=['length', 'id'] + count_columns, fill_value=0)
        .astype({name: 'int64' for name in count_columns})
    )
    counts['decision'] = np.where(counts['virus'] >= counts['other'], 'virus', 'other')
    # This first ordering is kept so that rows with equal final scores come
    # out in the same relative order as before.
    counts = counts.sort_values(by='length', ascending=False)
    result = counts.loc[:, ['length', 'id', 'virus', 'other', 'decision']].rename(
        columns={
            'virus': '# viral fragments',
            'other': '# other fragments',
        }
    )
    total = result['# viral fragments'] + result['# other fragments']
    result['# viral / # total'] = (result['# viral fragments'] / total).round(3)
    result['# viral / # total * length'] = result['# viral / # total'] * result['length']
    return result.sort_values(by='# viral / # total * length', ascending=False)
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
141
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
142
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def _select_by_length(df_500, df_1000, limit=0, threshold=1500):
    """Merge the predictions of the two models, one model per length range.

    Rows of ``df_500`` are kept when ``limit <= length < threshold`` and rows
    of ``df_1000`` are kept when ``length >= threshold``. The rows taken from
    ``df_1000`` come first in the result and the index is renumbered.
    """
    short_rows = df_500[(df_500['length'] >= limit) & (df_500['length'] < threshold)]
    long_rows = df_1000[df_1000['length'] >= threshold]
    return pd.concat([long_rows, short_rows], ignore_index=True)


def predict(test_ds, weights, out_path, return_viral=True):
    """Filter out contaminant contigs from the fasta file.

    For every input file the predictions are made for fragment lengths 500
    and 1000, merged by sequence length and written to ``out_path`` as
    ``predicted_fragments.tsv`` (per fragment) and ``predicted.tsv``
    (per contig). When several input files are given, each output name is
    prefixed with ``<input file stem>_`` so that results of one dataset do
    not overwrite those of another; with a single input the names are
    unchanged.

    test_ds: path to the input file with
        contigs in fasta format (str or list of str)
    weights: path to the folder containing weights
        for NN and RF modules trained on 500 and 1000 fragment lengths (str)
    out_path: path to the folder to store predictions (str);
        created if it does not exist
    return_viral: whether to return contigs annotated as
        viral in separate fasta file ``viral.fasta`` (True/False)

    Raises:
        ValueError: if test_ds is neither a str nor a list, or is an
            empty list.
        FileNotFoundError: if any of the input files does not exist.
    """
    if isinstance(test_ds, str):
        test_ds = [test_ds]
    elif not isinstance(test_ds, list):
        raise ValueError('test_ds was incorrectly assigned in the config file')
    if not test_ds:
        raise ValueError('test_ds is empty: at least one input file is required')

    # Validate every input up front with real exceptions: ``assert`` is
    # removed under ``python -O`` and a missing later file would otherwise
    # only be detected after the earlier datasets had been processed.
    for ts in test_ds:
        if not Path(ts).exists():
            raise FileNotFoundError(f'{ts} does not exist')

    # Lower bound on the 'length' value of rows kept from the 500 model.
    limit = 0
    Path(out_path).mkdir(parents=True, exist_ok=True)

    # Parameter to activate the test function. Only for debugging on github:
    # the test function is used when 'model_1000.h5' is absent from `weights`.
    use_test_f = not Path(weights, 'model_1000.h5').exists()
    several_inputs = len(test_ds) > 1
    for ts in test_ds:
        # Keep the historical file names for a single input; disambiguate
        # only when more than one dataset shares the output folder.
        prefix = f'{Path(ts).stem}_' if several_inputs else ''
        dfs_fr = []
        dfs_cont = []
        for l_ in 500, 1000:
            print(f'starting prediction for {Path(ts).name} '
                  f'for fragment length {l_}')
            if use_test_f:
                df = predict_test(ds_path=ts, length=l_, )
            else:
                df = predict_nn(
                    ds_path=ts,
                    nn_weights_path=weights,
                    length=l_,
                )
            df = df.round(3)
            dfs_fr.append(df)
            dfs_cont.append(predict_contigs(df))
            print('prediction finished')

        df_fragments = _select_by_length(dfs_fr[0], dfs_fr[1], limit)
        pred_fr = Path(out_path, f"{prefix}predicted_fragments.tsv")
        df_fragments.to_csv(pred_fr, sep='\t')

        df_contigs = _select_by_length(dfs_cont[0], dfs_cont[1], limit)
        pred_contigs = Path(out_path, f"{prefix}predicted.tsv")
        df_contigs.to_csv(pred_contigs, sep='\t')

        if return_viral:
            # A set gives O(1) membership tests while streaming the fasta.
            viral_ids = set(df_contigs.loc[df_contigs["decision"] == "virus", "id"])
            viral_seqs = (
                s_ for s_ in SeqIO.parse(ts, "fasta") if s_.id in viral_ids
            )
            SeqIO.write(
                viral_seqs,
                Path(out_path, f"{prefix}viral.fasta"),
                'fasta',
            )
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
213
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
214
b856d3d95413 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontaminator commit 3f8e87001f3dfe7d005d0765aeaa930225c93b72
iuc
parents:
diff changeset
def parse_bool_arg(value):
    """Convert a command-line value such as "True" or "False" into a bool.

    argparse hands option values over as strings, and any non-empty string
    (including "False") is truthy, so the text has to be interpreted
    explicitly. Matching is case-insensitive and ignores surrounding
    whitespace.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised
            boolean spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(
        f'expected True or False, got {value!r}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # The three paths are mandatory: without them predict() cannot run, so
    # let argparse report a clear usage error instead of failing later.
    parser.add_argument("--test_ds", required=True,
                        help="path to the input "
                             "file with contigs "
                             "in fasta format "
                             "(str or list of str)")
    parser.add_argument("--weights", required=True,
                        help="path to the folder containing "
                             "weights for NN and RF modules "
                             "trained on 500 and 1000 "
                             "fragment lengths (str)")
    parser.add_argument("--out_path", required=True,
                        help="path to the folder to store "
                             "predictions (str)")
    # Defaults to True, matching the default of predict(return_viral=True).
    parser.add_argument("--return_viral", type=parse_bool_arg, default=True,
                        help="whether to return "
                             "contigs annotated "
                             "as viral in separate "
                             "fasta file (True/False)")

    args = parser.parse_args()
    predict(args.test_ds, args.weights, args.out_path, args.return_viral)