Next changeset 1:9b12bc1b1e2c (2022-11-30) |
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/VirHunter commit 628688c1302dbf972e48806d2a5bafe27847bdcc |
added:
README.txt macros.xml models/__init__.py models/model_10.py models/model_5.py models/model_7.py predict.py test-data/predicted.csv test-data/predicted_fragments.csv test-data/viral.fasta test-data/virhunter.loc test-data/viruses.fasta test-data/weights/test/RF_1000.joblib test-data/weights/test/RF_500.joblib test-data/weights/test/model_5_1000.h5 test-data/weights/test/model_5_500.h5 test-data/weights/test/model_7_1000.h5 test-data/weights/test/model_7_500.h5 tool-data/virhunter.loc tool-data/virhunter.loc.sample tool-data/weights/test/RF_1000.joblib tool-data/weights/test/RF_500.joblib tool-data/weights/test/model_5_1000.h5 tool-data/weights/test/model_5_500.h5 tool-data/weights/test/model_7_1000.h5 tool-data/weights/test/model_7_500.h5 tool_data_table_conf.xml.sample tool_data_table_conf.xml.test utils/__init__.py utils/batch_loader.py utils/preprocess.py virhunter.xml virhunter.yml |
b |
diff -r 000000000000 -r 457fd8fd681a README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.txt Wed Nov 09 12:19:26 2022 +0000 |
[ |
@@ -0,0 +1,90 @@ +# VirHunter + +VirHunter is a deep learning method that uses Convolutional Neural Networks (CNNs) and a Random Forest Classifier to identify viruses in sequencing datasets. More precisely, VirHunter classifies previously assembled contigs as viral, host and bacterial (contamination). + +## System Requirements +VirHunter installation requires a Unix environment with [python 3.8](http://www.python.org/). +It was tested on Linux and macOS operating systems. +For now, VirHunter is still not fully compatible with M1 chip MacBook. + +In order to run VirHunter you need to have git and conda already installed. +If you are installing conda for the first time, we suggest that you use +a lightweight [miniconda](https://docs.conda.io/en/latest/miniconda.html). +Otherwise, you can use pip for the dependencies' installation. + +## Installation + +To install VirHunter, you need to download it from github and then to install the dependencies. + +First, clone the repository from [github](https://github.com/cbib/virhunter) + +git clone https://github.com/cbib/virhunter.git + +Go to the VirHunter root folder + +cd virhunter/ + +### Installing dependencies with Conda + +First, you have to create the environment from the envs/environment.yml file. +The installation may take around 500 MB of drive space. + +conda env create -f envs/environment.yml + +Second, activate the environment: + +conda activate virhunter + +### Installing dependencies with pip + +If you don't have Conda installed in your system, you can install python dependencies via pip program: + +pip install -r envs/requirements.txt + +Then if you have macOS you will need to install wget library to run some scripts (Conda installation already has it). You can do this with brew package manager. + +brew install wget + +### Testing your installation of VirHunter + +You can test that VirHunter was successfully installed on the toy dataset we provide. 
+IMPORTANT: the toy dataset is intended only to test that VirHunter has been installed correctly and all the scripts can be executed. +These modules should not be used for prediction on your own datasets! + +First, you have to download the toy dataset + +bash scripts/download_test_installation.sh + +Then run the bash script that calls the testing, training and prediction python scripts of VirHunter. +Attention, the training process may take some time (up to an hour). + +bash scripts/test_installation.sh + + +## Using VirHunter for prediction + +To run VirHunter you can use the already pre-trained models or train VirHunter yourself (described in the next section). +Pre-trained model weights are already available for multiple host plants. +You can download them using the download_weights.sh script. + +bash scripts/download_weights.sh + +Once the config file is ready, you can start the prediction: + +python virhunter/predict.py --test_ds /path/to/test_ds_1 + +After prediction VirHunter produces two csv files and one optional fasta file: + +1. The first file ends with _predicted_fragments.csv +It is an intermediate result containing predictions of the three CNN networks (probabilities of belonging to each of the virus/plant/bacteria classes) and of the RF classifier for each fragment of every contig. + +2. The second file ends with _predicted.csv. +This file contains final predictions for contigs calculated from the previous file. + - id - fasta header of a contig. + - length - length of the contig. + - # viral fragments, # plant fragments and # bacterial fragments - the number of fragments of the contig that received the corresponding class prediction by the RF classifier. + - decision - class given by VirHunter to the contig. + - # viral / # total - number of viral fragments divided by the total number of fragments of the contig. + - # viral / # total * length - number of viral fragments divided by the total number of fragments of the contig multiplied by contig length. 
It is used to display the most relevant contigs first. + +3. The fasta file ends with _viral.fasta. It contains contigs that were predicted as viral by VirHunter. |
b |
diff -r 000000000000 -r 457fd8fd681a macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Wed Nov 09 12:19:26 2022 +0000 |
b |
@@ -0,0 +1,20 @@ +<macros> + <token name="@TOOL_VERSION@">1.0.0</token> + <token name="@VERSION_SUFFIX@">0</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="1.23.3">numpy</requirement> + <requirement type="package" version="1.79">biopython</requirement> + <requirement type="package" version="1.4.4">pandas</requirement> + <requirement type="package" version="1.1.0">joblib</requirement> + <requirement type="package" version="1.1.2">scikit-learn</requirement> + <requirement type="package" version="3.7.0">h5py</requirement> + <requirement type="package" version="2.9.1">tensorflow</requirement> + </requirements> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1038/s41467-019-12528-4</citation> + </citations> + </xml> +</macros> \ No newline at end of file |
b |
diff -r 000000000000 -r 457fd8fd681a models/model_10.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/models/model_10.py Wed Nov 09 12:19:26 2022 +0000 |
[ |
@@ -0,0 +1,28 @@ +from tensorflow.keras import layers, models + + +def launch(input_layer, hidden_layers): + output = input_layer + for hidden_layer in hidden_layers: + output = hidden_layer(output) + return output + + +def model(length, kernel_size=10, filters=512, dense_ns=512): + forward_input = layers.Input(shape=(length, 4)) + reverse_input = layers.Input(shape=(length, 4)) + hidden_layers = [ + layers.Conv1D(filters=filters, kernel_size=kernel_size), + layers.LeakyReLU(alpha=0.1), + layers.GlobalMaxPooling1D(), + layers.Dropout(0.1), + ] + forward_output = launch(forward_input, hidden_layers) + reverse_output = launch(reverse_input, hidden_layers) + output = layers.Concatenate()([forward_output, reverse_output]) + output = layers.Dense(dense_ns, activation='relu')(output) + output = layers.Dropout(0.1)(output) + output = layers.Dense(3, activation='softmax')(output) + model_ = models.Model(inputs=[forward_input, reverse_input], outputs=output) + model_.compile(optimizer="adam", loss='categorical_crossentropy', metrics='accuracy') + return model_ |
b |
diff -r 000000000000 -r 457fd8fd681a models/model_5.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/models/model_5.py Wed Nov 09 12:19:26 2022 +0000 |
[ |
@@ -0,0 +1,49 @@ +from tensorflow.keras import layers, models + + +def launch(input_layer, hidden_layers): + output = input_layer + for hidden_layer in hidden_layers: + output = hidden_layer(output) + return output + + +def model(length, kernel_size=5, filters=256, dense_ns=256): + forward_input = layers.Input(shape=(length, 4)) + reverse_input = layers.Input(shape=(length, 4)) + hidden_layers = [ + layers.Conv1D(filters=filters, kernel_size=kernel_size), + layers.LeakyReLU(alpha=0.1), + layers.GlobalMaxPooling1D(), + layers.Dropout(0.1), + ] + forward_output = launch(forward_input, hidden_layers) + reverse_output = launch(reverse_input, hidden_layers) + output = layers.Concatenate()([forward_output, reverse_output]) + output = layers.Dense(dense_ns, activation='relu')(output) + output = layers.Dropout(0.1)(output) + # output = layers.Dense(64, activation='relu')(output) + # output = layers.Dropout(0.1)(output) + output = layers.Dense(3, activation='softmax')(output) + model_ = models.Model(inputs=[forward_input, reverse_input], outputs=output) + model_.compile(optimizer="adam", loss='categorical_crossentropy', metrics='accuracy') + return model_ + + +# def model(length, kernel_size=5, filters=256, dense_ns=512): +# forward_input = layers.Input(shape=(length, 4)) +# reverse_input = layers.Input(shape=(length, 4)) +# hidden_layers = [ +# layers.Conv1D(filters=filters, kernel_size=kernel_size), +# layers.MaxPool1D(pool_size=50, strides=25), +# layers.LSTM(32), +# ] +# forward_output = launch(forward_input, hidden_layers) +# reverse_output = launch(reverse_input, hidden_layers) +# output = layers.Concatenate()([forward_output, reverse_output]) +# # output = layers.Dense(64, activation='relu')(output) +# output = layers.Dropout(0.1)(output) +# output = layers.Dense(3, activation='softmax')(output) +# model_ = models.Model(inputs=[forward_input, reverse_input], outputs=output) +# model_.compile(optimizer="adam", loss='categorical_crossentropy', metrics='accuracy') +# 
return model_ |
b |
diff -r 000000000000 -r 457fd8fd681a models/model_7.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/models/model_7.py Wed Nov 09 12:19:26 2022 +0000 |
[ |
@@ -0,0 +1,49 @@ +from tensorflow.keras import layers, models + + +def launch(input_layer, hidden_layers): + output = input_layer + for hidden_layer in hidden_layers: + output = hidden_layer(output) + return output + + +def model(length, kernel_size=7, filters=256, dense_ns=256): + forward_input = layers.Input(shape=(length, 4)) + reverse_input = layers.Input(shape=(length, 4)) + hidden_layers = [ + layers.Conv1D(filters=filters, kernel_size=kernel_size), + layers.LeakyReLU(alpha=0.1), + layers.GlobalMaxPooling1D(), + layers.Dropout(0.1), + ] + forward_output = launch(forward_input, hidden_layers) + reverse_output = launch(reverse_input, hidden_layers) + output = layers.Concatenate()([forward_output, reverse_output]) + output = layers.Dense(dense_ns, activation='relu')(output) + output = layers.Dropout(0.1)(output) + # output = layers.Dense(128, activation='relu')(output) + # output = layers.Dropout(0.1)(output) + output = layers.Dense(3, activation='softmax')(output) + model_ = models.Model(inputs=[forward_input, reverse_input], outputs=output) + model_.compile(optimizer="adam", loss='categorical_crossentropy', metrics='accuracy') + return model_ + + +# def model(length, kernel_size=7, filters=256, dense_ns=512): +# forward_input = layers.Input(shape=(length, 4)) +# reverse_input = layers.Input(shape=(length, 4)) +# hidden_layers = [ +# layers.Conv1D(filters=filters, kernel_size=kernel_size), +# layers.MaxPool1D(pool_size=50, strides=25), +# layers.LSTM(32), +# ] +# forward_output = launch(forward_input, hidden_layers) +# reverse_output = launch(reverse_input, hidden_layers) +# output = layers.Concatenate()([forward_output, reverse_output]) +# # output = layers.Dense(64, activation='relu')(output) +# output = layers.Dropout(0.1)(output) +# output = layers.Dense(3, activation='softmax')(output) +# model_ = models.Model(inputs=[forward_input, reverse_input], outputs=output) +# model_.compile(optimizer="adam", loss='categorical_crossentropy', metrics='accuracy') +# 
return model_ |
b |
diff -r 000000000000 -r 457fd8fd681a predict.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/predict.py Wed Nov 09 12:19:26 2022 +0000 |
[ |
b'@@ -0,0 +1,194 @@\n+#!/usr/bin/env python\n+# -*- coding: utf-8 -*-\n+# Credits: Grigorii Sukhorukov, Macha Nikolski\n+import argparse\n+import os\n+from pathlib import Path\n+\n+import numpy as np\n+import pandas as pd\n+from Bio import SeqIO\n+from joblib import load\n+from models import model_5, model_7\n+from utils import preprocess as pp\n+\n+os.environ["CUDA_VISIBLE_DEVICES"] = ""\n+os.environ["TF_XLA_FLAGS"] = "--tf_xla_cpu_global_jit"\n+# loglevel : 0 all printed, 1 I not printed, 2 I and W not printed, 3 nothing printed\n+os.environ[\'TF_CPP_MIN_LOG_LEVEL\'] = \'3\'\n+\n+\n+def predict_nn(ds_path, nn_weights_path, length, batch_size=256):\n+ """\n+ Breaks down contigs into fragments\n+ and uses pretrained neural networks to give predictions for fragments\n+ """\n+ try:\n+ seqs_ = list(SeqIO.parse(ds_path, "fasta"))\n+ except FileNotFoundError:\n+ raise Exception("test dataset was not found. Change ds variable")\n+ out_table = {\n+ "id": [],\n+ "length": [],\n+ "fragment": [],\n+ "pred_plant_5": [],\n+ "pred_vir_5": [],\n+ "pred_bact_5": [],\n+ "pred_plant_7": [],\n+ "pred_vir_7": [],\n+ "pred_bact_7": [],\n+ # "pred_plant_10": [],\n+ # "pred_vir_10": [],\n+ # "pred_bact_10": [],\n+ }\n+ if not seqs_:\n+ raise ValueError("All sequences were smaller than length of the model")\n+ test_fragments = []\n+ test_fragments_rc = []\n+ for seq in seqs_:\n+ fragments_, fragments_rc, _ = pp.fragmenting([seq], length, max_gap=0.8,\n+ sl_wind_step=int(length / 2))\n+ test_fragments.extend(fragments_)\n+ test_fragments_rc.extend(fragments_rc)\n+ for j in range(len(fragments_)):\n+ out_table["id"].append(seq.id)\n+ out_table["length"].append(len(seq.seq))\n+ out_table["fragment"].append(j)\n+ test_encoded = pp.one_hot_encode(test_fragments)\n+ test_encoded_rc = pp.one_hot_encode(test_fragments_rc)\n+ # for model, s in zip([model_5.model(length), model_7.model(length), model_10.model(length)], [5, 7, 10]):\n+ for model, s in zip([model_5.model(length), 
model_7.model(length)], [5, 7]):\n+ model.load_weights(Path(nn_weights_path, f"model_{s}_{length}.h5"))\n+ prediction = model.predict([test_encoded, test_encoded_rc], batch_size)\n+ out_table[f"pred_plant_{s}"].extend(list(prediction[..., 0]))\n+ out_table[f"pred_vir_{s}"].extend(list(prediction[..., 1]))\n+ out_table[f"pred_bact_{s}"].extend(list(prediction[..., 2]))\n+ return pd.DataFrame(out_table)\n+\n+\n+def predict_rf(df, rf_weights_path, length):\n+ """\n+ Using predictions by predict_nn and weights of a trained RF classifier gives a single prediction for a fragment\n+ """\n+\n+ clf = load(Path(rf_weights_path, f"RF_{length}.joblib"))\n+ X = df[["pred_plant_5", "pred_vir_5", "pred_plant_7", "pred_vir_7"]]\n+ # X = ["pred_plant_5", "pred_vir_5", "pred_plant_7", "pred_vir_7", "pred_plant_10", "pred_vir_10", ]]\n+ y_pred = clf.predict(X)\n+ mapping = {0: "plant", 1: "virus", 2: "bacteria"}\n+ df["RF_decision"] = np.vectorize(mapping.get)(y_pred)\n+ prob_classes = clf.predict_proba(X)\n+ df["RF_pred_plant"] = prob_classes[..., 0]\n+ df["RF_pred_vir"] = prob_classes[..., 1]\n+ df["RF_pred_bact"] = prob_classes[..., 2]\n+ return df\n+\n+\n+def predict_contigs(df):\n+ """\n+ Based on predictions of predict_rf for fragments gives a final prediction for the whole contig\n+ """\n+ df = (\n+ df.groupby(["id", "length", \'RF_decision\'], sort=False)\n+ .size()\n+ .unstack(fill_value=0)\n+ )\n+ df = df.reset_index()\n+ df = df.reindex([\'length\', \'id\', \'virus\', \'plant\', \'bacteria\'], axis=1)\n+ conditions = [\n+ (df[\'virus\'] > df[\'plant\']) & (df[\'virus\'] > df[\'bacteria\']),\n+ (df[\'plant\'] > df[\'virus\']) & (df[\'plant\'] > df[\'bacteria\']),\n+ (df[\'bacteria\'] >= df[\'plant\']) & (df[\'bacteria\'] >= df['..b'# total\'] = (df[\'# viral fragments\'] / (df[\'# viral fragments\'] + df[\'# bacterial fragments\'] + df[\'# plant fragments\'])).round(3)\n+ df[\'# viral / # total * length\'] = df[\'# viral / # total\'] * df[\'length\']\n+ df = 
df.sort_values(by=\'# viral / # total * length\', ascending=False)\n+ return df\n+\n+\n+def predict(test_ds, weights, out_path, return_viral, limit):\n+ """Predicts viral contigs from the fasta file\n+\n+ test_ds: path to the input file with contigs in fasta format (str or list of str)\n+ weights: path to the folder containing weights for NN and RF modules trained on 500 and 1000 fragment lengths (str)\n+ out_path: path to the folder to store predictions (str)\n+ return_viral: whether to return contigs annotated as viral in separate fasta file (True/False)\n+ limit: Do predictions only for contigs > l. We suggest l=750. (int)\n+ """\n+ test_ds = test_ds\n+ if isinstance(test_ds, list):\n+ pass\n+ elif isinstance(test_ds, str):\n+ test_ds = [test_ds]\n+ else:\n+ raise ValueError(\'test_ds was incorrectly assigned in the config file\')\n+\n+ assert Path(test_ds[0]).exists(), f\'{test_ds[0]} does not exist\'\n+ assert Path(weights).exists(), f\'{weights} does not exist\'\n+ assert isinstance(limit, int), \'limit should be an integer\'\n+ Path(out_path).mkdir(parents=True, exist_ok=True)\n+\n+ for ts in test_ds:\n+ dfs_fr = []\n+ dfs_cont = []\n+ for l_ in 500, 1000:\n+ # print(f\'starting prediction for {Path(ts).name} for fragment length {l_}\')\n+ df = predict_nn(\n+ ds_path=ts,\n+ nn_weights_path=weights,\n+ length=l_,\n+ )\n+ print(df)\n+ df = predict_rf(\n+ df=df,\n+ rf_weights_path=weights,\n+ length=l_,\n+ )\n+ df = df.round(3)\n+ dfs_fr.append(df)\n+ df = predict_contigs(df)\n+ dfs_cont.append(df)\n+ # print(\'prediction finished\')\n+ df_500 = dfs_fr[0][(dfs_fr[0][\'length\'] >= limit) & (dfs_fr[0][\'length\'] < 1500)]\n+ df_1000 = dfs_fr[1][(dfs_fr[1][\'length\'] >= 1500)]\n+ df = pd.concat([df_1000, df_500], ignore_index=True)\n+ pred_fr = Path(out_path, \'predicted_fragments.csv\')\n+ df.to_csv(pred_fr)\n+\n+ df_500 = dfs_cont[0][(dfs_cont[0][\'length\'] >= limit) & (dfs_cont[0][\'length\'] < 1500)]\n+ df_1000 = dfs_cont[1][(dfs_cont[1][\'length\'] >= 
1500)]\n+ df = pd.concat([df_1000, df_500], ignore_index=True)\n+ pred_contigs = Path(out_path, \'predicted.csv\')\n+ df.to_csv(pred_contigs)\n+\n+ if return_viral:\n+ viral_ids = list(df[df["decision"] == "virus"]["id"])\n+ seqs_ = list(SeqIO.parse(ts, "fasta"))\n+ viral_seqs = [s_ for s_ in seqs_ if s_.id in viral_ids]\n+ SeqIO.write(viral_seqs, Path(out_path, \'viral.fasta\'), \'fasta\')\n+\n+\n+if __name__ == \'__main__\':\n+ parser = argparse.ArgumentParser()\n+ parser.add_argument("--test_ds", help="path to the input file with contigs in fasta format (str or list of str)")\n+ parser.add_argument("--weights", help="path to the folder containing weights for NN and RF modules trained on 500 and 1000 fragment lengths (str)")\n+ parser.add_argument("--out_path", help="path to the folder to store predictions (str)")\n+ parser.add_argument("--return_viral", help="whether to return contigs annotated as viral in separate fasta file (True/False)")\n+ parser.add_argument("--limit", help="Do predictions only for contigs > l. We suggest l=750. (int)", type=int)\n+\n+ args = parser.parse_args()\n+ if args.test_ds:\n+ test_ds = args.test_ds\n+ if args.weights:\n+ weights = args.weights\n+ if args.out_path:\n+ out_path = args.out_path\n+ if args.return_viral:\n+ return_viral = args.return_viral\n+ if args.limit:\n+ limit = args.limit\n+ predict(test_ds, weights, out_path, return_viral, limit)\n' |
b |
diff -r 000000000000 -r 457fd8fd681a test-data/predicted.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/predicted.csv Wed Nov 09 12:19:26 2022 +0000 |
b |
b'@@ -0,0 +1,274 @@\n+,length,id,# viral fragments,# plant fragments,# bacterial fragments,decision,# viral / # total,# viral / # total * length\n+0,19251,KY110737.1,30,2,6,virus,0.789,15189.039\n+1,18478,KY707825.2,24,3,9,virus,0.667,12324.826000000001\n+2,10427,AM267479.1,20,0,0,virus,1.0,10427.0\n+3,9802,MT533611.1,19,0,0,virus,1.0,9802.0\n+4,9798,LC537538.1,19,0,0,virus,1.0,9798.0\n+5,9736,LC600465.1,19,0,0,virus,1.0,9736.0\n+6,9736,LC494681.1,19,0,0,virus,1.0,9736.0\n+7,9445,JN936438.1,18,0,0,virus,1.0,9445.0\n+8,11892,LC602461.1,18,5,0,virus,0.783,9311.436\n+9,9757,MK777994.1,18,1,0,virus,0.947,9239.878999999999\n+10,9663,LC038073.1,18,1,0,virus,0.947,9150.860999999999\n+11,9429,MK649741.1,17,1,0,virus,0.944,8900.975999999999\n+12,9798,LC537503.1,17,0,2,virus,0.895,8769.210000000001\n+13,8421,MN059369.1,16,0,0,virus,1.0,8421.0\n+14,9798,AB747297.1,16,0,3,virus,0.842,8249.916\n+15,8745,NC_008169.1,16,0,1,virus,0.941,8229.045\n+16,10487,KX884756.1,15,5,0,virus,0.75,7865.25\n+17,9287,MZ148041.1,15,3,0,virus,0.833,7736.071\n+18,8745,MT554537.1,15,0,2,virus,0.882,7713.09\n+19,8366,MN059208.1,14,1,1,virus,0.875,7320.25\n+20,7242,AJ534983.1,14,0,0,virus,1.0,7242.0\n+21,7702,KX008574.1,14,1,0,virus,0.933,7185.966\n+22,8644,MN059273.1,14,0,3,virus,0.824,7122.656\n+23,7343,KX034904.1,13,1,0,virus,0.929,6821.647\n+24,7341,KX034878.1,13,1,0,virus,0.929,6819.789000000001\n+25,6791,JN710440.1,13,0,0,virus,1.0,6791.0\n+26,6436,MT752813.1,12,0,0,virus,1.0,6436.0\n+27,6436,MT752690.1,12,0,0,virus,1.0,6436.0\n+28,6436,MT752919.1,12,0,0,virus,1.0,6436.0\n+29,6395,HE818411.1,12,0,0,virus,1.0,6395.0\n+30,6355,EF469769.1,12,0,0,virus,1.0,6355.0\n+31,6226,LC125633.1,12,0,0,virus,1.0,6226.0\n+32,7466,MN504765.1,11,1,2,virus,0.786,5868.276\n+33,6383,MF002491.1,11,1,0,virus,0.917,5853.211\n+34,8361,MZ148109.1,11,5,0,virus,0.688,5752.3679999999995\n+35,5664,LT991639.1,11,0,0,virus,1.0,5664.0\n+36,6549,MT317167.1,11,2,0,virus,0.846,5540.454\n+37,7010,BK011046.1,11,3,0,virus,0.786,5509.86
0000000001\n+38,5266,MN497809.1,10,0,0,virus,1.0,5266.0\n+39,5078,KY769710.1,10,0,0,virus,1.0,5078.0\n+40,7377,KC847061.1,9,5,0,virus,0.643,4743.411\n+41,8434,KF030878.2,9,7,0,virus,0.562,4739.908\n+42,5398,MT892660.1,8,0,2,virus,0.8,4318.400000000001\n+43,4224,EF155982.1,8,0,0,virus,1.0,4224.0\n+44,4059,MN101211.1,8,0,0,virus,1.0,4059.0\n+45,8607,MW629381.1,7,10,0,plant,0.412,3546.084\n+46,3334,LC066453.1,6,0,0,virus,1.0,3334.0\n+47,3449,MN160358.1,5,1,0,virus,0.833,2873.017\n+48,2824,NC_015324.1,5,0,0,virus,1.0,2824.0\n+49,2797,JF909142.1,5,0,0,virus,1.0,2797.0\n+50,2789,JX025360.1,5,0,0,virus,1.0,2789.0\n+51,2781,MG250124.1,5,0,0,virus,1.0,2781.0\n+52,2781,KC106643.1,5,0,0,virus,1.0,2781.0\n+53,2781,KX024650.1,5,0,0,virus,1.0,2781.0\n+54,2778,GU180085.1,5,0,0,virus,1.0,2778.0\n+55,2773,LN846605.1,5,0,0,virus,1.0,2773.0\n+56,2768,JN604500.1,5,0,0,virus,1.0,2768.0\n+57,2767,KU569582.1,5,0,0,virus,1.0,2767.0\n+58,2762,FN554528.1,5,0,0,virus,1.0,2762.0\n+59,2761,MK757212.1,5,0,0,virus,1.0,2761.0\n+60,2761,NC_023312.1,5,0,0,virus,1.0,2761.0\n+61,2751,AJ558124.1,5,0,0,virus,1.0,2751.0\n+62,2750,KJ536097.1,5,0,0,virus,1.0,2750.0\n+63,2749,KJ473695.1,5,0,0,virus,1.0,2749.0\n+64,2743,MT298220.1,5,0,0,virus,1.0,2743.0\n+65,2741,KC019309.1,5,0,0,virus,1.0,2741.0\n+66,2739,JN082236.1,5,0,0,virus,1.0,2739.0\n+67,2738,MH577756.1,5,0,0,virus,1.0,2738.0\n+68,2738,MH577693.1,5,0,0,virus,1.0,2738.0\n+69,2738,AY184487.3,5,0,0,virus,1.0,2738.0\n+70,2737,MN630281.1,5,0,0,virus,1.0,2737.0\n+71,2736,KX302711.1,5,0,0,virus,1.0,2736.0\n+72,2732,AJ558125.1,5,0,0,virus,1.0,2732.0\n+73,2724,HE979787.1,5,0,0,virus,1.0,2724.0\n+74,3808,HM125550.1,5,0,2,virus,0.714,2718.912\n+75,2705,KM229922.1,5,0,0,virus,1.0,2705.0\n+76,2698,HQ264186.1,5,0,0,virus,1.0,2698.0\n+77,2693,MW426877.1,5,0,0,virus,1.0,2693.0\n+78,2690,MT592862.1,5,0,0,virus,1.0,2690.0\n+79,2662,BK010711.1,5,0,0,virus,1.0,2662.0\n+80,2659,NC_055586.1,5,0,0,virus,1.0,2659.0\n+81,2640,MW273384.1,5,0,0,virus,1.0,2640.0\n+82,2638,JX8576
91.1,5,0,0,virus,1.0,2638.0\n+83,3164,MH999327.1,5,1,0,virus,0.833,2635.612\n+84,2630,E'..b'1,3,0,0,virus,1.0,909.0\n+185,1200,GU904131.1,3,1,0,virus,0.75,900.0\n+186,900,EU196425.1,3,0,0,virus,1.0,900.0\n+187,894,MH974475.1,3,0,0,virus,1.0,894.0\n+188,885,MF773983.1,3,0,0,virus,1.0,885.0\n+189,873,KC985038.1,3,0,0,virus,1.0,873.0\n+190,872,MT656393.1,3,0,0,virus,1.0,872.0\n+191,872,MT656427.1,3,0,0,virus,1.0,872.0\n+192,871,KR076680.1,3,0,0,virus,1.0,871.0\n+193,867,DQ364987.1,3,0,0,virus,1.0,867.0\n+194,1419,AY366417.1,3,2,0,virus,0.6,851.4\n+195,849,HM768172.1,3,0,0,virus,1.0,849.0\n+196,1132,MK929579.1,3,1,0,virus,0.75,849.0\n+197,849,HM768183.1,3,0,0,virus,1.0,849.0\n+198,846,MN814413.1,3,0,0,virus,1.0,846.0\n+199,1410,AB457617.1,3,2,0,virus,0.6,846.0\n+200,837,KU743350.1,3,0,0,virus,1.0,837.0\n+201,1374,KJ789902.1,3,2,0,virus,0.6,824.4\n+202,1370,KY271069.1,3,2,0,virus,0.6,822.0\n+203,822,JQ361098.1,3,0,0,virus,1.0,822.0\n+204,821,MG717790.1,3,0,0,virus,1.0,821.0\n+205,1366,LN831969.1,3,2,0,virus,0.6,819.6\n+206,1086,KF710809.1,3,0,1,virus,0.75,814.5\n+207,1086,KF710849.1,3,0,1,virus,0.75,814.5\n+208,1086,KM607344.1,3,1,0,virus,0.75,814.5\n+209,810,JQ619096.1,3,0,0,virus,1.0,810.0\n+210,810,JQ619105.1,3,0,0,virus,1.0,810.0\n+211,1075,FJ859737.1,3,1,0,virus,0.75,806.25\n+212,803,KC282369.1,3,0,0,virus,1.0,803.0\n+213,801,JQ954315.1,3,0,0,virus,1.0,801.0\n+214,795,MN955028.1,3,0,0,virus,1.0,795.0\n+215,794,KX130921.1,3,0,0,virus,1.0,794.0\n+216,1058,FJ463044.1,3,1,0,virus,0.75,793.5\n+217,787,AM113814.1,3,0,0,virus,1.0,787.0\n+218,1048,KM607216.1,3,1,0,virus,0.75,786.0\n+219,1040,KM607230.1,3,1,0,virus,0.75,780.0\n+220,777,HM180089.1,3,0,0,virus,1.0,777.0\n+221,777,KJ494928.1,3,0,0,virus,1.0,777.0\n+222,777,KU297996.1,3,0,0,virus,1.0,777.0\n+223,776,MT037061.1,3,0,0,virus,1.0,776.0\n+224,1032,MH746956.1,3,1,0,virus,0.75,774.0\n+225,774,FR693101.1,3,0,0,virus,1.0,774.0\n+226,1028,KF711199.1,3,1,0,virus,0.75,771.0\n+227,771,KM275602.1,3,0,0,virus,1.0,771.0\n+228,1
028,KF711263.1,3,1,0,virus,0.75,771.0\n+229,771,JF825866.1,3,0,0,virus,1.0,771.0\n+230,1264,KX119438.1,3,2,0,virus,0.6,758.4\n+231,751,KF156666.1,3,0,0,virus,1.0,751.0\n+232,999,FJ262095.1,2,1,0,virus,0.667,666.3330000000001\n+233,999,FJ262061.1,2,1,0,virus,0.667,666.3330000000001\n+234,976,LC465418.1,2,1,0,virus,0.667,650.9920000000001\n+235,956,KR065439.1,2,1,0,virus,0.667,637.652\n+236,917,MK445316.1,2,0,1,virus,0.667,611.639\n+237,1200,KJ849141.1,2,2,0,bacteria,0.5,600.0\n+238,890,HQ335265.1,2,1,0,virus,0.667,593.63\n+239,881,AJ884700.1,2,0,1,virus,0.667,587.6270000000001\n+240,1164,KP233027.1,2,1,1,virus,0.5,582.0\n+241,859,KF525380.1,2,1,0,virus,0.667,572.9530000000001\n+242,1421,AY839627.1,2,3,0,plant,0.4,568.4\n+243,836,LC223336.1,2,0,1,virus,0.667,557.6120000000001\n+244,1372,MH643738.1,2,3,0,plant,0.4,548.8000000000001\n+245,1365,MN240344.1,2,3,0,plant,0.4,546.0\n+246,1342,FN806780.1,2,3,0,plant,0.4,536.8000000000001\n+247,1061,KM607850.1,2,1,1,virus,0.5,530.5\n+248,1060,KM607802.1,2,2,0,bacteria,0.5,530.0\n+249,784,AM494507.1,2,1,0,virus,0.667,522.928\n+250,1037,KM607814.1,2,1,1,virus,0.5,518.5\n+251,1035,KY473725.1,2,2,0,bacteria,0.5,517.5\n+252,768,MH182699.1,2,1,0,virus,0.667,512.2560000000001\n+253,1023,JF828185.1,2,2,0,bacteria,0.5,511.5\n+254,756,MH686308.1,2,0,1,virus,0.667,504.252\n+255,1003,KX431385.1,2,2,0,bacteria,0.5,501.5\n+256,975,KP866939.1,1,0,2,bacteria,0.333,324.675\n+257,960,KF010516.1,1,0,2,bacteria,0.333,319.68\n+258,945,KX418448.1,1,2,0,plant,0.333,314.685\n+259,942,JF795548.1,1,2,0,plant,0.333,313.68600000000004\n+260,939,KP232980.1,1,0,2,bacteria,0.333,312.687\n+261,1369,KM880104.1,1,4,0,plant,0.2,273.8\n+262,1367,LT600712.1,1,4,0,plant,0.2,273.40000000000003\n+263,1083,KM607376.1,1,2,1,plant,0.25,270.75\n+264,1346,LT674474.1,1,4,0,plant,0.2,269.2\n+265,756,MW519386.1,1,2,0,plant,0.333,251.74800000000002\n+266,1256,AM041944.1,1,4,0,plant,0.2,251.20000000000002\n+267,1179,AJ493270.1,0,4,0,plant,0.0,0.0\n+268,833,MK876224.1,0,3,0,pla
nt,0.0,0.0\n+269,1216,AB697885.1,0,4,0,plant,0.0,0.0\n+270,822,MF043148.1,0,0,3,bacteria,0.0,0.0\n+271,960,KC969470.1,0,3,0,plant,0.0,0.0\n+272,756,MW519383.1,0,3,0,plant,0.0,0.0\n' |
b |
diff -r 000000000000 -r 457fd8fd681a test-data/predicted_fragments.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/predicted_fragments.csv Wed Nov 09 12:19:26 2022 +0000 |
b |
b'@@ -0,0 +1,1730 @@\n+,id,length,fragment,pred_plant_5,pred_vir_5,pred_bact_5,pred_plant_7,pred_vir_7,pred_bact_7,RF_decision,RF_pred_plant,RF_pred_vir,RF_pred_bact\n+0,MT921846.1,2971,0,0.716,0.252,0.032,0.129,0.799,0.072,virus,0.19,0.782,0.028\n+1,MT921846.1,2971,1,0.681,0.287,0.033,0.263,0.684,0.053,virus,0.221,0.772,0.007\n+2,MT921846.1,2971,2,0.741,0.224,0.036,0.789,0.181,0.029,plant,0.878,0.065,0.057\n+3,MT921846.1,2971,3,0.754,0.208,0.038,0.783,0.154,0.063,plant,0.878,0.065,0.057\n+4,MT921846.1,2971,4,0.513,0.414,0.073,0.269,0.601,0.13,virus,0.324,0.584,0.091\n+5,MF540775.1,3929,0,0.896,0.093,0.011,0.884,0.112,0.004,plant,0.855,0.028,0.117\n+6,MF540775.1,3929,1,0.843,0.14,0.016,0.806,0.186,0.009,plant,0.85,0.053,0.098\n+7,MF540775.1,3929,2,0.879,0.108,0.013,0.765,0.211,0.024,plant,0.705,0.039,0.256\n+8,MF540775.1,3929,3,0.824,0.158,0.017,0.698,0.257,0.045,plant,0.804,0.129,0.066\n+9,MF540775.1,3929,4,0.748,0.223,0.029,0.562,0.38,0.058,plant,0.765,0.136,0.099\n+10,MF540775.1,3929,5,0.836,0.15,0.015,0.675,0.31,0.015,plant,0.804,0.129,0.066\n+11,MF540775.1,3929,6,0.857,0.129,0.015,0.584,0.398,0.018,plant,0.753,0.162,0.085\n+12,BK011046.1,7010,0,0.526,0.424,0.05,0.221,0.728,0.051,virus,0.303,0.692,0.004\n+13,BK011046.1,7010,1,0.464,0.486,0.05,0.243,0.707,0.05,virus,0.28,0.713,0.007\n+14,BK011046.1,7010,2,0.282,0.605,0.112,0.181,0.702,0.117,virus,0.04,0.919,0.041\n+15,BK011046.1,7010,3,0.315,0.585,0.1,0.343,0.632,0.025,virus,0.424,0.543,0.032\n+16,BK011046.1,7010,4,0.539,0.413,0.048,0.561,0.435,0.004,plant,0.855,0.093,0.052\n+17,BK011046.1,7010,5,0.449,0.477,0.075,0.423,0.555,0.022,plant,0.602,0.327,0.071\n+18,BK011046.1,7010,6,0.442,0.482,0.076,0.398,0.571,0.031,plant,0.565,0.347,0.088\n+19,BK011046.1,7010,7,0.392,0.493,0.115,0.189,0.745,0.065,virus,0.103,0.883,0.013\n+20,BK011046.1,7010,8,0.506,0.396,0.098,0.203,0.717,0.08,virus,0.316,0.671,0.013\n+21,BK011046.1,7010,9,0.427,0.49,0.083,0.177,0.731,0.092,virus,0.152,0.829,0.019\n+22,BK011046.1,7010,10,0.389,0.50
9,0.101,0.26,0.672,0.067,virus,0.233,0.759,0.008\n+23,BK011046.1,7010,11,0.224,0.642,0.134,0.115,0.791,0.094,virus,0.034,0.951,0.014\n+24,BK011046.1,7010,12,0.267,0.598,0.135,0.156,0.72,0.125,virus,0.04,0.919,0.041\n+25,BK011046.1,7010,13,0.253,0.607,0.139,0.161,0.711,0.128,virus,0.04,0.919,0.041\n+26,NC_055544.1,2626,0,0.098,0.568,0.334,0.033,0.855,0.112,virus,0.026,0.96,0.013\n+27,NC_055544.1,2626,1,0.214,0.687,0.099,0.06,0.893,0.047,virus,0.029,0.957,0.014\n+28,NC_055544.1,2626,2,0.208,0.713,0.079,0.045,0.93,0.024,virus,0.029,0.957,0.014\n+29,NC_055544.1,2626,3,0.262,0.625,0.113,0.04,0.886,0.074,virus,0.029,0.957,0.014\n+30,NC_055544.1,2626,4,0.322,0.6,0.078,0.065,0.908,0.027,virus,0.041,0.946,0.013\n+31,JX857691.1,2638,0,0.146,0.588,0.266,0.066,0.892,0.042,virus,0.033,0.952,0.014\n+32,JX857691.1,2638,1,0.334,0.606,0.06,0.119,0.866,0.015,virus,0.047,0.94,0.013\n+33,JX857691.1,2638,2,0.149,0.757,0.095,0.077,0.904,0.018,virus,0.033,0.952,0.014\n+34,JX857691.1,2638,3,0.169,0.663,0.168,0.067,0.828,0.106,virus,0.034,0.949,0.017\n+35,JX857691.1,2638,4,0.274,0.611,0.115,0.112,0.869,0.019,virus,0.034,0.951,0.014\n+36,LC125633.1,6226,0,0.111,0.533,0.356,0.078,0.839,0.083,virus,0.044,0.924,0.032\n+37,LC125633.1,6226,1,0.236,0.645,0.119,0.144,0.821,0.035,virus,0.034,0.951,0.014\n+38,LC125633.1,6226,2,0.134,0.606,0.26,0.14,0.825,0.035,virus,0.039,0.946,0.014\n+39,LC125633.1,6226,3,0.233,0.581,0.186,0.103,0.841,0.055,virus,0.034,0.949,0.017\n+40,LC125633.1,6226,4,0.202,0.631,0.167,0.069,0.85,0.081,virus,0.029,0.957,0.014\n+41,LC125633.1,6226,5,0.191,0.593,0.216,0.202,0.736,0.062,virus,0.209,0.779,0.012\n+42,LC125633.1,6226,6,0.345,0.54,0.115,0.314,0.629,0.057,virus,0.232,0.704,0.064\n+43,LC125633.1,6226,7,0.07,0.424,0.507,0.046,0.702,0.252,virus,0.066,0.79,0.144\n+44,LC125633.1,6226,8,0.068,0.478,0.454,0.025,0.677,0.298,virus,0.04,0.88,0.081\n+45,LC125633.1,6226,9,0.136,0.399,0.465,0.122,0.845,0.033,virus,0.084,0.885,0.031\n+46,LC125633.1,6226,10,0.068,0.435,0.498,0.127,0.777
,0.097,virus,0.058,0.856,0.087\n+47,LC125633.1,622'..b'virus,0.015,0.975,0.01\n+1681,KC007531.1,1293,4,0.056,0.928,0.016,0.147,0.849,0.004,virus,0.013,0.983,0.004\n+1682,MK098183.1,1350,0,0.112,0.866,0.021,0.218,0.767,0.015,virus,0.036,0.937,0.027\n+1683,MK098183.1,1350,1,0.081,0.914,0.005,0.132,0.866,0.002,virus,0.007,0.982,0.012\n+1684,MK098183.1,1350,2,0.29,0.698,0.012,0.676,0.315,0.009,plant,0.637,0.307,0.056\n+1685,MK098183.1,1350,3,0.152,0.842,0.006,0.342,0.63,0.028,virus,0.273,0.627,0.1\n+1686,MK098183.1,1350,4,0.069,0.924,0.006,0.307,0.67,0.023,virus,0.154,0.815,0.03\n+1687,MN814413.1,846,0,0.093,0.864,0.043,0.112,0.839,0.05,virus,0.015,0.973,0.012\n+1688,MN814413.1,846,1,0.264,0.72,0.015,0.051,0.938,0.011,virus,0.105,0.862,0.033\n+1689,MN814413.1,846,2,0.192,0.802,0.005,0.056,0.939,0.005,virus,0.022,0.968,0.01\n+1690,FJ859737.1,1075,0,0.136,0.271,0.593,0.231,0.534,0.235,virus,0.176,0.606,0.217\n+1691,FJ859737.1,1075,1,0.34,0.637,0.023,0.261,0.728,0.011,virus,0.28,0.707,0.012\n+1692,FJ859737.1,1075,2,0.176,0.812,0.011,0.439,0.553,0.008,plant,0.514,0.478,0.008\n+1693,FJ859737.1,1075,3,0.101,0.892,0.007,0.191,0.806,0.003,virus,0.033,0.952,0.015\n+1694,KX420987.1,938,0,0.028,0.967,0.005,0.047,0.946,0.007,virus,0.003,0.995,0.002\n+1695,KX420987.1,938,1,0.024,0.963,0.013,0.031,0.94,0.029,virus,0.003,0.987,0.01\n+1696,KX420987.1,938,2,0.045,0.889,0.065,0.06,0.871,0.069,virus,0.005,0.987,0.008\n+1697,KX418448.1,945,0,0.506,0.482,0.012,0.326,0.669,0.005,virus,0.417,0.56,0.023\n+1698,KX418448.1,945,1,0.7,0.295,0.005,0.808,0.191,0.001,plant,0.811,0.126,0.063\n+1699,KX418448.1,945,2,0.349,0.646,0.005,0.666,0.332,0.002,plant,0.663,0.312,0.026\n+1700,KC985038.1,873,0,0.12,0.796,0.084,0.414,0.503,0.083,virus,0.459,0.479,0.062\n+1701,KC985038.1,873,1,0.038,0.919,0.043,0.129,0.858,0.012,virus,0.013,0.983,0.004\n+1702,KC985038.1,873,2,0.024,0.968,0.008,0.049,0.949,0.002,virus,0.003,0.995,0.002\n+1703,MN955028.1,795,0,0.063,0.896,0.041,0.072,0.769,0.159,virus,0.026,0.966,0.00
8\n+1704,MN955028.1,795,1,0.102,0.863,0.035,0.083,0.902,0.016,virus,0.007,0.985,0.008\n+1705,MN955028.1,795,2,0.095,0.879,0.027,0.088,0.899,0.013,virus,0.007,0.982,0.012\n+1706,KC466374.1,1075,0,0.227,0.745,0.028,0.164,0.742,0.094,virus,0.08,0.88,0.04\n+1707,KC466374.1,1075,1,0.182,0.809,0.01,0.176,0.819,0.005,virus,0.027,0.954,0.019\n+1708,KC466374.1,1075,2,0.141,0.849,0.01,0.33,0.663,0.007,virus,0.18,0.792,0.028\n+1709,KC466374.1,1075,3,0.129,0.867,0.004,0.216,0.782,0.003,virus,0.037,0.949,0.015\n+1710,HM035070.1,920,0,0.221,0.767,0.012,0.463,0.531,0.006,virus,0.375,0.606,0.018\n+1711,HM035070.1,920,1,0.092,0.866,0.042,0.109,0.872,0.019,virus,0.007,0.982,0.012\n+1712,HM035070.1,920,2,0.125,0.846,0.03,0.273,0.707,0.02,virus,0.14,0.845,0.015\n+1713,KF711199.1,1028,0,0.06,0.916,0.024,0.063,0.925,0.012,virus,0.005,0.987,0.008\n+1714,KF711199.1,1028,1,0.101,0.897,0.003,0.155,0.844,0.001,virus,0.007,0.985,0.008\n+1715,KF711199.1,1028,2,0.283,0.713,0.005,0.634,0.363,0.003,plant,0.646,0.29,0.064\n+1716,KF711199.1,1028,3,0.179,0.817,0.004,0.454,0.542,0.004,virus,0.478,0.509,0.013\n+1717,JF828185.1,1023,0,0.376,0.525,0.099,0.513,0.422,0.065,plant,0.587,0.391,0.022\n+1718,JF828185.1,1023,1,0.624,0.212,0.163,0.498,0.337,0.165,plant,0.545,0.306,0.149\n+1719,JF828185.1,1023,2,0.21,0.441,0.349,0.274,0.624,0.102,virus,0.193,0.582,0.225\n+1720,JF828185.1,1023,3,0.166,0.531,0.303,0.275,0.631,0.094,virus,0.26,0.515,0.225\n+1721,MN901875.1,1151,0,0.127,0.676,0.197,0.152,0.698,0.15,virus,0.128,0.815,0.057\n+1722,MN901875.1,1151,1,0.202,0.775,0.023,0.082,0.894,0.024,virus,0.024,0.964,0.012\n+1723,MN901875.1,1151,2,0.035,0.958,0.007,0.039,0.905,0.056,virus,0.003,0.995,0.002\n+1724,MN901875.1,1151,3,0.025,0.966,0.009,0.042,0.936,0.022,virus,0.003,0.995,0.002\n+1725,MN167135.1,1139,0,0.046,0.855,0.098,0.279,0.38,0.342,virus,0.172,0.42,0.408\n+1726,MN167135.1,1139,1,0.072,0.9,0.028,0.201,0.729,0.07,virus,0.029,0.949,0.022\n+1727,MN167135.1,1139,2,0.061,0.916,0.024,0.215,0.779,0.006,virus,0
.027,0.951,0.022\n+1728,MN167135.1,1139,3,0.117,0.877,0.006,0.314,0.684,0.002,virus,0.082,0.889,0.029\n' |
b |
diff -r 000000000000 -r 457fd8fd681a test-data/viral.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/viral.fasta Wed Nov 09 12:19:26 2022 +0000 |
b |
b'@@ -0,0 +1,11658 @@\n+>MT921846.1 |Tomato spotted wilt tospovirus isolate Blackberry lily segment S, complete sequence\n+AGAGCAATTGTGTCATAATTTTATTCATAATCAAACCTCACTTAGCAAATCACAATACTG\n+TAATAAGAACACAGTACCAATAACCATAATGTCTTCAAGTGTTTATGAGTCGATCATTCA\n+GACAAAAGCTTCAGTCTGGGGATCAACTGCATCTGGTAAAGCTGTTGTAGATTCTTACTG\n+GATTCATGAACTTGGTACTGGTTCTCCACTAGTTCAAACCCAGTTGTATTCTGATTCGAG\n+AAGCAAAAGTAGCTTTGGCTATACTGCAAAGGTAGGGAATCTTCCCTGTGAAGAAGAAGA\n+GATTCTTTCTCAGCATGTGTATATCCCTGTTTTTGATGATATTGATTTTAGCATCAATAT\n+TAATGACTCTGTTCTGGCACTATCTGTTTGCTCAAATACAGTCAATACTAACGGAGTGAA\n+ACATCAAGGTCATTTGAAGGTTTTGTCTCCTGCTCAGCTCCACTCTATTGGATCTACCAT\n+GAACAGATCTGATATTACAGACCGATTCCAGCTCCAAGAAAAAGACATAATTCCCAATGA\n+CAGATACATTGAAGCTGCAAACAAGGGCTCTTTGTCTTGTGTTAAAGAGCATACCTATAA\n+GGTTGAGATGTGCTACAATCAAGCTTTGGGCAAAGTGAATGTTCTATCCCCTAACAGAAA\n+TGTCCATGAATGGCTGTACTGTTTCAAGCCAAATTTCAATCAAGTTGAAAGCAACAACAG\n+AACTGTAAATTCTCTTGCAGTAAAATCTCTGCTCATGTCAGCAGAAAACAACATCATGCC\n+TAACTCTCAGGCTTTTGTAAAAGCTTCCACTGATTCTCATTTCAAACTGAGCCTCTGGCT\n+AAGAGTTCCAAAGGTTTTGAAGCAGATTTCCATTCAGAAATTGTTCAAAGTTGCAGGAGA\n+TGAAACAAACAAAACATTTTATTTATCTATTGCTTGCATTCCAAACCATAACAGTGTTGA\n+GACAGCTTTAAACATTTCTGTTATTTGCAAGCATCAGCTCCCAATCCGTAAATGTCAAGC\n+TCCTTTTGAATTATCAATGATGTTTTCTGATTTAAAGGAGCCTTACAACATTGTTCATGA\n+TCCTTCATATCCCCAGAGGATCGTTCATGCTCTGCTTGAAACTCACACATCTTTTGCACA\n+AGTTCTTTGCAACAACTTGCAAGAAGATGTGATCATCTACACTTTGAACAACTATGAGCT\n+AACTCCTGGAAAGTTAGATCTAGGTGAAAGAACCTTAAATTACAGTGAAGATATCTGCAA\n+AAGGAAATATTTCCTTTCAAAAACACTTGAATGTCTTCCATCCAACACACAAACTATGTC\n+TTACTTAGACAGCATCCAAATCCCTTCCTGGAAGATAGACTTTGCCAAGGGAGAAATTAA\n+AATTTCTCCACAATCTATTGCAGTTGCAAAATCTTTGTTAAAGCTTGATTTAAGCGGGAT\n+CAAAAAGAAAGAATCTAAGATCTCGGAAGCATATGCTTCAGGATCAAAATAATCTTGCTG\n+TGTCCAGCTTTTCCCAATTATGTTATGTTTATTTTCTTTCTTTACTTATAATTATTTCTT\n+TGTTTTGTCATTTCTTTTGAATTTCTCCTGTTTAATAGAAACCATAAAAATAAAAATAAA\n+AATAAAAATAAAATCAAAATGAAACAAAATCAAAAAATGAAACAAAAATCAAACAAAAAT\n+CAAAAAATGAAATAAAACAACAAAAAATTAAAAAACAAAAAACCAAAAAAGATCCCGAAA\n+GGGACGATTTTGGCCAAATTTGGGTTTTGTTTTTGTTTTTTGTTTTTTGT
TTTTTGTTTT\n+TTATTTTATTTTTATTTTTATTATTTATTCTATTTTATTTTATTTTTATTTTTATTTTTA\n+TTTTTATTTTATGTTTTTTGTTGTTTTTGTTATTTTGTTTATTATTAAGCACAACACACA\n+GAAAGCAAACTTTAATTAAACACACTTATTTTAAAATTAACACACTACAGCAAGCACAAG\n+CAATAAAGATAAAGAAAGCTTTATATATTTATAGGCTTTTTTATAATTTAACTACAGCTG\n+CTTTTAAGCAAGTTCTGCAAGTTTTGCCTGCTTTTTAACCCCGAACATCTCATAGAACTT\n+GTTAAGAGTTTCACTGTAATGTTCCATAGCAATACTTCCTTTAGCATTAGGATTGCTGGA\n+GCCAAGTATAGCAGCATACTCTTTCCCTTTCTTCACCTGATCTTCATTCATTTCAAATGC\n+TTTGCTTTTTAGCACAGTGCAAACTTTTCCTAAGGCTTCCCTGGTGTCATACTTCTTTGG\n+ATCGATCCCGAGGTCTTTGTATTTTGCATCCTGATATATAGCCAAGACAACACTGATCAT\n+CTCAAAGCTATCAACTGAAGCAATAAGAGGTAAGCTACCTCCCAGCATTATGGCAAGCCT\n+CACAGACTTTGCATCATCAAGAGGTAATCCATAGGCTTGAATCAAAGGGTGAGAAGCAAT\n+CTTAGATTTGATAGTATTAAGATTCTCAGAATTCCCAGTTTCCTCGACAAGCCTGACCCT\n+GATCAAGCTATCAAGCCTTCTGAAGGTCATGTCAGTGGCTCCAATCCTGTCTGAAGTTTT\n+CTTTATGGTAATTTTACCAAAAGTGAAATCACTTTGCTTAATAACCTTCATTATACTCTG\n+ACGATTCTTCAGGAATGTCAGACATGAAATAATGCTCATCTTCTTGATCTGGTCAAGGTT\n+TTCCAGACAAAAAGTCTTGAAGTTGAATGCTACCAGATTCTGATCTTCCTCAAATTCAAG\n+ATCTTTGCCTTGTGTCAACAAAGCAACAATGTTTTCCTTAGTGAGCTTAACCTTAGACAT\n+GATGATCGTAAAAGTTGTTATATGCTTTGACCGTATGTAATTCAAGGTGCGAAAGTACAA\n+CTCTGTATTCCGCAGTCGTTTCTTAGGGTTTTAATGTGATGATTTGTAAGACTGAGTGTT\n+AAGGTTTGAATAAAATTGACACAATCGCTCT\n+>EU196425.1 |Sugarcane mosaic virus isolate TUC-1E polyprotein gene, partial 
cds\n+GTTTTTCACCAAGCTGGAACAGTCGATGCAGGCGCTCAAGGAGGAGATGGAAACGCCGGA\n+ACCCAGCCGCCAGCCACTGGAGCAGCAGCTCAAGGAGGAGCTCAACCACCAGCTACTGGA\n+GCAGCCGCGCAACCACCTGCAGCTCAAGGTTCACAACCACCCACAGGGGGAGCAACTGGT\n+GGAGGTGGTGCACAAACAGGAGCTGGTGAAACTGGCTCAGTTACAGGAGGTCAAAGAGAC\n+AAGGATGTAGATGCTGGTACGACAGGCAAAATTACAGTGCCAAAACTTAAAGCCATGTCG\n+AAGAAGATGCGCTTACCGAAAGCAAAAGGAAAAGATGTTTTACATCTGGACTTTCTGTTA\n+ACATACAAACCGCAACAACAAGACATATCAAACACAAGAGCAACCAGAGAGGAGTTTGAT\n+AGGTGGTATGAAGCCATAAAGAAGGAATATGATATAGATGACACACAAATGACAGTTATC\n+ATGAGTGGTCTAATGGTATGGTGCATTGAGAATGGTTGCTCACCAAACATAAACGGAAAT\n+TGGACAATGATGGATGGAGATGAACAAAGAGCCTTCCCATTAAAACCAGTCATTGAAAAC\n+GCATCTCCAACATTCCGGCAAATAATGCATCATTTCAGTGATGCAGCTGAAGCATATATC\n+GAGTATAGAAACTCTACAGAGCGATACATGCCACGATATGGACTT'..b'AAT\n+TCTTTGGATCTGCATACAGGAAGAAGGGAAAAGGTAAAGGCACCACTGTTGGTATGGGCA\n+AGTCAAGCAGGAGGTTTGTTAATATGTATGGATTTGACCCAACAGAATATTCATTCATCC\n+AGTTCGTTGATCCGCTCACTGGAGCTCAAATTGAAGAGAACGTCTATGCTGATANNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNTAAGGGATTTCAATCCAATTGCTCAAACAGTTTGCAGAGTAAAAGTGTCTGTTGAAC\n+ATGGAACGTCTGAAATGTATGGGTTCGGTTTTGGTGCGTATATTATAGTAAACCACCATC\n+TATTCAAGAGTTTCAATGGATCCATGGAAGTGCGATCAATGCATGGAACATTCAGAGTGA\n+AGAATTTGCATAGCCTGAGCGTTTTACCGATCAAAGGCAGAGACATTATCATCATAAAGA\n+TGCCAAAGGATTTTCCTGTTTTCCCACAAAAACTGCACTTCCGAGCTCCAGTGCAGAATG\n+AGAGGATTTGTTTGGTTGGAACTAATTTTCAAGAAAAACATGCATCATCAATCATCACAG\n+AAACGAGTACTACATACAATGTACCGGGCAGCACTTTTTGGAAGCATTGGATTGAAACAA\n+ATGATGGGCATTGTGGATTACCAGTAGTGAGTACAGCTGATGGATGTCTAGTTGGAATAC\n+ACAGCTTGGCGAATAATGTGCAAACCACGAATTATTATTCAGCCTTTGATGAGGATTTTG\n+AAAGTAAGTATCTCCGAACTAATGAGCATAATGAGTGGACCAAATCGTGGGTATATAACC\n+CAGATACTGTGTTGTGGGGTCCATTGAAGCTCAAGGAGAGTACCCCTAAGGGCCTGTTTA\n+AGACAACAAAACTTGTACAGGATTTAATTGATCATGATGTTGTTGTAG
AGCAAGCTAAAC\n+ATTCTGCGTGGATGTATGAGGCTCTAACAGGGAATTTGCAAGCTGTGGCGACAATGAAGA\n+GTCAGCTAGTGACAAAGCACGTGGTCAAAGGGGAGTGTCGGCACTTCAAAGAGTTCTTAA\n+CTGTGGATTCGGAAGCAGAAGCTNNNNNNTTCTTCAGGCCTTTGATGGATGCTTATGGGA\n+AGAGCTTGTTAAATAGAGAAGCATATATAAAGGACATAATGAAATACTCAAAGCCTATTG\n+ATGTTGGAATAGTAGACTGTGATGCTTTTGAAGAGGCTATCAATAGGGTTATCATTTATC\n+TGCAAGTGCATGGCTTCCAGAAATGCAATTACATCACCGATGAGCAGGAAATTTTCAAAG\n+CTCTCAACATGAAAGCTGCTGTCGGAGCTATGTATGGAGGCAAGAAGAAAGACTACTTCN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGTGCAAAGAGAAGA\n+TACTTGCAAATAAGACAAGGACATTCACTGCTGCACCTTTAGATACTCTACTGGGTGGAA\n+AGGTGTGCGTTGATGATTTTAATAATCAGTTCTACTCAAAGAACATTGAATGCTGCTGGA\n+CTGTTGGAATGACTAAGTTTTATGGAGGTTGGGACAAATTGCTTCGGCGTCTACCTGAAA\n+ATTNNNGGGTGTACTGCGATGCCGATGGTTCACAATTCGATAGTTCACTCACCCCATACC\n+TAATTAATGCTGTTCTCATCATCAGAAGCACGTACATGGAAGATTGGGACTTGGGGTTGC\n+AAATGTTGCGCAATTTGTACACAGAGATAATTTACACACCAATCTCAACTCCAGATGGAA\n+CAATTGTCAAGAAGTTTAGAGGTAATAATAGCGGTCAACCTTCTACCGTTGTGGATAATT\n+CTCTCATGGTTGTCCTTGCTATGCATTACGCTCTCATTAAGGAGTGCGTTGAGTTTGAAG\n+AAATCGACAGCACGTGTGTATTCTTTGTTAATGGTGATGACTTATTGATTGCTGTGAATC\n+CGGAGAAAGAGAGCATTCTCGATAGAATGTCACAACATTTCTCAGATCTTGGTTTGAACT\n+ATGATTTTTCGTCGAGAACAAGAAGGAAGGAGGAATTGTGGTTCATGTCCCATAGGGGCC\n+TGCTAATCGAGGGTATGTACGTGCCAAAGCTTGAAGAAGAGAGAANNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAAG\n+GAAATGACACAATCGATGCAGGAGGAAGCACTAAGAAGGATGCAAAACAAGAGCAAGGTA\n+GCATTCAACCAAATCTCAACAAGGAAAAGGAAAAGGATGTGAATGTTGGAACATCTGGAA\n+CTCATACTGTGCCACGAATTAAAGCTATCACGTCTAAAATGAGAATGCCCAAGAGTAAAG\n+GTGCAACTGTACTAAATTTGGAACACTTACTCGAGTATGCTCCACAGCAAATTGACATCT\n+CAAATACTCGGGCAACTCAATCACAGTTTGATACGTGGTATGAAGCAGTACAACTTGCAT\n+ACGACATAGGAGAAACTGAAATGCCAACTGTG
ATGAATGGGCTTATGGTTTGGTGCATTG\n+AAAATGGAACCTCGCCAAACATCAACGGAGTTTGGGTTATGATGGATGGAGATGAACAAG\n+TCGAATACCCACTGAAACCAATCGTTGAGAATGCAAAACCAACACTTAGGCAAATCATGG\n+CACATTTCTCAGATGTTGCAGAAGCGTATATAGAAATGCGCAACAAAAAGGAACCATATA\n+TGCCACGATATGGTTTAGTTCGTAATCTGCGCGATGGAAGTTTGGCTCGCTATGCTTTTG\n+ACTTTTATGAAGTTACATCACGGACACCAGTGAGGGCTAGAGAGGCACACATTCAAATGA\n+AGGCCGCAGCTTTAAAATCAGCTCAATCTCGACTTTTCGGATTGGATGGTGGCATTAGTA\n+CACAAGAGGAAAACACAGAGAGGCACACCACCGAGGATGTTTCTCCAAGTATGCATACTC\n+TACTTGGAGTGAAGAACATGTGATTGTAGTGTCTTTCCGGACGATATATAGATATTTATG\n+TTTGCAGTAAGTATTTTGGCTTTTCCTGTACTACTTTTATCGAAATTAATAATCAGTTTG\n+AATATTACTGGCAGATAGGGGTGGTATAGCGATTCCGTCGTTGTNAGTGACCTTAGCTGT\n+CGTTTCTGTATTATTATGTTTTGTATAAAAGTGCCGGGTTGTTGTTGTTGTGGCTGATCT\n+ATCGATTAGGTGATGTTGCGATTTTGTCGTAACAGTGACTATGTCTGGATATATCTTGCT\n+TGGGTGATGCTGTGATTCTGTCATA\n' |
b |
diff -r 000000000000 -r 457fd8fd681a test-data/virhunter.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/virhunter.loc Wed Nov 09 12:19:26 2022 +0000 |
b |
@@ -0,0 +1,1 @@ +test test ${__HERE__}/weights/test \ No newline at end of file |
b |
diff -r 000000000000 -r 457fd8fd681a test-data/viruses.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/viruses.fasta Wed Nov 09 12:19:26 2022 +0000 |
b |
b'@@ -0,0 +1,15441 @@\n+>MT921846.1 |Tomato spotted wilt tospovirus isolate Blackberry lily segment S, complete sequence\n+AGAGCAATTGTGTCATAATTTTATTCATAATCAAACCTCACTTAGCAAATCACAATACTG\n+TAATAAGAACACAGTACCAATAACCATAATGTCTTCAAGTGTTTATGAGTCGATCATTCA\n+GACAAAAGCTTCAGTCTGGGGATCAACTGCATCTGGTAAAGCTGTTGTAGATTCTTACTG\n+GATTCATGAACTTGGTACTGGTTCTCCACTAGTTCAAACCCAGTTGTATTCTGATTCGAG\n+AAGCAAAAGTAGCTTTGGCTATACTGCAAAGGTAGGGAATCTTCCCTGTGAAGAAGAAGA\n+GATTCTTTCTCAGCATGTGTATATCCCTGTTTTTGATGATATTGATTTTAGCATCAATAT\n+TAATGACTCTGTTCTGGCACTATCTGTTTGCTCAAATACAGTCAATACTAACGGAGTGAA\n+ACATCAAGGTCATTTGAAGGTTTTGTCTCCTGCTCAGCTCCACTCTATTGGATCTACCAT\n+GAACAGATCTGATATTACAGACCGATTCCAGCTCCAAGAAAAAGACATAATTCCCAATGA\n+CAGATACATTGAAGCTGCAAACAAGGGCTCTTTGTCTTGTGTTAAAGAGCATACCTATAA\n+GGTTGAGATGTGCTACAATCAAGCTTTGGGCAAAGTGAATGTTCTATCCCCTAACAGAAA\n+TGTCCATGAATGGCTGTACTGTTTCAAGCCAAATTTCAATCAAGTTGAAAGCAACAACAG\n+AACTGTAAATTCTCTTGCAGTAAAATCTCTGCTCATGTCAGCAGAAAACAACATCATGCC\n+TAACTCTCAGGCTTTTGTAAAAGCTTCCACTGATTCTCATTTCAAACTGAGCCTCTGGCT\n+AAGAGTTCCAAAGGTTTTGAAGCAGATTTCCATTCAGAAATTGTTCAAAGTTGCAGGAGA\n+TGAAACAAACAAAACATTTTATTTATCTATTGCTTGCATTCCAAACCATAACAGTGTTGA\n+GACAGCTTTAAACATTTCTGTTATTTGCAAGCATCAGCTCCCAATCCGTAAATGTCAAGC\n+TCCTTTTGAATTATCAATGATGTTTTCTGATTTAAAGGAGCCTTACAACATTGTTCATGA\n+TCCTTCATATCCCCAGAGGATCGTTCATGCTCTGCTTGAAACTCACACATCTTTTGCACA\n+AGTTCTTTGCAACAACTTGCAAGAAGATGTGATCATCTACACTTTGAACAACTATGAGCT\n+AACTCCTGGAAAGTTAGATCTAGGTGAAAGAACCTTAAATTACAGTGAAGATATCTGCAA\n+AAGGAAATATTTCCTTTCAAAAACACTTGAATGTCTTCCATCCAACACACAAACTATGTC\n+TTACTTAGACAGCATCCAAATCCCTTCCTGGAAGATAGACTTTGCCAAGGGAGAAATTAA\n+AATTTCTCCACAATCTATTGCAGTTGCAAAATCTTTGTTAAAGCTTGATTTAAGCGGGAT\n+CAAAAAGAAAGAATCTAAGATCTCGGAAGCATATGCTTCAGGATCAAAATAATCTTGCTG\n+TGTCCAGCTTTTCCCAATTATGTTATGTTTATTTTCTTTCTTTACTTATAATTATTTCTT\n+TGTTTTGTCATTTCTTTTGAATTTCTCCTGTTTAATAGAAACCATAAAAATAAAAATAAA\n+AATAAAAATAAAATCAAAATGAAACAAAATCAAAAAATGAAACAAAAATCAAACAAAAAT\n+CAAAAAATGAAATAAAACAACAAAAAATTAAAAAACAAAAAACCAAAAAAGATCCCGAAA\n+GGGACGATTTTGGCCAAATTTGGGTTTTGTTTTTGTTTTTTGTTTTTTGT
TTTTTGTTTT\n+TTATTTTATTTTTATTTTTATTATTTATTCTATTTTATTTTATTTTTATTTTTATTTTTA\n+TTTTTATTTTATGTTTTTTGTTGTTTTTGTTATTTTGTTTATTATTAAGCACAACACACA\n+GAAAGCAAACTTTAATTAAACACACTTATTTTAAAATTAACACACTACAGCAAGCACAAG\n+CAATAAAGATAAAGAAAGCTTTATATATTTATAGGCTTTTTTATAATTTAACTACAGCTG\n+CTTTTAAGCAAGTTCTGCAAGTTTTGCCTGCTTTTTAACCCCGAACATCTCATAGAACTT\n+GTTAAGAGTTTCACTGTAATGTTCCATAGCAATACTTCCTTTAGCATTAGGATTGCTGGA\n+GCCAAGTATAGCAGCATACTCTTTCCCTTTCTTCACCTGATCTTCATTCATTTCAAATGC\n+TTTGCTTTTTAGCACAGTGCAAACTTTTCCTAAGGCTTCCCTGGTGTCATACTTCTTTGG\n+ATCGATCCCGAGGTCTTTGTATTTTGCATCCTGATATATAGCCAAGACAACACTGATCAT\n+CTCAAAGCTATCAACTGAAGCAATAAGAGGTAAGCTACCTCCCAGCATTATGGCAAGCCT\n+CACAGACTTTGCATCATCAAGAGGTAATCCATAGGCTTGAATCAAAGGGTGAGAAGCAAT\n+CTTAGATTTGATAGTATTAAGATTCTCAGAATTCCCAGTTTCCTCGACAAGCCTGACCCT\n+GATCAAGCTATCAAGCCTTCTGAAGGTCATGTCAGTGGCTCCAATCCTGTCTGAAGTTTT\n+CTTTATGGTAATTTTACCAAAAGTGAAATCACTTTGCTTAATAACCTTCATTATACTCTG\n+ACGATTCTTCAGGAATGTCAGACATGAAATAATGCTCATCTTCTTGATCTGGTCAAGGTT\n+TTCCAGACAAAAAGTCTTGAAGTTGAATGCTACCAGATTCTGATCTTCCTCAAATTCAAG\n+ATCTTTGCCTTGTGTCAACAAAGCAACAATGTTTTCCTTAGTGAGCTTAACCTTAGACAT\n+GATGATCGTAAAAGTTGTTATATGCTTTGACCGTATGTAATTCAAGGTGCGAAAGTACAA\n+CTCTGTATTCCGCAGTCGTTTCTTAGGGTTTTAATGTGATGATTTGTAAGACTGAGTGTT\n+AAGGTTTGAATAAAATTGACACAATCGCTCT\n+>MF540775.1 |Ti ringspot-associated emaravirus isolate Ti1 RNA-dependent RNA polymerase (RdRp) gene, partial 
cds\n+GGGGGCTTGGGCTAACATGATATCAGTGAATGTTCTACCATTTAATCTAAACTGTGATTC\n+TAACATTTCTTTAAATGTATCAGGTGATTTAGTCTTTAAAAATGTTAGGAAGTTAATTTC\n+CTCATGTTGTTTATGCTCAATTCTTTGTTTATAGACTAGCTTTTCATGACTTAATTTTAA\n+CTTATGACTAAGACTAAAAGTTCTGATAGTGTTTTCATCATCATACATTCCAAATTTATT\n+TAGTATATAGTCCTCTATCTTATCACTCCTATTGGTGCTTATATGTAACTTACTGCTAAC\n+TAGATCTCTCTTCATGAATGAATCAATTGTTAACAAGACACTTTTTGGAGTTGTATAATT\n+TATCTTGTGAGACTTATTCCCAATCCTAATAGTTATATCCTTCAAGTATTTGACACACAA\n+ACTCATGAACTTTATTTTGATATTATATATCTTAATATCGAATGCATTATCTATGTTTTG\n+ATTTGGTATAAGACAAGAACTGTAATCAGTTGTTTTGTACTTTAAATTGCAATAATCAAT\n+GAAATCATGATCAATATCTTCTGGAAATTCATAAACCTTTACCACTCCAAATGGATTATC\n+GTATGAGGAGAAGTTTATGTTATTTAATATACTTTCACTTATATCTAAAAACTTTTCACA\n+CTTCGGGTCAATG'..b'AAT\n+TCTTTGGATCTGCATACAGGAAGAAGGGAAAAGGTAAAGGCACCACTGTTGGTATGGGCA\n+AGTCAAGCAGGAGGTTTGTTAATATGTATGGATTTGACCCAACAGAATATTCATTCATCC\n+AGTTCGTTGATCCGCTCACTGGAGCTCAAATTGAAGAGAACGTCTATGCTGATANNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNTAAGGGATTTCAATCCAATTGCTCAAACAGTTTGCAGAGTAAAAGTGTCTGTTGAAC\n+ATGGAACGTCTGAAATGTATGGGTTCGGTTTTGGTGCGTATATTATAGTAAACCACCATC\n+TATTCAAGAGTTTCAATGGATCCATGGAAGTGCGATCAATGCATGGAACATTCAGAGTGA\n+AGAATTTGCATAGCCTGAGCGTTTTACCGATCAAAGGCAGAGACATTATCATCATAAAGA\n+TGCCAAAGGATTTTCCTGTTTTCCCACAAAAACTGCACTTCCGAGCTCCAGTGCAGAATG\n+AGAGGATTTGTTTGGTTGGAACTAATTTTCAAGAAAAACATGCATCATCAATCATCACAG\n+AAACGAGTACTACATACAATGTACCGGGCAGCACTTTTTGGAAGCATTGGATTGAAACAA\n+ATGATGGGCATTGTGGATTACCAGTAGTGAGTACAGCTGATGGATGTCTAGTTGGAATAC\n+ACAGCTTGGCGAATAATGTGCAAACCACGAATTATTATTCAGCCTTTGATGAGGATTTTG\n+AAAGTAAGTATCTCCGAACTAATGAGCATAATGAGTGGACCAAATCGTGGGTATATAACC\n+CAGATACTGTGTTGTGGGGTCCATTGAAGCTCAAGGAGAGTACCCCTAAGGGCCTGTTTA\n+AGACAACAAAACTTGTACAGGATTTAATTGATCATGATGTTGTTGTAGAGCAAGCTAAAC\n+ATTCTGCGTGGATGTAT
GAGGCTCTAACAGGGAATTTGCAAGCTGTGGCGACAATGAAGA\n+GTCAGCTAGTGACAAAGCACGTGGTCAAAGGGGAGTGTCGGCACTTCAAAGAGTTCTTAA\n+CTGTGGATTCGGAAGCAGAAGCTNNNNNNTTCTTCAGGCCTTTGATGGATGCTTATGGGA\n+AGAGCTTGTTAAATAGAGAAGCATATATAAAGGACATAATGAAATACTCAAAGCCTATTG\n+ATGTTGGAATAGTAGACTGTGATGCTTTTGAAGAGGCTATCAATAGGGTTATCATTTATC\n+TGCAAGTGCATGGCTTCCAGAAATGCAATTACATCACCGATGAGCAGGAAATTTTCAAAG\n+CTCTCAACATGAAAGCTGCTGTCGGAGCTATGTATGGAGGCAAGAAGAAAGACTACTTCN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGTGCAAAGAGAAGA\n+TACTTGCAAATAAGACAAGGACATTCACTGCTGCACCTTTAGATACTCTACTGGGTGGAA\n+AGGTGTGCGTTGATGATTTTAATAATCAGTTCTACTCAAAGAACATTGAATGCTGCTGGA\n+CTGTTGGAATGACTAAGTTTTATGGAGGTTGGGACAAATTGCTTCGGCGTCTACCTGAAA\n+ATTNNNGGGTGTACTGCGATGCCGATGGTTCACAATTCGATAGTTCACTCACCCCATACC\n+TAATTAATGCTGTTCTCATCATCAGAAGCACGTACATGGAAGATTGGGACTTGGGGTTGC\n+AAATGTTGCGCAATTTGTACACAGAGATAATTTACACACCAATCTCAACTCCAGATGGAA\n+CAATTGTCAAGAAGTTTAGAGGTAATAATAGCGGTCAACCTTCTACCGTTGTGGATAATT\n+CTCTCATGGTTGTCCTTGCTATGCATTACGCTCTCATTAAGGAGTGCGTTGAGTTTGAAG\n+AAATCGACAGCACGTGTGTATTCTTTGTTAATGGTGATGACTTATTGATTGCTGTGAATC\n+CGGAGAAAGAGAGCATTCTCGATAGAATGTCACAACATTTCTCAGATCTTGGTTTGAACT\n+ATGATTTTTCGTCGAGAACAAGAAGGAAGGAGGAATTGTGGTTCATGTCCCATAGGGGCC\n+TGCTAATCGAGGGTATGTACGTGCCAAAGCTTGAAGAAGAGAGAANNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAAG\n+GAAATGACACAATCGATGCAGGAGGAAGCACTAAGAAGGATGCAAAACAAGAGCAAGGTA\n+GCATTCAACCAAATCTCAACAAGGAAAAGGAAAAGGATGTGAATGTTGGAACATCTGGAA\n+CTCATACTGTGCCACGAATTAAAGCTATCACGTCTAAAATGAGAATGCCCAAGAGTAAAG\n+GTGCAACTGTACTAAATTTGGAACACTTACTCGAGTATGCTCCACAGCAAATTGACATCT\n+CAAATACTCGGGCAACTCAATCACAGTTTGATACGTGGTATGAAGCAGTACAACTTGCAT\n+ACGACATAGGAGAAACTGAAATGCCAACTGTGATGAATGGGCTTATGGTTTGGTGCATTG\n+A
AAATGGAACCTCGCCAAACATCAACGGAGTTTGGGTTATGATGGATGGAGATGAACAAG\n+TCGAATACCCACTGAAACCAATCGTTGAGAATGCAAAACCAACACTTAGGCAAATCATGG\n+CACATTTCTCAGATGTTGCAGAAGCGTATATAGAAATGCGCAACAAAAAGGAACCATATA\n+TGCCACGATATGGTTTAGTTCGTAATCTGCGCGATGGAAGTTTGGCTCGCTATGCTTTTG\n+ACTTTTATGAAGTTACATCACGGACACCAGTGAGGGCTAGAGAGGCACACATTCAAATGA\n+AGGCCGCAGCTTTAAAATCAGCTCAATCTCGACTTTTCGGATTGGATGGTGGCATTAGTA\n+CACAAGAGGAAAACACAGAGAGGCACACCACCGAGGATGTTTCTCCAAGTATGCATACTC\n+TACTTGGAGTGAAGAACATGTGATTGTAGTGTCTTTCCGGACGATATATAGATATTTATG\n+TTTGCAGTAAGTATTTTGGCTTTTCCTGTACTACTTTTATCGAAATTAATAATCAGTTTG\n+AATATTACTGGCAGATAGGGGTGGTATAGCGATTCCGTCGTTGTNAGTGACCTTAGCTGT\n+CGTTTCTGTATTATTATGTTTTGTATAAAAGTGCCGGGTTGTTGTTGTTGTGGCTGATCT\n+ATCGATTAGGTGATGTTGCGATTTTGTCGTAACAGTGACTATGTCTGGATATATCTTGCT\n+TGGGTGATGCTGTGATTCTGTCATA\n' |
b |
diff -r 000000000000 -r 457fd8fd681a test-data/weights/test/RF_1000.joblib |
b |
Binary file test-data/weights/test/RF_1000.joblib has changed |
b |
diff -r 000000000000 -r 457fd8fd681a test-data/weights/test/RF_500.joblib |
b |
Binary file test-data/weights/test/RF_500.joblib has changed |
b |
diff -r 000000000000 -r 457fd8fd681a test-data/weights/test/model_5_1000.h5 |
b |
Binary file test-data/weights/test/model_5_1000.h5 has changed |
b |
diff -r 000000000000 -r 457fd8fd681a test-data/weights/test/model_5_500.h5 |
b |
Binary file test-data/weights/test/model_5_500.h5 has changed |
b |
diff -r 000000000000 -r 457fd8fd681a test-data/weights/test/model_7_1000.h5 |
b |
Binary file test-data/weights/test/model_7_1000.h5 has changed |
b |
diff -r 000000000000 -r 457fd8fd681a test-data/weights/test/model_7_500.h5 |
b |
Binary file test-data/weights/test/model_7_500.h5 has changed |
b |
diff -r 000000000000 -r 457fd8fd681a tool-data/virhunter.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/virhunter.loc Wed Nov 09 12:19:26 2022 +0000 |
b |
@@ -0,0 +1,1 @@ +test test ${__HERE__}/weights/test \ No newline at end of file |
b |
diff -r 000000000000 -r 457fd8fd681a tool-data/virhunter.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/virhunter.loc.sample Wed Nov 09 12:19:26 2022 +0000 |
b |
@@ -0,0 +1,29 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Samtools indexed sequences data files. You will need +#to create these data files and then create a fasta_indexes.loc file +#similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The fasta_indexes.loc +#file has this format (white space characters are TAB characters): +# +# <unique_build_id> <dbkey> <display_name> <file_base_path> +# +#So, for example, if you had hg19 Canonical indexed stored in +# +# /depot/data2/galaxy/hg19/sam/, +# +#then the fasta_indexes.loc entry would look like this: +# +#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /depot/data2/galaxy/hg19/sam/hg19canon.fa +# +#and your /depot/data2/galaxy/hg19/sam/ directory +#would contain hg19canon.fa and hg19canon.fa.fai files. +# +#Your fasta_indexes.loc file should include an entry per line for +#each index set you have stored. The file in the path does actually +#exist, but it should never be directly used. Instead, the name serves +#as a prefix for the index file. For example: +# +#hg18canon hg18 Human (Homo sapiens): hg18 Canonical /depot/data2/galaxy/hg18/sam/hg18canon.fa +#hg18full hg18 Human (Homo sapiens): hg18 Full /depot/data2/galaxy/hg18/sam/hg18full.fa +#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /depot/data2/galaxy/hg19/sam/hg19canon.fa +#hg19full hg19 Human (Homo sapiens): hg19 Full /depot/data2/galaxy/hg19/sam/hg19full.fa \ No newline at end of file |
b |
diff -r 000000000000 -r 457fd8fd681a tool-data/weights/test/RF_1000.joblib |
b |
Binary file tool-data/weights/test/RF_1000.joblib has changed |
b |
diff -r 000000000000 -r 457fd8fd681a tool-data/weights/test/RF_500.joblib |
b |
Binary file tool-data/weights/test/RF_500.joblib has changed |
b |
diff -r 000000000000 -r 457fd8fd681a tool-data/weights/test/model_5_1000.h5 |
b |
Binary file tool-data/weights/test/model_5_1000.h5 has changed |
b |
diff -r 000000000000 -r 457fd8fd681a tool-data/weights/test/model_5_500.h5 |
b |
Binary file tool-data/weights/test/model_5_500.h5 has changed |
b |
diff -r 000000000000 -r 457fd8fd681a tool-data/weights/test/model_7_1000.h5 |
b |
Binary file tool-data/weights/test/model_7_1000.h5 has changed |
b |
diff -r 000000000000 -r 457fd8fd681a tool-data/weights/test/model_7_500.h5 |
b |
Binary file tool-data/weights/test/model_7_500.h5 has changed |
b |
diff -r 000000000000 -r 457fd8fd681a tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Wed Nov 09 12:19:26 2022 +0000 |
b |
@@ -0,0 +1,10 @@ +<?xml version="1.0"?> +<tables> + <!-- Locations of indexes in the virhunter mapper format for virhunter versions XXXXX--> + <table name="virhunter_models" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/virhunter.loc" /> + </table> +</tables> + + |
b |
diff -r 000000000000 -r 457fd8fd681a tool_data_table_conf.xml.test --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Wed Nov 09 12:19:26 2022 +0000 |
b |
@@ -0,0 +1,7 @@ +<tables> + <!-- Locations of indexes in the virhunter mapper format for virhunter versions XXXXX--> + <table name="virhunter_models" comment_char="#"> + <columns>value, name, path</columns> + <file path="${__HERE__}/test-data/virhunter.loc" /> + </table> +</tables> \ No newline at end of file |
b |
diff -r 000000000000 -r 457fd8fd681a utils/batch_loader.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/batch_loader.py Wed Nov 09 12:19:26 2022 +0000 |
[ |
@@ -0,0 +1,73 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Credits: Grigorii Sukhorukov, Macha Nikolski +import numpy as np +from sklearn.utils import shuffle +from tensorflow import keras + + +class BatchLoader(keras.utils.Sequence): + """Helper to iterate over the data (as Numpy arrays).""" + def __init__( + self, + input_seqs, + input_seqs_rc, + input_labs, + batches, + rc=True, + random_seed=1 + ): + self.input_seqs = input_seqs + self.input_seqs_rc = input_seqs_rc + self.input_labs = input_labs + self.batches = batches + self.rc = rc + self.random_seed = random_seed + + def __len__(self): + return len(self.batches) + + def __getitem__(self, idx): + batch = sorted(self.batches[idx]) + batch_seqs, batch_seqs_rc, batch_labs = shuffle( + np.array(self.input_seqs[batch, ...]), + np.array(self.input_seqs_rc[batch, ...]), + np.array(self.input_labs[batch, ...]), + random_state=self.random_seed + ) + # adding reverse batches + # batch_seqs = np.concatenate((batch_seqs, batch_seqs[:, ::-1, ...])) + # batch_seqs_rc = np.concatenate((batch_seqs_rc, batch_seqs_rc[:, ::-1, ...])) + # batch_labs = np.concatenate((batch_labs, batch_labs[:, ::-1, ...])) + if self.rc: + return (batch_seqs, batch_seqs_rc), batch_labs + else: + return batch_seqs, batch_labs + + +class BatchGenerator: + """Helper to iterate over the data (as Numpy arrays).""" + def __init__( + self, + input_seqs, + input_seqs_rc, + input_labs, + batches, + random_seed=1 + ): + self.input_seqs = input_seqs + self.input_seqs_rc = input_seqs_rc + self.input_labs = input_labs + self.batches = batches + self.random_seed = random_seed + + def __call__(self): + for batch in self.batches: + batch = sorted(batch) + batch_seqs, batch_seqs_rc, batch_labs = shuffle( + np.array(self.input_seqs[batch, ...]), + np.array(self.input_seqs_rc[batch, ...]), + np.array(self.input_labs[batch, ...]), + random_state=self.random_seed + ) + yield (batch_seqs, batch_seqs_rc), batch_labs |
b |
diff -r 000000000000 -r 457fd8fd681a utils/preprocess.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/utils/preprocess.py Wed Nov 09 12:19:26 2022 +0000 |
[ |
b'@@ -0,0 +1,381 @@\n+#!/usr/bin/env python\n+# -*- coding: utf-8 -*-\n+# Credits: Grigorii Sukhorukov, Macha Nikolski\n+import math\n+import os\n+import pathlib\n+import random\n+\n+import h5py\n+import numpy as np\n+from Bio import SeqIO\n+from Bio.Seq import Seq\n+from Bio.SeqRecord import SeqRecord\n+from sklearn.utils import shuffle\n+\n+\n+def reverse_complement(fragment):\n+ """\n+ provides reverse complement to sequences\n+ Input:\n+ sequences - list with SeqRecord sequences in fasta format\n+ Output:\n+ complementary_sequences -\n+ list with SeqRecord complementary sequences in fasta format\n+ """\n+ # complementary_sequences = []\n+ # for sequence in sequences:\n+ # complementary_sequence = SeqRecord(\n+ # seq=Seq(sequence.seq).reverse_complement(),\n+ # id=sequence.id + "_reverse_complement",\n+ # )\n+ # complementary_sequences.append(complementary_sequence)\n+ fragment = fragment[::-1].translate(str.maketrans(\'ACGT\', \'TGCA\'))\n+ return fragment\n+\n+\n+def introduce_mutations(seqs, mut_rate, rs=None):\n+ """\n+ Function that mutates sequences in the entering fasta file\n+ A proportion of nucleotides are changed to other nucleotide\n+ Not yet taking account of mutation for gaps\n+ mut_rate - proportion from 0.0 to 1.0, float\n+ """\n+ random.seed(a=rs)\n+ assert 0.0 <= mut_rate <= 1.0\n+ mutated_seqs = []\n+ for seq in seqs:\n+ mut_seq = list(str(seq.seq))\n+ l_ = len(mut_seq)\n+ mutated_sites_i = random.sample(range(l_), int(mut_rate * l_))\n+ for mut_site_i in mutated_sites_i:\n+ mut_site = mut_seq[mut_site_i]\n+ mutations = ["A", "C", "T", "G"]\n+ if mut_site in mutations:\n+ mutations.remove(mut_site)\n+ mut_seq[mut_site_i] = random.sample(mutations, 1)[0]\n+ mutated_seq = SeqRecord(\n+ seq=Seq("".join(mut_seq)),\n+ id=seq.id + f"mut_{mut_rate}",\n+ name="",\n+ description="",\n+ )\n+ mutated_seqs.append(mutated_seq)\n+ return mutated_seqs\n+\n+\n+def separate_by_length(length_, seq_list, fold=None,):\n+ # TODO: add docs\n+ included = []\n+ 
to_process = []\n+ excluded = 0\n+ for seq_ in seq_list:\n+ l_ = len(seq_.seq)\n+ if l_ >= length_:\n+ if fold is None:\n+ included.append(seq_)\n+ elif l_ < length_ * fold:\n+ included.append(seq_)\n+ else:\n+ to_process.append(seq_)\n+ else:\n+ excluded += 1\n+ print(f"A total of {excluded} sequences was excluded due to being smaller than {length_}")\n+ return included, to_process\n+\n+\n+def chunks(lst, n):\n+ """Yield successive n-sized chunks from lst.\n+ https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks"""\n+ for i in range(0, len(lst), n):\n+ yield lst[i:i + n]\n+\n+\n+def correct(frag):\n+ """\n+ leaves only unambiguous DNA code (ACTG-)\n+ Input:\n+ frag - string of nucleotides\n+ Output:\n+ pr_frag - corrected string of nucleotides\n+ """\n+ pr_frag = frag.upper()\n+ pr_frag_s = set(pr_frag)\n+ if pr_frag_s != {"A", "C", "G", "T", "-"}:\n+ for letter in pr_frag_s - {"A", "C", "G", "T", "-"}:\n+ pr_frag = pr_frag.replace(letter, "-")\n+ return pr_frag\n+\n+\n+def fragmenting(sequences, sl_wind_size, max_gap=0.05, sl_wind_step=None):\n+ """\n+ slices sequences in fragments by sliding window\n+ based on its size and step.\n+ last fragment is padded by \'-\'\n+ fragments have ambiguous bases replaced by \'-\'\n+ fragments with many \'-\' are discarded\n+ Input:\n+ sequences - list with SeqRecord sequences in fasta format\n+ max_gap - max allowed proportion of \'-\'\n+ sl_wind_size - sliding window step\n+ sl_wind_step - sliding window step, by default equals\n+ sliding window size (None is replaced by it)\n+ Output:\n+ fragments - list with sequence fragments\n+ '..b' n_fragments = entry[1]\n+ seqs = []\n+ fragments = []\n+ fragments_rc = []\n+ counter_1 = 0\n+ counter_2 = 0\n+ while counter_1 < n_fragments:\n+ # select chromosomes if there are any\n+ fragment_full = random.choice(seq)\n+ r_end = len(fragment_full.seq) - length\n+ try:\n+ r_start = random.randrange(r_end)\n+ fragment = SeqRecord(\n+ 
seq=fragment_full.seq[r_start:(r_start + length)],\n+ id=f"{fragment_full.id}_{length}_{r_start}",\n+ name="",\n+ description="",\n+ )\n+ temp_, temp_rc, _ = fragmenting([fragment], length, max_gap, sl_wind_step=sl_wind_step)\n+ if temp_ and temp_rc:\n+ seqs.append(fragment)\n+ fragments.extend(temp_)\n+ fragments_rc.extend(temp_rc)\n+ counter_1 += 1\n+ except ValueError:\n+ # print(f"{fragment_full.id} has length {len(fragment_full.seq)} and is too short to be sampled")\n+ pass\n+ counter_2 += 1\n+ if limit:\n+ assert counter_2 <= limit * n_fragments, f"While cycle iterated more than {limit}, data is ambiguous." \\\n+ f" Only {len(fragments)} fragments were sampled out of {n_fragments}"\n+ total_fragments.extend(fragments)\n+ total_fragments_rc.extend(fragments_rc)\n+ total_seqs.extend(seqs)\n+ # print("sequence sampling done")\n+ return total_fragments, total_fragments_rc, total_seqs\n+\n+\n+def prepare_ds_fragmenting(in_seq, label, label_int, fragment_length, sl_wind_step, max_gap=0.05, n_cpus=1):\n+ if sl_wind_step is None:\n+ sl_wind_step = int(fragment_length / 2)\n+ # generating viral fragments and labels\n+ seqs = list(SeqIO.parse(in_seq, "fasta"))\n+ frags, frags_rc, seqs_ = fragmenting(seqs, fragment_length, max_gap=max_gap, sl_wind_step=sl_wind_step)\n+ encoded = one_hot_encode(frags)\n+ encoded_rc = one_hot_encode(frags_rc)\n+ labs = prepare_labels(frags, label=label_int, label_depth=3)\n+ seqs_ = label_fasta_fragments(seqs_, label=label)\n+ # subsetting to unique fragments\n+ u_encoded, indices = np.unique(encoded, axis=0, return_index=True)\n+ u_encoded_rc = encoded_rc[indices]\n+ u_labs = labs[indices]\n+ u_seqs = [seqs_[i] for i in indices]\n+ assert (np.shape(u_encoded)[0] == np.shape(u_encoded_rc)[0])\n+ print(f"Encoding {label} sequences finished")\n+ # print(f"{np.shape(u_encoded)[0]} forward fragments generated")\n+ n_frags = np.shape(u_encoded)[0]\n+ return u_encoded, u_encoded_rc, u_labs, u_seqs, n_frags\n+\n+\n+def 
prepare_ds_sampling(in_seqs, fragment_length, n_frags, label, label_int, random_seed, n_cpus=1, limit=100):\n+ # generating plant fragments and labels\n+ seqs_list = prepare_seq_lists(in_seqs, n_frags)\n+ frags, frags_rc, seqs_ = sample_fragments.remote(seqs_list, fragment_length, random_seed, limit=limit, max_gap=0.05)\n+ frags, frags_rc, seqs_ = shuffle(frags, frags_rc, seqs_, random_state=random_seed, n_samples=int(n_frags))\n+ encoded = one_hot_encode(frags)\n+ encoded_rc = one_hot_encode(frags_rc)\n+ labs = prepare_labels(frags, label=label_int, label_depth=3)\n+ seqs_ = label_fasta_fragments(seqs_, label=label)\n+ assert (np.shape(encoded)[0] == np.shape(encoded_rc)[0])\n+ print(f"Encoding {label} sequences finished")\n+ # print(f"{np.shape(encoded)[0]} forward fragments generated")\n+ return encoded, encoded_rc, labs, seqs_, n_frags\n+\n+\n+def storing_encoded(encoded, encoded_rc, labs, out_path, ):\n+ f = h5py.File(out_path, "w")\n+ f.create_dataset("fragments", data=encoded)\n+ f.create_dataset("fragments_rc", data=encoded_rc)\n+ f.create_dataset("labels", data=labs)\n+ f.close()\n+ print(f"encoded fragments and labels stored in {out_path}")\n' |
b |
diff -r 000000000000 -r 457fd8fd681a virhunter.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/virhunter.xml Wed Nov 09 12:19:26 2022 +0000 |
[ |
@@ -0,0 +1,55 @@ +<tool id="virhunter" name="virhunter" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.05"> + <description> + Deep learning method to identify viruses in sequencing datasets. + </description> + <macros> + <import>macros.xml</import> + </macros> + <xrefs> + <xref type="bio.tools">virhunter</xref> + </xrefs> + <expand macro="requirements"/> + <command detect_errors="exit_code"><![CDATA[ + + mkdir -p '${predicted_fragments.extra_files_path}' && + python '$__tool_directory__/predict.py' + --test_ds '${fasta_file}' + --weights '${weights.fields.path}' + --out_path '${predicted_fragments.extra_files_path}' + --return_viral True + --limit $limit + && cp '${predicted_fragments.extra_files_path}'/predicted_fragments.csv predicted_fragments.csv + && cp '${predicted_fragments.extra_files_path}'/predicted.csv predicted.csv + && cp '${predicted_fragments.extra_files_path}'/viral.fasta viral.fasta + + ]]></command> + <inputs> + <param name="fasta_file" type="data" format="fasta" label="DNA FASTA file(s)"/> + <param name="weights" type="select" label="Select a reference model" help="If your model of interest is not listed, contact the Galaxy team"> + <options from_data_table="virhunter_models"> + <validator type="no_options" message="No models are available for the selected input dataset" /> + </options> + </param> + <param argument="--limit" type="integer" min="0" value="750" label="Minimum contig length" help="Do not predict contigs shorter than this value. 
Default: 750" /> + </inputs> + <outputs> + <data format="csv" name="predicted_fragments" from_work_dir="predicted_fragments.csv" label="${tool.name} on ${on_string}: predicted fragments"/> + <data format="csv" name="predicted" from_work_dir="predicted.csv" label="${tool.name} on ${on_string}: predicted "/> + <data format="fasta" name="viral" from_work_dir="viral.fasta" label="${tool.name} on ${on_string}: viral FASTA file" /> + </outputs> + <tests> + <test> + <param name="fasta_file" value="viruses.fasta"/> + <param name="weights" value="test"/> + <output name="predicted_fragments" file="predicted_fragments.csv" ftype="csv" lines_diff="2"/> + <output name="predicted" file="predicted.csv" ftype="csv" lines_diff="2"/> + <output name="viral" file="viral.fasta" ftype="fasta" lines_diff="2"/> + </test> + </tests> + + <help> + <![CDATA[ + VirHunter is a deep learning method that uses Convolutional Neural Networks (CNNs) and a Random Forest Classifier to identify viruses in sequencing datasets. More precisely, VirHunter classifies previously assembled contigs as viral, host and bacterial (contamination). + ]]></help> + <expand macro="citations" /> +</tool> \ No newline at end of file |
b |
diff -r 000000000000 -r 457fd8fd681a virhunter.yml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/virhunter.yml Wed Nov 09 12:19:26 2022 +0000 |
b |
@@ -0,0 +1,17 @@ +name: virhunter +channels: + - defaults + - bioconda + - conda-forge +dependencies: + - biopython=1.78 + - h5py=2.10.0 + - joblib=1.0.0 + - pandas=1.2.1 + - pip=20.3.3 + - python=3.8.5 + - scikit-learn=0.23.2 + - scipy=1.5.2 + - wget=1.20.1 + - pip: + - pyyaml==5.4 |