Commit message:
Uploaded |
added:
cpt_related_genome_prot/TaxID_List.txt cpt_related_genome_prot/cpt-macros.xml cpt_related_genome_prot/macros.xml cpt_related_genome_prot/prot_relate.xml cpt_related_genome_prot/relatedness_prot.py cpt_related_genome_prot/test-data/prot_relate_in.tab cpt_related_genome_prot/test-data/prot_relate_out.tab |
b |
diff -r 000000000000 -r ebcc87a27f9c cpt_related_genome_prot/TaxID_List.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_related_genome_prot/TaxID_List.txt Fri Jun 10 08:46:28 2022 +0000 |
b |
b'@@ -0,0 +1,14253 @@\n+10472\tPlasmaviridae\n+10473\tPlasmavirus\n+10656\tTectiviridae\n+10658\tEnterobacteria phage PRD1\n+10660\tCorticovirus\n+10661\tPseudoalteromonas virus PM2\n+10662\tMyoviridae\n+10663\tTequatrovirus\n+10664\tEnterobacteria phage T2\n+10665\tEscherichia virus T4\n+10666\tEnterobacteria phage T6\n+10667\tBacillus phage PBSX\n+10674\tEnterobacteria phage K3\n+10676\tEnterobacteria phage M1\n+10677\tEscherichia virus Mu\n+10678\tEscherichia virus P1\n+10679\tEscherichia virus P2\n+10680\tEnterobacteria phage P4\n+10681\tEnterobacteria phage phiR73\n+10682\tEnterobacteria phage P7\n+10683\tBacillus virus PBS1\n+10684\tBacillus phage PBS2\n+10685\tBacillus virus SPO1\n+10686\tEnterobacteria phage TuIa\n+10687\tEnterobacteria phage TuIb\n+10689\tVibrio phage CP-T1\n+10690\tHaemophilus virus HP1\n+10691\tEnterobacteria phage Ox2\n+10692\tEnterobacteria phage RB18\n+10693\tEnterobacteria phage RB51\n+10696\tBacillus phage SP82\n+10699\tSiphoviridae\n+10701\tActinophage RP3\n+10702\tStreptomyces phage VWB\n+10703\tCorynephage beta\n+10704\tRhizobium phage 16-3\n+10705\tEnterobacteria phage 82\n+10707\tEscherichia virus BF23\n+10708\tPseudomonas virus D3112\n+10710\tEscherichia virus Lambda\n+10711\tEnterobacteria phage P21\n+10712\tPhage 434\n+10713\tEnterobacteria phage phi80\n+10714\tCorynephage omega\n+10715\tStaphylococcus phage 42D.m\n+10717\tBacillus phage phi105\n+10719\tStreptomyces virus phiC31\n+10721\tStaphylococcus phage S phi-C\n+10723\tBacillus phage SPO2\n+10724\tBacillus phage SPP1\n+10725\tBacillus phage SPR\n+10726\tEscherichia virus T5\n+10727\tStaphylococcus phage L54a\n+10728\tStreptococcus pneumoniae phage HB-3\n+10730\tEscherichia phage 933W\n+10732\tStreptomyces phage R4\n+10733\tCorynebacterium phage gamma\n+10735\tBacillus phage rho11s\n+10736\tBacillus phage phi3T\n+10737\tEnterobacteria phage phi21\n+10738\tEnterobacteria phage PA-2\n+10742\tEscherichia virus HK022\n+10743\tPhage 
21\n+10744\tPodoviridae\n+10746\tLactococcus phage phi-vML3\n+10747\tStreptococcus phage Cp1\n+10748\tStreptococcus phage CP-7\n+10749\tStreptococcus phage CP-9\n+10750\tEnterobacteria phage LP7\n+10752\tEscherichia virus N4\n+10753\tBacillus phage Nf\n+10754\tSalmonella virus P22\n+10755\tPhage phi-15\n+10756\tBacillus virus phi29\n+10757\tBacillus phage PZA\n+10759\tEnterobacteria phage T3\n+10760\tEscherichia phage T7\n+10761\tShigella virus Sf6\n+10765\tBacillus phage H1\n+10766\tEnterobacteria phage K11\n+10773\tBacillus phage SF6\n+10778\tBacillus virus B103\n+10779\tBacillus phage BS32\n+10841\tMicroviridae\n+10842\tMicrovirus\n+10843\tEscherichia phage G4\n+10844\tEnterobacteria phage S13\n+10845\tEscherichia phage St-1\n+10847\tEscherichia virus phiX174\n+10848\tEscherichia virus phiK\n+10849\tEscherichia phage alpha3\n+10850\tEnterobacteria phage G14\n+10851\tEnterobacteria phage U3\n+10852\tSpiromicrovirus\n+10854\tSpiroplasma phage 1-R8A2B\n+10855\tSpiroplasma virus SpV4\n+10856\tBdellomicrovirus\n+10860\tInoviridae\n+10861\tInovirus\n+10862\tXanthomonas virus Cf1c\n+10863\tEnterobacteria phage f1\n+10864\tEnterobacteria phage fd\n+10866\tEnterobacteria phage ZJ/2\n+10867\tEnterobacteria phage Ike\n+10868\tEnterobacteria phage If1\n+10869\tEscherichia virus I22\n+10872\tPseudomonas phage Pf3\n+10874\tShigella phage SfX\n+10875\tPlectrovirus\n+10877\tCystoviridae\n+10878\tCystovirus\n+10879\tPseudomonas virus phi6\n+11989\tLeviviridae\n+11990\tLevivirus\n+12008\tAllolevivirus\n+12014\tEnterobacteria phage BO1\n+12015\tEnterobacteria phage isolate BZ13\n+12016\tEnterobacteria phage f2\n+12017\tEnterobacteria phage fr\n+12018\tEnterobacteria phage GA\n+12019\tEnterobacteria phage JP34\n+12020\tEnterobacteria phage JP501\n+12021\tEnterobacteria phage KU1\n+12022\tEnterobacteria phage MS2\n+12023\tPseudomonas phage PP7\n+12024\tPseudomonas phage PRR1\n+12026\tEnterobacteria phage R17\n+12027\tEnterobacteria phage SP\n+12029\tEnterobacteria phage 
TH1\n+12030\tEnterobacteria phage TW19\n+12031\tEnterobacteria phage TW28\n+12032\tEnterobacteria phage VK\n+12034\tEnterobacteria phage fr1\n+12333\tunclassified bacterial viruses\n+12335\tNocardia phage NJL\n+12336\tClostridium phage c-st\n+12340\tEnterobacteria phage 933J\n+12342\tXanthomonas phage Cf16\n+12344\tStreptococcus phag'..b'coccus phage CHPC964\n+2675257\tLactococcus phage CHPC965\n+2675258\tLactococcus phage CHPC966\n+2675259\tLactococcus phage CHPC967\n+2675260\tLactococcus phage CHPC972\n+2675261\tLactococcus phage CHPC973\n+2675262\tLactococcus phage CHPC974\n+2675263\tLactococcus phage PC_B1\n+2675264\tLactococcus phage PC_B3\n+2675265\tLactococcus phage PC_S1\n+2675441\tSiphoviridae sp. ctdc_1\n+2675442\tPodoviridae sp. ctbj_2\n+2675443\tPodoviridae sp. ctdb7\n+2675444\tPodoviridae sp. ctcf755\n+2675445\tPodoviridae sp. ctbd591\n+2675446\tPodoviridae sp. ctbh1\n+2675447\tPodoviridae sp. ctfa10\n+2675448\tPodoviridae sp. ctda_1\n+2675449\tPodoviridae sp. ctdc61\n+2675450\tPodoviridae sp. 
ctjc_2\n+2675824\tKlebsiella phage JD902\n+2675825\tKlebsiella phage JD905\n+2675826\tKlebsiella phage JD907\n+2675827\tKlebsiella phage JD908\n+2675828\tKlebsiella phage JD910\n+2678559\tunclassified Kelleziovirus\n+2678560\tunclassified Patiencevirus\n+2678563\tKlebsiella phage KP1801\n+2678601\tErwinia phage Hena1\n+2678606\tunclassified Phayoncevirus\n+2678937\tAcinetobacter phage vB_AbaP_B5\n+2679904\tPseudomonas phage F116\n+2681195\tStaphylococcus phage RDs-2019a\n+2681196\tKlebsiella phage P545\n+2681592\tBurkholderia phage phiE52237\n+2681594\tEscherichia phage P2\n+2681595\tEscherichia phage Wphi\n+2681596\tMannheimia phage phi-MhaA1-PHL101\n+2681597\tEscherichia phage RB14\n+2681598\tEscherichia phage T4\n+2681599\tEscherichia phage RB16\n+2681602\tEscherichia phage RB32\n+2681603\tEscherichia phage Mu\n+2681604\tEscherichia phage K1-5\n+2681605\tEscherichia phage vB_EcoS_AKFV33\n+2681606\tEscherichia phage DT57C\n+2681607\tAlphaproteobacteria phage PhiJL001\n+2681608\tStaphylococcus phage vB_SepiS-phiIPLA88\n+2681630\tEscherichia phage 186\n+2681777\tEscherichia phage LYY-2019a\n+2682147\tSerratia phage Moroni-Ayala_S23\n+2682148\tSerratia phage 214_S16\n+2682149\tSerratia phage GingerNinja_S34\n+2682150\tSerratia phage PhooPhighters_S19\n+2682151\tSerratia phage Rovert_S28\n+2682152\tSerratia phage McSteamy_S57\n+2682153\tSerratia phage Jello_S10\n+2682154\tSerratia phage BigDog_S98\n+2682155\tSerratia phage AndrewSwain_S20\n+2682156\tSerratia phage 169-15204L7_S7\n+2682157\tSerratia phage 162-15204L5_S5\n+2682158\tSerratia phage Tlacuache_S77\n+2682770\tEscherichia virus Ec_Makalu_002\n+2682965\tVibrio phage VALG_phi6\n+2683673\tEnterococcus phage vB_EfaS-DELF1\n+2684473\tAlteromonas phage AltPT11-V22\n+2686068\tBacteriophage MR-5\n+2686080\tArthrobacter phage DrYang\n+2686081\tArthrobacter phage LittleTokyo\n+2686082\tArthrobacter phage Giantsbane\n+2686083\tArthrobacter phage Powerpuff\n+2686084\tMycobacterium phage Mcshane\n+2686085\tMycobacterium 
phage Aneem\n+2686086\tMycobacterium phage Gabriela\n+2686087\tMicrobacterium phage Leaf\n+2686088\tMycobacterium phage Phaded\n+2686089\tMycobacterium phage Itos\n+2686090\tMicrobacterium phage Dewdrop\n+2686202\tVibrio phage NF\n+2686203\tClostridium phage phiCDKH01\n+2686204\tKlebsiella phage VLC1\n+2686205\tKlebsiella phage VLC2\n+2686206\tKlebsiella phage VLC3\n+2686207\tKlebsiella phage VLC4\n+2686208\tKlebsiella phage vB_KpnM_15-38_KLPPOU148\n+2686209\tKlebsiella phage vB_KpnS_15-38_KLPPOU149\n+2686211\tMycobacterium phage Ohfah\n+2686212\tMycobacterium phage Yokurt\n+2686281\tMethanobacterium virus PhiF3\n+2686283\tKlebsiella phage vB_Kpn_P545\n+2686285\tPseudomonas phage pf8_ST274-AUS411\n+2686290\tAcinetobacter phage vB_AbaP_Berthold\n+2686308\tAcinetobacter phage vB_AbaP_Apostate\n+2686437\tVibrio phage vB_VchM_Kuja\n+2686455\tPelagibacter phage HTVC023P\n+2686456\tPelagibacter phage HTVC026P\n+2686457\tPelagibacter phage HTVC027P\n+2686458\tPelagibacter phage HTVC103P\n+2686459\tPelagibacter phage HTVC104P\n+2686460\tPelagibacter phage HTVC106P\n+2686461\tPelagibacter phage HTVC111P\n+2686462\tPelagibacter phage HTVC112P\n+2686463\tPelagibacter phage HTVC115P\n+2686464\tPelagibacter phage HTVC202P\n+2690230\tAcinetobacter phage vB_AbaM_PhT2\n+2691098\tunclassified Lokivirus\n+2692157\tSalmonella phage SP1 SHa-2019\n+2692158\tSalmonella phage SP2 SHa-2019\n+2692159\tSalmonella phage SP3 SHa-2019\n+2692160\tSalmonella phage SP4 SHa-2019\n+2692161\tProteus phage PmP SHa-2019\n+2694977\tSalmonella phage 7c\n' |
b |
diff -r 000000000000 -r ebcc87a27f9c cpt_related_genome_prot/cpt-macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_related_genome_prot/cpt-macros.xml Fri Jun 10 08:46:28 2022 +0000 |
[ |
@@ -0,0 +1,115 @@ +<?xml version="1.0"?> +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd $__tool_directory__ && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. 
Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros> |
b |
diff -r 000000000000 -r ebcc87a27f9c cpt_related_genome_prot/macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_related_genome_prot/macros.xml Fri Jun 10 08:46:28 2022 +0000 |
b |
@@ -0,0 +1,85 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="3.8.13">python</requirement> + <requirement type="package" version="1.79">biopython</requirement> + <requirement type="package" version="1.2.2">cpt_gffparser</requirement> + <yield/> + </requirements> + </xml> + <token name="@BLAST_TSV@"> + "$blast_tsv" + </token> + <xml name="blast_tsv"> + <param label="Blast Results" help="TSV/tabular (25 Column)" + name="blast_tsv" type="data" format="tabular" /> + </xml> + + <token name="@BLAST_XML@"> + "$blast_xml" + </token> + <xml name="blast_xml"> + <param label="Blast Results" help="XML format" + name="blast_xml" type="data" format="blastxml" /> + </xml> + <xml name="gff3_with_fasta"> + <param label="Genome Sequences" name="fasta" type="data" format="fasta" /> + <param label="Genome Annotations" name="gff3" type="data" format="gff3" /> + </xml> + <xml name="genome_selector"> + <conditional name="reference_genome"> + <param name="reference_genome_source" type="select" label="Reference Genome"> + <option value="history" selected="True">From History</option> + <option value="cached">Locally Cached</option> + </param> + <when value="cached"> + <param name="fasta_indexes" type="select" label="Source FASTA Sequence"> + <options from_data_table="all_fasta"/> + </param> + </when> + <when value="history"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </when> + </conditional> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input" /> + <expand macro="genome_selector" /> + </xml> + <token name="@INPUT_GFF@"> + "$gff3_data" + </token> + <token name="@INPUT_FASTA@"> +#if str($reference_genome.reference_genome_source) == 'cached': + "${reference_genome.fasta_indexes.fields.path}" +#else if str($reference_genome.reference_genome_source) 
== 'history': + genomeref.fa +#end if + </token> + <token name="@GENOME_SELECTOR_PRE@"> +#if $reference_genome.reference_genome_source == 'history': + ln -s $reference_genome.genome_fasta genomeref.fa; +#end if + </token> + <token name="@GENOME_SELECTOR@"> +#if str($reference_genome.reference_genome_source) == 'cached': + "${reference_genome.fasta_indexes.fields.path}" +#else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa +#end if + </token> + <xml name="input/fasta"> + <param label="Fasta file" name="sequences" type="data" format="fasta"/> + </xml> + + <token name="@SEQUENCE@"> + "$sequences" + </token> + <xml name="input/fasta/protein"> + <param label="Protein fasta file" name="sequences" type="data" format="fasta"/> + </xml> +</macros> |
b |
diff -r 000000000000 -r ebcc87a27f9c cpt_related_genome_prot/prot_relate.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_related_genome_prot/prot_relate.xml Fri Jun 10 08:46:28 2022 +0000 |
b |
@@ -0,0 +1,68 @@ +<?xml version="1.0"?> +<tool id="edu.tamu.cpt.blast.relatedness.prot" name="Related Genomes" version="19.1.0.0"> + <description>based on protein blast results</description> + <macros> + <import>macros.xml</import> + <import>cpt-macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="aggressive"> +$__tool_directory__/relatedness_prot.py +${blastIn.blast} +$__tool_directory__/TaxID_List.txt +--hits $hits +$filter +--protein +#if $blastIn.blastType == "XML": +--xmlMode +#end if +--taxFilter "$taxFilter" +> $accession_list +</command> + <inputs> + <conditional name="blastIn"> + <param name="blastType" type="select" label="Blastn Input Type"> + <option value="XML" selected="true">Blast XML</option> + <option value="TSV">Blast Tabular</option> + </param> + <when value="XML"> + <param label="Blastn Results (Blast XML)" name="blast" type="data" format="blastxml"/> + </when> + <when value="TSV"> + <param label="Blastn Results" name="blast" type="data" format="tsv,tabular"/> + </when> + </conditional> + <param label = 'Number of results to return' name="hits" type="integer" size="15" value="5"/> + <param name="filter" type="boolean" truevalue="" falsevalue="--noFilter" checked="true" label="Automatically filter by phage Taxonomy IDs"/> + <param name="taxFilter" type="text" label="TaxIDs to filter out of results (Space separated)"/> + </inputs> + <outputs> + <data format="tabular" name="accession_list" label="Top BlastP Hits" /> + </outputs> + <tests> + <test> + <conditional name="blastIn"> + <param name="blastType" value="TSV"/> + <param name="blast" value="prot_relate_in.tab"/> + </conditional> + <param name="hits" value="10"/> + <output name="accession_list" file="prot_relate_out.tab" lines_diff="4"/> + </test> + </tests> + <help> +**What it does** + +This tool filters a set of BLASTp results and return the top +related genomes based on number of protein-protein matches. 
The +default mode is to only consider phage hits (based on TaxID), +but this can be toggled off. + +The input must be a tabular file from a BLASTp run with the qseqid, +sallseqid, salltitles, sallacc, and staxids fields selected. + +The output will be a tabular file with the top hits returned, +where each row specifies the TaxID, organism name, and number of +similar unique protein hits shared. +</help> + <expand macro="citations-2020" /> +</tool> |
b |
diff -r 000000000000 -r ebcc87a27f9c cpt_related_genome_prot/relatedness_prot.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_related_genome_prot/relatedness_prot.py Fri Jun 10 08:46:28 2022 +0000 |
[ |
b'@@ -0,0 +1,254 @@\n+#!/usr/bin/env python\n+import sys\n+import argparse\n+import json\n+import logging\n+from Bio.Blast import NCBIXML\n+\n+logging.basicConfig(level=logging.DEBUG)\n+log = logging.getLogger()\n+\n+def parse_blast(blast, isXML = False):\n+ res = []\n+ finalRes = []\n+ if isXML:\n+ for iter_num, blast_record in enumerate(NCBIXML.parse(blast), 1):\n+ for alignment in blast_record.alignments:\n+ tempID = alignment.hit_id[alignment.hit_id.find("gb|") + 3:]\n+ tempID = tempID[:tempID.find("|")]\n+ tempDesc = alignment.title\n+ while tempDesc.find("|") >= 0:\n+ tempDesc = tempDesc[tempDesc.find("|") + 1:]\n+ tempDesc = tempDesc.strip()\n+ tempID = tempID.strip()\n+ #for hsp in alignment.hsps:\n+ line = [str(blast_record.query)[:str(blast_record.query).find("[")].strip()]\n+ line.append(alignment.hit_id)\n+ line.append(tempDesc)\n+ line.append(alignment.accession)\n+ res.append(line)\n+ blast.seek(0)\n+ resInd = -1\n+ taxLine = blast.readline()\n+ while taxLine: \n+ if "<Hit>" in taxLine:\n+ resInd += 1\n+ taxSlice = ""\n+ elif "<taxid>" in taxLine:\n+ taxSlice = taxLine[taxLine.find("<taxid>") + 7:taxLine.find("</taxid>")]\n+ finalRes.append(res[resInd])\n+ finalRes[-1].append(taxSlice)\n+ taxLine = blast.readline()\n+ return finalRes\n+ else:\n+ for line in blast:\n+ finalRes.append(line.strip("\\n").split("\\t"))\n+ return finalRes\n+\n+def with_dice(blast):\n+ for data in blast:\n+ dice = 2 * int(data[14]) / (float(data[22]) + float(data[23]))\n+ yield data + [dice]\n+\n+\n+def filter_dice(blast, threshold=0.5):\n+ for data in blast:\n+ if data[-1] > threshold:\n+ yield data\n+\n+\n+def split_identifiers_nucl(_, ident):\n+ if "<>" in ident:\n+ idents = ident.split("<>")\n+ else:\n+ idents = [ident]\n+ return idents\n+\n+\n+def split_identifiers_prot(_, ident):\n+ if "<>" in ident:\n+ idents = ident.split("<>")\n+ else:\n+ idents = [ident]\n+ return [\n+ x[x.index("[") + 1 : x.rindex("]")]\n+ for x in idents\n+ # MULTISPECIES: 
recombination-associated protein RdgC [Enterobacteriaceae]<>RecName: Full=Recombination-associated protein RdgC<>putative exonuclease, RdgC [Enterobacter sp. 638]\n+ if "[" in x and "]" in x\n+ ]\n+\n+\n+def split_identifiers_phage(par, ident):\n+ par = par.replace("lcl|", "")\n+ par = par[0 : par.index("_prot_")]\n+ return [par]\n+\n+\n+def important_only(blast, split_identifiers):\n+ for data in blast:\n+ yield [\n+ data[0], # 01 Query Seq-id (ID of your sequence)\n+ data[1], # 13 All subject Seq-id(s), separated by a \';\'\n+ split_identifiers(\n+ data[1], data[2]\n+ ), # 25 All subject title(s), separated by a \'<>\'\n+ data[3].split(";"), # Extra: All Subject Accessions\n+ data[4].split(";"), # Extra: All TaxIDs\n+ ]\n+\n+\n+def deform_scores(blast):\n+ for data in blast:\n+ for org in data[2]:\n+ yield [data[0], data[1], org, data[3], data[4]]\n+\n+\n+def expand_fields(blast):\n+ for data in blast:\n+ for x in range(0, len(data[4])):\n+ yield [data[0], data[1], data[2][x], data[3], int(data[4][x])]\n+\n+def expand_taxIDs(blast, taxFilter):\n+ for data in blast:\n+ # if(len(data[4]) > 0):\n+ # print(data[0])\n+ for ID in data[4]:\n+ if ID != "N/A":\n+ filterOut = False\n+ for tax in taxFilter:\n+ if str(ID).strip() == tax:\n+ filterOut = True\n+ if not filterOut:\n+ yield [data[0], data[1], data[2], data[3], int(ID)]\n+\n+\n+def expand_titles(blast):\n+ for data in blast:\n+ for title in data[2]:\n+ yield [data[0], data[1], title, data[3], dat'..b'iNames):\n+ for data in blast:\n+ for x in range(0, len(phageTaxLookup)):\n+ if (data[4]) == phageTaxLookup[x]:\n+ yield [data[0], data[1], phageSciNames[x], data[3], data[4]]\n+ break\n+\n+\n+def remove_dupes(data):\n+ has_seen = {}\n+ for row in data:\n+ # qseqid, sseqid\n+ key = (row[0], row[4])\n+ # If we\'ve seen the key before, we can exit\n+ if key in has_seen:\n+ continue\n+\n+ # Otherwise, continue on\n+ has_seen[key] = True\n+ # Pretty simple\n+ yield row\n+\n+def scoreMap(blast):\n+ c = {}\n+ m = {}\n+ for 
(qseq, subID, subTitle, access, ID) in blast:\n+ if (str(subTitle), ID) not in c:\n+ m[(str(subTitle), ID)] = access\n+ c[(str(subTitle), ID)] = 0\n+\n+ c[(str(subTitle), ID)] += 1\n+ return c, m\n+\n+\n+if __name__ == "__main__":\n+ parser = argparse.ArgumentParser(description="Top related genomes")\n+ parser.add_argument(\n+ "blast", type=argparse.FileType("r"), help="Blast 25 Column Results"\n+ )\n+ parser.add_argument("phagedb", type=argparse.FileType("r"))\n+ parser.add_argument("--access", action="store_true")\n+ parser.add_argument("--protein", action="store_true")\n+ parser.add_argument("--canonical", action="store_true")\n+ parser.add_argument("--noFilter", action="store_true")\n+ #parser.add_argument("--title", action="store_true") # Add when ready to update XML after semester\n+ parser.add_argument("--hits", type=int, default=5)\n+ parser.add_argument("--xmlMode", action="store_true") \n+ parser.add_argument("--taxFilter", type=str) \n+\n+ args = parser.parse_args()\n+\n+ phageDb = args.phagedb\n+ phageTaxLookup = []\n+ sciName = []\n+ line = phageDb.readline()\n+ \n+ taxList = []\n+ if args.taxFilter and args.taxFilter != "" :\n+ args.taxFilter = args.taxFilter.split(" ")\n+ for ind in args.taxFilter:\n+ taxList.append(ind.strip())\n+\n+ while line:\n+ line = line.split("\\t")\n+ phageTaxLookup.append(int(line[0]))\n+ line[1] = line[1].strip()\n+ if (line[1] == ""):\n+ line[1] = "Novel Genome"\n+ sciName.append(line[1])\n+ line = phageDb.readline()\n+\n+ if args.protein:\n+ splitId = split_identifiers_prot\n+ # phageNameLookup = {k[\'source\'].rstrip(\'.\'): k[\'id\'] for k in phageDb}\n+ elif args.canonical:\n+ splitId = split_identifiers_phage\n+ # phageNameLookup = {k[\'source\'].rstrip(\'.\'): k[\'id\'] for k in phageDb}\n+ else:\n+ splitId = split_identifiers_nucl\n+ # phageNameLookup = {k[\'desc\'].rstrip(\'.\'): k[\'id\'] for k in phageDb}\n+\n+ data = parse_blast(args.blast, args.xmlMode)\n+ # data = with_dice(data)\n+ # data = filter_dice(data, 
threshold=0.0)\n+ data = important_only(data, splitId)\n+ \n+ data = expand_taxIDs(data, taxList)\n+ data = remove_dupes(data)\n+ if not args.noFilter:\n+ data = filter_phage(data, phageTaxLookup, sciName)\n+ listify = []\n+ for x in data:\n+ listify.append(x)\n+ #listify = greatest_taxID(listify)\n+ \n+ count_label = "Similar Unique Proteins"\n+ \n+ counts, accessions = scoreMap(listify)\n+ \n+ nameRec = listify[0][0]\n+ sys.stdout.write(\n+ "Top %d matches for BLASTp results of %s\\n"\n+ % (args.hits, nameRec)\n+ )\n+ header = "# TaxID\\t"\n+ #if args.title:\n+ header += "Name\\t"\n+ if args.access:\n+ header += "Accessions\\t"\n+ header += "Similar Unique Proteins\\n"\n+ sys.stdout.write(header)\n+\n+ for idx, ((name, ID), num) in enumerate(\n+ sorted(counts.items(), key=lambda item: -item[1])\n+ ):\n+ if idx > args.hits - 1:\n+ break\n+ line = str(ID) + "\\t"\n+ #if args.title:\n+ line += str(name) + "\\t"\n+ if args.access:\n+ line += str(accessions[(name, ID)][0]) + "\\t"\n+ line += str(num) + "\\n" \n+ sys.stdout.write(line)\n' |
b |
diff -r 000000000000 -r ebcc87a27f9c cpt_related_genome_prot/test-data/prot_relate_in.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_related_genome_prot/test-data/prot_relate_in.tab Fri Jun 10 08:46:28 2022 +0000 |
[ |
b'@@ -0,0 +1,5076 @@\n+6797807b-1962-4304-86e6-e441d807be09\tgb|QEG09153.1|\thypothetical protein CPT_Phriendly_001 [Vibrio phage Phriendly]\tQEG09153\t2596675\n+991e56f2-1ec9-45c8-ad91-934b6bad6d5e\tgb|QEG09154.1|\thypothetical protein CPT_Phriendly_002 [Vibrio phage Phriendly]\tQEG09154\t2596675\n+991e56f2-1ec9-45c8-ad91-934b6bad6d5e\tref|YP_004782526.1|;gb|AEK82066.1|\thypothetical protein SaPh711_gp151 [Salmonella phage 7-11]<>hypothetical protein [Salmonella phage 7-11]\tYP_004782526;AEK82066\t1054968\n+991e56f2-1ec9-45c8-ad91-934b6bad6d5e\tgb|AVJ48132.1|\thypothetical protein [Salmonella phage SE131]\tAVJ48132\t2081631\n+cf89c239-73a0-4277-94b0-1ce5c60d83f7\tgb|QEG09155.1|\thypothetical protein CPT_Phriendly_003 [Vibrio phage Phriendly]\tQEG09155\t2596675\n+976b0435-4212-43e1-91f3-f4c5e5b73692\tgb|QEG09156.1|\thypothetical protein CPT_Phriendly_004 [Vibrio phage Phriendly]\tQEG09156\t2596675\n+976b0435-4212-43e1-91f3-f4c5e5b73692\tgb|AII27516.1|\thypothetical protein Q21_gp38 [Vibrio phage VPp1]\tAII27516\t1524880\n+976b0435-4212-43e1-91f3-f4c5e5b73692\tgb|ASR73817.1|\thypothetical protein [Vibrio phage vB_ValP_IME271]\tASR73817\t2024229\n+976b0435-4212-43e1-91f3-f4c5e5b73692\tgb|QBJ00540.1|\thypothetical protein [Vibrio phage vB_VpP_BA6]\tQBJ00540\t2530170\n+112b8a65-1847-4dd5-a893-cf973fb0d1c0\tgb|QEG09157.1|\thypothetical protein CPT_Phriendly_005 [Vibrio phage Phriendly]\tQEG09157\t2596675\n+112b8a65-1847-4dd5-a893-cf973fb0d1c0\tgb|QBJ00542.1|\thypothetical protein [Vibrio phage vB_VpP_BA6]\tQBJ00542\t2530170\n+112b8a65-1847-4dd5-a893-cf973fb0d1c0\tgb|AQS78999.1|;gb|ASR73818.1|\thypothetical protein Q21_gp39 [Vibrio phage VPp1]<>hypothetical protein [Vibrio phage vB_ValP_IME271]\tAQS78999;ASR73818\t1524880;2024229\n+44d258bc-8ea4-409a-b999-7d4e2357a462\tgb|QEG09158.1|\thypothetical protein CPT_Phriendly_006 [Vibrio phage Phriendly]\tQEG09158\t2596675\n+44d258bc-8ea4-409a-b999-7d4e2357a462\tgb|QBJ00541.1|\thypothetical protein [Vibrio phage 
vB_VpP_BA6]\tQBJ00541\t2530170\n+44d258bc-8ea4-409a-b999-7d4e2357a462\tgb|ASR73819.1|\thypothetical protein [Vibrio phage vB_ValP_IME271]\tASR73819\t2024229\n+44d258bc-8ea4-409a-b999-7d4e2357a462\tgb|AII27517.1|\thypothetical protein Q21_gp40 [Vibrio phage VPp1]\tAII27517\t1524880\n+44d258bc-8ea4-409a-b999-7d4e2357a462\tdbj|BBA65576.1|\thypothetical protein [Xanthomonas phage XacN1]\tBBA65576\t2042251\n+8751a168-e955-403c-9c8c-9aba02ae882f\tgb|QEG09159.1|\thypothetical protein CPT_Phriendly_007 [Vibrio phage Phriendly]\tQEG09159\t2596675\n+d60261ef-446f-4e95-84a9-777a752d5315\tgb|QEG09160.1|\thypothetical protein CPT_Phriendly_008 [Vibrio phage Phriendly]\tQEG09160\t2596675\n+d60261ef-446f-4e95-84a9-777a752d5315\tgb|ASR73822.1|\thypothetical protein [Vibrio phage vB_ValP_IME271]\tASR73822\t2024229\n+d60261ef-446f-4e95-84a9-777a752d5315\tgb|QBJ00544.1|\thypothetical protein [Vibrio phage vB_VpP_BA6]\tQBJ00544\t2530170\n+70001925-4745-48b9-8867-42bc2d2f0255\tgb|QEG09161.1|\thypothetical protein CPT_Phriendly_009 [Vibrio phage Phriendly]\tQEG09161\t2596675\n+70001925-4745-48b9-8867-42bc2d2f0255\tgb|ASR73824.1|\thypothetical protein [Vibrio phage vB_ValP_IME271]\tASR73824\t2024229\n+70001925-4745-48b9-8867-42bc2d2f0255\tgb|QBJ00546.1|\thypothetical protein [Vibrio phage vB_VpP_BA6]\tQBJ00546\t2530170\n+70001925-4745-48b9-8867-42bc2d2f0255\tgb|AII27521.1|\thypothetical protein Q21_gp44 [Vibrio phage VPp1]\tAII27521\t1524880\n+7a6fbac4-f1b0-485c-81b2-6f38cc44f6cf\tgb|QEG09161.1|\thypothetical protein CPT_Phriendly_009 [Vibrio phage Phriendly]\tQEG09161\t2596675\n+7a6fbac4-f1b0-485c-81b2-6f38cc44f6cf\tgb|ASR73824.1|\thypothetical protein [Vibrio phage vB_ValP_IME271]\tASR73824\t2024229\n+7a6fbac4-f1b0-485c-81b2-6f38cc44f6cf\tgb|QBJ00546.1|\thypothetical protein [Vibrio phage vB_VpP_BA6]\tQBJ00546\t2530170\n+7a6fbac4-f1b0-485c-81b2-6f38cc44f6cf\tgb|AII27521.1|\thypothetical protein Q21_gp44 [Vibrio phage 
VPp1]\tAII27521\t1524880\n+af843952-5caf-4354-b9d6-c9b81ebdb425\tgb|QEG09162.1|\thypothetical protein CPT_Phriendly_010 [Vibrio phage Phriendly]\tQEG09162\t2596675\n+af843952-5caf-4354-b9d6-c9b81ebdb425\tgb|QBJ00547.1|\thypothetical protein'..b'1|;gb|ASV41882.1|;gb|AXQ71048.1|\tPhoH family protein [Vibrio phage ICP1]<>hypothetical protein TUST1-191_01030 [Vibrio phage ICP1_2006_D]<>hypothetical protein TUST1-182_01030 [Vibrio phage ICP1_2006_C]<>hypothetical protein TUST1-159_01015 [Vibrio phage ICP1_2006_B]<>hypothetical protein TUST1-17_01015 [Vibrio phage ICP1_2006_A]<>hypothetical protein TUST1-15_01035 [Vibrio phage ICP1_2005_A]<>hypothetical protein TUST1-2_01045 [Vibrio phage ICP1_2001_A]<>hypothetical protein TUST1-10_01020 [Vibrio phage ICP1_2004_A]<>phosphate starvation-inducible protein [Vibrio phage JSF4]<>putative ATPase [Vibrio phage JSF6]<>putative ATPase [Vibrio phage JSF1]<>putative ATPase [Vibrio phage JSF2]<>PhoH family protein [Vibrio phage ICP1_2012_A]\tYP_004251151;ADX88252;ADX88479;ADX88703;ADX88929;ADX89159;ADX89389;ADX89616;APD17934;ASV41524;ASV41742;ASV41882;AXQ71048\t979525;979526;979527;979528;979529;979530;979531;979532;1916110;1983592;1983601;1983615;2302419\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tgb|ASV42186.1|;gb|ASV42462.1|;gb|ASV42712.1|;gb|AXY82300.1|;gb|AXY82518.1|;gb|QFR59267.1|\tputative ATPase [Vibrio phage JSF13]<>putative ATPase [Vibrio phage JSF14]<>putative ATPase [Vibrio phage JSF17]<>PhoH family protein [Vibrio phage ICP1_2011_A]<>PhoH family protein [Vibrio phage ICP1_2011_B]<>PhoH family protein [Vibrio phage ICP1_2017_F_Mathbaria]\tASV42186;ASV42462;ASV42712;AXY82300;AXY82518;QFR59267\t1296592;1983596;1983597;1983599;2302418;2653638\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tgb|QDP42825.1|\tputative PhoH-like protein [Bacillus phage vB_BmeM-Goe8]\tQDP42825\t2593638\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tgb|AUR93344.1|\tPhoH-like protein [Vibrio phage 
1.187.O._10N.286.49.F1]\tAUR93344\t1881434\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tgb|AUR94151.1|\tPhoH-like protein [Vibrio phage 1.193.O._10N.286.52.C6]\tAUR94151\t1881436\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tref|YP_009225560.1|;gb|AKO61027.1|\tphosphate starvation-inducible protein PhoH [Pseudoalteromonas phage H101]<>phosphate starvation-inducible protein PhoH [Pseudoalteromonas phage H101]\tYP_009225560;AKO61027\t1654919\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tref|YP_009302685.1|;gb|AMM45096.1|\tPhoH [Bacillus phage SP-15]<>PhoH [Bacillus phage SP-15]\tYP_009302685;AMM45096\t1792032\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tgb|AUR91592.1|\thedgehog/hint domain protein [Vibrio phage 1.161.O._10N.261.48.C5]\tAUR91592\t1881340\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tgb|QAU04516.1|\tphosphate starvation protein [Vibrio phage ValB1MD]\tQAU04516\t2508852\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tgb|ASJ79269.1|\tphosphate starvation-inducible protein [Curvibacter phage P26059A]\tASJ79269\t1983783\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tgb|AWD92982.1|\tputative regulatory protein [Bacillus phage vB_BceM-HSE3]\tAWD92982\t2170705\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tgb|AEQ27577.1|\tphosphate-starvation inducible protein, partial [Synechococcus phage SPGM99-10]\tAEQ27577\t498340\n+8ab00e1a-585d-4de1-9e2c-c22807b162dd\tgb|AEQ27574.1|\tphosphate-starvation inducible protein, partial [Synechococcus phage SPGM99-03]\tAEQ27574\t1091299\n+c1689793-9878-4a28-acc3-98e722ed377c\tgb|QEG09222.1|\tunimolecular spanin [Vibrio phage Phriendly]\tQEG09222\t2596675\n+c1689793-9878-4a28-acc3-98e722ed377c\tgb|QBJ00600.1|\thypothetical protein [Vibrio phage vB_VpP_BA6]\tQBJ00600\t2530170\n+c1689793-9878-4a28-acc3-98e722ed377c\tgb|AII27513.2|;gb|ASR73881.1|\thypothetical protein Q21_gp35 [Vibrio phage VPp1]<>hypothetical protein [Vibrio phage vB_ValP_IME271]\tAII27513;ASR73881\t1524880;2024229\n+c1689793-9878-4a28-acc3-98e722ed377c\tgb|AUR96955.1|;gb|AUR98546.1|\tmembrane lipoprotein 
[Vibrio phage 1.235.O._10N.261.52.B2]<>membrane lipoprotein [Vibrio phage 1.253.O._10N.286.45.B12]\tAUR96955;AUR98546\t1881270;1881352\n+c1689793-9878-4a28-acc3-98e722ed377c\tgb|AUR99087.1|\tTMhelix containing protein [Vibrio phage 1.262.O._10N.286.51.A9]\tAUR99087\t1881238\n+694123c9-a467-430d-804f-50414c0489cc\tgb|QEG09223.1|\thypothetical protein CPT_Phriendly_072 [Vibrio phage Phriendly]\tQEG09223\t2596675\n' |
b |
diff -r 000000000000 -r ebcc87a27f9c cpt_related_genome_prot/test-data/prot_relate_out.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt_related_genome_prot/test-data/prot_relate_out.tab Fri Jun 10 08:46:28 2022 +0000 |
b |
@@ -0,0 +1,12 @@ +Top 10 matches for BLASTp results of 6797807b-1962-4304-86e6-e441d807be09 +# TaxID Name Similar Unique Proteins +2596675 Vibrio phage Phriendly 72 +2024229 Vibrio phage vB_ValP_IME271 57 +1524880 Vibrio phage VPp1 56 +2530170 Vibrio phage vB_VpP_BA6 55 +1881238 Vibrio phage 1.262.O._10N.286.51.A9 37 +484895 Pseudomonas virus LUZ24 20 +1407671 uncultured Mediterranean phage uvMED 20 +1548906 Pseudomonas phage vB_PaeP_C2-10_Ab22 19 +2590840 Pseudomonas virus Pa223 19 +1640969 Pseudomonas phage DL54 19 |