Mercurial > repos > cpt > cpt_disruptin_finder
changeset 1:b973bc75693d draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:40:49 +0000 |
parents | 3f0d07a10405 |
children | f046c58687a2 |
files | cpt-macros.xml cpt_disruptin_finder/cpt-macros.xml cpt_disruptin_finder/disruptin_finder.py cpt_disruptin_finder/disruptin_finder.xml cpt_disruptin_finder/macros.xml disruptin_finder.py disruptin_finder.xml macros.xml |
diffstat | 8 files changed, 333 insertions(+), 285 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cpt-macros.xml Mon Jun 05 02:40:49 2023 +0000 @@ -0,0 +1,115 @@ +<macros> + <xml name="gff_requirements"> + <requirements> + <requirement type="package" version="2.7">python</requirement> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="2.12.1">requests</requirement> + <requirement type="package" version="1.2.2">cpt_gffparser</requirement> + <yield/> + </requirements> + <version_command> + <![CDATA[ + cd '$__tool_directory__' && git rev-parse HEAD + ]]> + </version_command> + </xml> + <xml name="citation/mijalisrasche"> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex">@unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-crr"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Ross}, + title = {CPT Galaxy Tools}, + year = {2020-}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {E. Mijalis, H. Rasche}, + title = {CPT Galaxy Tools}, + year = {2013-2017}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-2020-AJC-solo"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Criscione}, + title = {CPT Galaxy Tools}, + year = {2019-2021}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="citations-clm"> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </citations> + </xml> + <xml name="sl-citations-clm"> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {C. Maughmer}, + title = {CPT Galaxy Tools}, + year = {2017-2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + <yield/> + </xml> +</macros>
--- a/cpt_disruptin_finder/cpt-macros.xml Fri Jun 17 12:22:15 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="gff_requirements"> - <requirements> - <requirement type="package" version="2.7">python</requirement> - <requirement type="package" version="1.65">biopython</requirement> - <requirement type="package" version="2.12.1">requests</requirement> - <yield/> - </requirements> - <version_command> - <![CDATA[ - cd $__tool_directory__ && git rev-parse HEAD - ]]> - </version_command> - </xml> - <xml name="citation/mijalisrasche"> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex">@unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </xml> - <xml name="citations"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-crr"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Ross}, - title = {CPT Galaxy Tools}, - year = {2020-}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {E. Mijalis, H. Rasche}, - title = {CPT Galaxy Tools}, - year = {2013-2017}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-2020-AJC-solo"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Criscione}, - title = {CPT Galaxy Tools}, - year = {2019-2021}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="citations-clm"> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </citations> - </xml> - <xml name="sl-citations-clm"> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {C. Maughmer}, - title = {CPT Galaxy Tools}, - year = {2017-2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - <yield/> - </xml> -</macros>
--- a/cpt_disruptin_finder/disruptin_finder.py Fri Jun 17 12:22:15 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,93 +0,0 @@ -""" -This program is intended to find gene products that would be acceptable disruptin candidates. - -The criteria can be toggled between selecting for proteins with: - - net charge above a give threshold (default = +4) and length less than given threshold (default = 100 aa) - OR - - ratio of number of charged residues to length of the sequence above a given threshold (default = 0.25 residue/aa) - and length less than given threshold (default = 100 aa) - OR - - net charge above a give threshold (default = +4), ratio of number of charged residues to length of the sequence - above a given threshold (default = 0.25 residue/aa), and length less than given threshold (default = 100 aa) - -Net charge of a sequence is calculated so that for every R or K residue the net charge increases by one, and for every -D or E residue the net charge decreases by one. The ratio of charged residues to length is calculated in a similar manner. -The residues R, K, D, and E each increase the number of charged residues by one, and total for the sequence is then -divided by the length to get the ratio. - -Input a multi fasta file with all of the predicted protein sequences from the genome as well as a threshold -sequence length, net charge, and charge residue to length ratio. The program outputs another fasta file. -The output fasta file includes records for all the sequences meeting the size and charge criteria. - -""" - -from Bio import SeqIO -import argparse -import sys - - -def disruptin_finder( - fasta_file, thresh_size, thresh_net_charge, thresh_charge_ratio, selection_criteria -): - # Iterable variables - net_charge = 0 - charge_res = 0 - - # Create record variable to store record information - total_record = [] - - # Parse the .fasta file and get the sequence - for rec in SeqIO.parse(fasta_file, "fasta"): - sequence = str(rec.seq) - - if len(sequence) <= thresh_size: - for aa in sequence: - # For R and K residues a positive charge is given - if aa in "RK": - net_charge += 1 - charge_res += 1 - # For D and E residues a negative charge is given - elif aa in "DE": - net_charge -= 1 - charge_res += 1 - - # Charge (total charged residues) to size ratio is calculated - Length = len(sequence) - charge_ratio = float(charge_res) / float(Length) - - # Based on the user-specified selection criteria a list of records is compiled - if selection_criteria == "net": - if net_charge >= thresh_net_charge: - total_record = total_record + [rec] - elif selection_criteria == "ratio": - if charge_ratio >= thresh_charge_ratio: - total_record = total_record + [rec] - elif selection_criteria == "both": - if ( - charge_ratio >= thresh_charge_ratio - and net_charge >= thresh_net_charge - ): - total_record = total_record + [rec] - - # Reset the iterable variables - net_charge = 0 - charge_res = 0 - - # The total list of records is returned by the function - yield total_record - - -if __name__ == "__main__": - # Grab all of the filters from our plugin loader - parser = argparse.ArgumentParser(description="Disruptin Finder") - parser.add_argument( - "fasta_file", type=argparse.FileType("r"), help="Multi-FASTA Input" - ) - parser.add_argument("--thresh_net_charge", type=int, default=4) - parser.add_argument("--thresh_size", type=int, default=100) - parser.add_argument("--thresh_charge_ratio", type=float, default=0.25) - parser.add_argument("--selection_criteria", action="store") - args = parser.parse_args() - - for seq in disruptin_finder(**vars(args)): - SeqIO.write(seq, sys.stdout, "fasta")
--- a/cpt_disruptin_finder/disruptin_finder.xml Fri Jun 17 12:22:15 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,54 +0,0 @@ -<?xml version="1.1"?> -<tool id="edu.tamu.cpt2.phage.disruptin_finder" name="Disruptin Finder" version="1.1"> - <description>finds proteins with size and charge criteria</description> - <macros> - <import>macros.xml</import> - <import>cpt-macros.xml</import> - </macros> - <expand macro="requirements"/> - <command detect_errors="aggressive"><![CDATA[ -python $__tool_directory__/disruptin_finder.py -$fasta_file ---thresh_net_charge $thresh_net_charge ---thresh_size $thresh_size ---thresh_charge_ratio $thresh_charge_ratio ---selection_criteria $selection_criteria - -> $output]]></command> - <inputs> - <param label="Fasta" name="fasta_file" type="data" format="fasta" /> - <param label="Minimum Net Charge" name="thresh_net_charge" type="integer" value="4" /> - <param label="Maximum Length" name="thresh_size" type="integer" value="100" /> - <param label="Minimum Charge to Length Ratio" name="thresh_charge_ratio" type="float" value="0.25" /> - - <param type="select" label="Type of selection criteria" name="selection_criteria"> - <option value="net">Net charge</option> - <option value="ratio">Ratio of charged residues to sequence length</option> - <option value="both" selected="true">Both net charge and ratio</option> - </param> - - </inputs> - <outputs> - <data format="fasta" name="output"/> - </outputs> - <help><![CDATA[ -**What it does** -This program finds proteins sequences based on given selection criteria: net charge, sequence length, -and/or number of charged residues per amino acid. Inputs include a multi fasta file of protein sequences, -thresholds for size, charge, and charge-to-size ratio criteria. - -This tool returns the selected sequences in a fasta format. - - ]]></help> - <citations> - <citation type="doi">10.1371/journal.pcbi.1008214</citation> - <citation type="bibtex"> - @unpublished{galaxyTools, - author = {A. Holt}, - title = {CPT Galaxy Tools}, - year = {2020}, - note = {https://github.com/tamu-cpt/galaxy-tools/} - } - </citation> - </citations> -</tool>
--- a/cpt_disruptin_finder/macros.xml Fri Jun 17 12:22:15 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -<?xml version="1.0"?> -<macros> - <xml name="requirements"> - <requirements> - <requirement type="package" version="3.8.13">python</requirement> - <requirement type="package" version="1.79">biopython</requirement> - <requirement type="package" version="1.2.2">cpt_gffparser</requirement> - <yield/> - </requirements> - </xml> - <xml name="genome_selector"> - <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> - </xml> - <xml name="gff3_input"> - <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> - </xml> - <token name="@GENOME_SELECTOR_PRE@"> - ln -s $genome_fasta genomeref.fa; - </token> - <token name="@GENOME_SELECTOR@"> - genomeref.fa - </token> -</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/disruptin_finder.py Mon Jun 05 02:40:49 2023 +0000 @@ -0,0 +1,93 @@ +""" +This program is intended to find gene products that would be acceptable disruptin candidates. + +The criteria can be toggled between selecting for proteins with: + - net charge above a give threshold (default = +4) and length less than given threshold (default = 100 aa) + OR + - ratio of number of charged residues to length of the sequence above a given threshold (default = 0.25 residue/aa) + and length less than given threshold (default = 100 aa) + OR + - net charge above a give threshold (default = +4), ratio of number of charged residues to length of the sequence + above a given threshold (default = 0.25 residue/aa), and length less than given threshold (default = 100 aa) + +Net charge of a sequence is calculated so that for every R or K residue the net charge increases by one, and for every +D or E residue the net charge decreases by one. The ratio of charged residues to length is calculated in a similar manner. +The residues R, K, D, and E each increase the number of charged residues by one, and total for the sequence is then +divided by the length to get the ratio. + +Input a multi fasta file with all of the predicted protein sequences from the genome as well as a threshold +sequence length, net charge, and charge residue to length ratio. The program outputs another fasta file. +The output fasta file includes records for all the sequences meeting the size and charge criteria. + +""" + +from Bio import SeqIO +import argparse +import sys + + +def disruptin_finder( + fasta_file, thresh_size, thresh_net_charge, thresh_charge_ratio, selection_criteria +): + # Iterable variables + net_charge = 0 + charge_res = 0 + + # Create record variable to store record information + total_record = [] + + # Parse the .fasta file and get the sequence + for rec in SeqIO.parse(fasta_file, "fasta"): + sequence = str(rec.seq) + + if len(sequence) <= thresh_size: + for aa in sequence: + # For R and K residues a positive charge is given + if aa in "RK": + net_charge += 1 + charge_res += 1 + # For D and E residues a negative charge is given + elif aa in "DE": + net_charge -= 1 + charge_res += 1 + + # Charge (total charged residues) to size ratio is calculated + Length = len(sequence) + charge_ratio = float(charge_res) / float(Length) + + # Based on the user-specified selection criteria a list of records is compiled + if selection_criteria == "net": + if net_charge >= thresh_net_charge: + total_record = total_record + [rec] + elif selection_criteria == "ratio": + if charge_ratio >= thresh_charge_ratio: + total_record = total_record + [rec] + elif selection_criteria == "both": + if ( + charge_ratio >= thresh_charge_ratio + and net_charge >= thresh_net_charge + ): + total_record = total_record + [rec] + + # Reset the iterable variables + net_charge = 0 + charge_res = 0 + + # The total list of records is returned by the function + yield total_record + + +if __name__ == "__main__": + # Grab all of the filters from our plugin loader + parser = argparse.ArgumentParser(description="Disruptin Finder") + parser.add_argument( + "fasta_file", type=argparse.FileType("r"), help="Multi-FASTA Input" + ) + parser.add_argument("--thresh_net_charge", type=int, default=4) + parser.add_argument("--thresh_size", type=int, default=100) + parser.add_argument("--thresh_charge_ratio", type=float, default=0.25) + parser.add_argument("--selection_criteria", action="store") + args = parser.parse_args() + + for seq in disruptin_finder(**vars(args)): + SeqIO.write(seq, sys.stdout, "fasta")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/disruptin_finder.xml Mon Jun 05 02:40:49 2023 +0000 @@ -0,0 +1,51 @@ +<tool id="edu.tamu.cpt2.phage.disruptin_finder" name="Disruptin Finder" version="1.1"> + <description>finds proteins with size and charge criteria</description> + <macros> + <import>macros.xml</import> + <import>cpt-macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="aggressive"><![CDATA[ +python '$__tool_directory__/disruptin_finder.py' +'$fasta_file' +--thresh_net_charge '$thresh_net_charge' +--thresh_size '$thresh_size' +--thresh_charge_ratio '$thresh_charge_ratio' +--selection_criteria '$selection_criteria' + +> '$output']]></command> + <inputs> + <param label="Fasta" name="fasta_file" type="data" format="fasta"/> + <param label="Minimum Net Charge" name="thresh_net_charge" type="integer" value="4"/> + <param label="Maximum Length" name="thresh_size" type="integer" value="100"/> + <param label="Minimum Charge to Length Ratio" name="thresh_charge_ratio" type="float" value="0.25"/> + <param type="select" label="Type of selection criteria" name="selection_criteria"> + <option value="net">Net charge</option> + <option value="ratio">Ratio of charged residues to sequence length</option> + <option value="both" selected="true">Both net charge and ratio</option> + </param> + </inputs> + <outputs> + <data format="fasta" name="output"/> + </outputs> + <help><![CDATA[ +**What it does** +This program finds proteins sequences based on given selection criteria: net charge, sequence length, +and/or number of charged residues per amino acid. Inputs include a multi fasta file of protein sequences, +thresholds for size, charge, and charge-to-size ratio criteria. + +This tool returns the selected sequences in a fasta format. + + ]]></help> + <citations> + <citation type="doi">10.1371/journal.pcbi.1008214</citation> + <citation type="bibtex"> + @unpublished{galaxyTools, + author = {A. Holt}, + title = {CPT Galaxy Tools}, + year = {2020}, + note = {https://github.com/tamu-cpt/galaxy-tools/} + } + </citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Jun 05 02:40:49 2023 +0000 @@ -0,0 +1,74 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package">progressivemauve</requirement> + <!--<requirement type="package" version="2.7">python</requirement>--> + <requirement type="package" version="0.6.4">bcbiogff</requirement> + <yield/> + </requirements> + </xml> + <token name="@WRAPPER_VERSION@">2.4.0</token> + <xml name="citation/progressive_mauve"> + <citation type="doi">10.1371/journal.pone.0011147</citation> + </xml> + <xml name="citation/gepard"> + <citation type="doi">10.1093/bioinformatics/btm039</citation> + </xml> + <token name="@XMFA_INPUT@"> + '$xmfa' + </token> + <xml name="xmfa_input" token_formats="xmfa"> + <param type="data" format="@FORMATS@" name="xmfa" label="XMFA MSA"/> + </xml> + <token name="@XMFA_FA_INPUT@"> + '$sequences' + </token> + <xml name="xmfa_fa_input"> + <param type="data" format="fasta" name="sequences" label="Sequences in alignment" help="These sequences should be the SAME DATASET that was used in the progressiveMauve run. Failing that, they should be provided in the same order as in original progressiveMauve run"/> + </xml> + <xml name="genome_selector"> + <conditional name="reference_genome"> + <param name="reference_genome_source" type="select" label="Reference Genome"> + <option value="history" selected="True">From History</option> + <option value="cached">Locally Cached</option> + </param> + <when value="cached"> + <param name="fasta_indexes" type="select" label="Source FASTA Sequence"> + <options from_data_table="all_fasta"/> + </param> + </when> + <when value="history"> + <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/> + </when> + </conditional> + </xml> + <xml name="gff3_input"> + <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/> + </xml> + <xml name="input/gff3+fasta"> + <expand macro="gff3_input"/> + <expand macro="genome_selector"/> + </xml> + <token name="@INPUT_GFF@"> + '$gff3_data' + </token> + <token name="@INPUT_FASTA@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> + <token name="@GENOME_SELECTOR_PRE@"> + #if $reference_genome.reference_genome_source == 'history': + ln -s '$reference_genome.genome_fasta' genomeref.fa; + #end if + </token> + <token name="@GENOME_SELECTOR@"> + #if str($reference_genome.reference_genome_source) == 'cached': + '${reference_genome.fasta_indexes.fields.path}' + #else if str($reference_genome.reference_genome_source) == 'history': + genomeref.fa + #end if + </token> +</macros>