changeset 0:3f0d07a10405 draft

Uploaded
author cpt
date Fri, 17 Jun 2022 12:22:15 +0000
parents
children b973bc75693d
files cpt_disruptin_finder/cpt-macros.xml cpt_disruptin_finder/disruptin_finder.py cpt_disruptin_finder/disruptin_finder.xml cpt_disruptin_finder/macros.xml
diffstat 4 files changed, 285 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_disruptin_finder/cpt-macros.xml	Fri Jun 17 12:22:15 2022 +0000
@@ -0,0 +1,115 @@
+<?xml version="1.0"?>
+<macros>
+	<xml name="gff_requirements">
+		<requirements>
+			<requirement type="package" version="2.7">python</requirement>
+			<requirement type="package" version="1.65">biopython</requirement>
+			<requirement type="package" version="2.12.1">requests</requirement>
+			<yield/>
+		</requirements>
+		<version_command>
+		<![CDATA[
+			cd $__tool_directory__ && git rev-parse HEAD
+		]]>
+		</version_command>
+	</xml>
+	<xml name="citation/mijalisrasche">
+		<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+		<citation type="bibtex">@unpublished{galaxyTools,
+		author = {E. Mijalis, H. Rasche},
+		title = {CPT Galaxy Tools},
+		year = {2013-2017},
+		note = {https://github.com/tamu-cpt/galaxy-tools/}
+		}
+		</citation>
+	</xml>
+	<xml name="citations">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation> 
+		<yield/>
+		</citations>
+	</xml>
+    	<xml name="citations-crr">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Ross},
+				title = {CPT Galaxy Tools},
+				year = {2020-},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+		<yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {E. Mijalis, H. Rasche},
+				title = {CPT Galaxy Tools},
+				year = {2013-2017},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-2020-AJC-solo">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+                        <citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {A. Criscione},
+				title = {CPT Galaxy Tools},
+				year = {2019-2021},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+                        </citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="citations-clm">
+		<citations>
+			<citation type="doi">10.1371/journal.pcbi.1008214</citation>
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+		</citations>
+	</xml>
+        <xml name="sl-citations-clm">
+			<citation type="bibtex">
+			@unpublished{galaxyTools,
+				author = {C. Maughmer},
+				title = {CPT Galaxy Tools},
+				year = {2017-2020},
+				note = {https://github.com/tamu-cpt/galaxy-tools/}
+			}
+			</citation>
+                        <yield/>
+	</xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_disruptin_finder/disruptin_finder.py	Fri Jun 17 12:22:15 2022 +0000
@@ -0,0 +1,93 @@
+"""
+This program is intended to find gene products that would be acceptable disruptin candidates.
+
+The criteria can be toggled between selecting for proteins with:
+    - net charge at or above a given threshold (default = +4) and length at most a given threshold (default = 100 aa)
+    OR
+    - ratio of number of charged residues to length of the sequence at or above a given threshold (default = 0.25
+    residue/aa) and length at most a given threshold (default = 100 aa)
+    OR
+    - net charge at or above a given threshold (default = +4), ratio of number of charged residues to length of the
+    sequence at or above a given threshold (default = 0.25 residue/aa), and length at most a given threshold (default = 100 aa)
+
+Net charge of a sequence is calculated so that for every R or K residue the net charge increases by one, and for every
+D or E residue the net charge decreases by one. The ratio of charged residues to length is calculated in a similar manner.
+The residues R, K, D, and E each increase the number of charged residues by one, and total for the sequence is then
+divided by the length to get the ratio.
+
+Input a multi fasta file with all of the predicted protein sequences from the genome as well as a threshold
+sequence length, net charge, and charge residue to length ratio. The program outputs another fasta file.
+The output fasta file includes records for all the sequences meeting the size and charge criteria.
+
+"""
+
+from Bio import SeqIO
+import argparse
+import sys
+
+
+def disruptin_finder(
+    fasta_file, thresh_size, thresh_net_charge, thresh_charge_ratio, selection_criteria
+):
+    # Iterable variables
+    net_charge = 0
+    charge_res = 0
+
+    # Create record variable to store record information
+    total_record = []
+
+    # Parse the .fasta file and get the sequence
+    for rec in SeqIO.parse(fasta_file, "fasta"):
+        sequence = str(rec.seq)
+
+        if len(sequence) <= thresh_size:
+            for aa in sequence:
+                # For R and K residues a positive charge is given
+                if aa in "RK":
+                    net_charge += 1
+                    charge_res += 1
+                # For D and E residues a negative charge is given
+                elif aa in "DE":
+                    net_charge -= 1
+                    charge_res += 1
+
+            # Charge (total charged residues) to size ratio is calculated
+            Length = len(sequence)
+            charge_ratio = float(charge_res) / float(Length)
+
+            # Based on the user-specified selection criteria a list of records is compiled
+            if selection_criteria == "net":
+                if net_charge >= thresh_net_charge:
+                    total_record = total_record + [rec]
+            elif selection_criteria == "ratio":
+                if charge_ratio >= thresh_charge_ratio:
+                    total_record = total_record + [rec]
+            elif selection_criteria == "both":
+                if (
+                    charge_ratio >= thresh_charge_ratio
+                    and net_charge >= thresh_net_charge
+                ):
+                    total_record = total_record + [rec]
+
+            # Reset the iterable variables
+            net_charge = 0
+            charge_res = 0
+
+    # The total list of records is returned by the function
+    yield total_record
+
+
+if __name__ == "__main__":
+    # Grab all of the filters from our plugin loader
+    parser = argparse.ArgumentParser(description="Disruptin Finder")
+    parser.add_argument(
+        "fasta_file", type=argparse.FileType("r"), help="Multi-FASTA Input"
+    )
+    parser.add_argument("--thresh_net_charge", type=int, default=4)
+    parser.add_argument("--thresh_size", type=int, default=100)
+    parser.add_argument("--thresh_charge_ratio", type=float, default=0.25)
+    parser.add_argument("--selection_criteria", action="store")
+    args = parser.parse_args()
+
+    for seq in disruptin_finder(**vars(args)):
+        SeqIO.write(seq, sys.stdout, "fasta")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_disruptin_finder/disruptin_finder.xml	Fri Jun 17 12:22:15 2022 +0000
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<tool id="edu.tamu.cpt2.phage.disruptin_finder" name="Disruptin Finder" version="1.1">
+    <description>finds proteins with size and charge criteria</description>
+    <macros>
+		<import>macros.xml</import>
+		<import>cpt-macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <!-- Path and select values are single-quoted so the shell does not word-split or expand them. -->
+    <command detect_errors="aggressive"><![CDATA[
+python '$__tool_directory__/disruptin_finder.py'
+'$fasta_file'
+--thresh_net_charge $thresh_net_charge
+--thresh_size $thresh_size
+--thresh_charge_ratio $thresh_charge_ratio
+--selection_criteria '$selection_criteria'
+> '$output']]></command>
+    <inputs>
+        <param label="Fasta" name="fasta_file" type="data" format="fasta" />
+	<param label="Minimum Net Charge" name="thresh_net_charge" type="integer" value="4" />
+        <param label="Maximum Length" name="thresh_size" type="integer" value="100" />
+	<param label="Minimum Charge to Length Ratio" name="thresh_charge_ratio" type="float" value="0.25" />
+		
+	<param type="select" label="Type of selection criteria" name="selection_criteria">
+		<option value="net">Net charge</option>
+		<option value="ratio">Ratio of charged residues to sequence length</option>
+		<option value="both" selected="true">Both net charge and ratio</option>
+	</param>
+
+    </inputs>
+    <outputs>
+		<data format="fasta" name="output"/>
+    </outputs>
+    <help><![CDATA[
+**What it does**
+This program finds protein sequences based on given selection criteria: net charge, sequence length, 
+and/or number of charged residues per amino acid. Inputs include a multi fasta file of protein sequences,
+thresholds for size, charge, and charge-to-size ratio criteria.
+
+This tool returns the selected sequences in a fasta format.
+
+        ]]></help>
+    <citations>
+      <citation type="doi">10.1371/journal.pcbi.1008214</citation>
+      <citation type="bibtex">
+      @unpublished{galaxyTools, 
+          author = {A. Holt},
+          title = {CPT Galaxy Tools},
+          year = {2020},
+          note = {https://github.com/tamu-cpt/galaxy-tools/}
+      }
+      </citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cpt_disruptin_finder/macros.xml	Fri Jun 17 12:22:15 2022 +0000
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<macros>
+  <xml name="requirements">
+    <requirements>
+		<requirement type="package" version="3.8.13">python</requirement>
+		<requirement type="package" version="1.79">biopython</requirement>
+		<requirement type="package" version="1.2.2">cpt_gffparser</requirement>  
+		<yield/>
+    </requirements>
+  </xml>
+  <xml name="genome_selector">
+	    <param name="genome_fasta" type="data" format="fasta" label="Source FASTA Sequence"/>
+  </xml>
+  <xml name="gff3_input">
+    <param label="GFF3 Annotations" name="gff3_data" type="data" format="gff3"/>
+  </xml>
+  <token name="@GENOME_SELECTOR_PRE@">
+		ln -s $genome_fasta genomeref.fa;
+	</token>
+	<token name="@GENOME_SELECTOR@">
+		genomeref.fa
+	</token>
+</macros>