diff disruptin_finder.py @ 1:b973bc75693d draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:40:49 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/disruptin_finder.py	Mon Jun 05 02:40:49 2023 +0000
@@ -0,0 +1,93 @@
+"""
+This program is intended to find gene products that would be acceptable disruptin candidates.
+
+The criteria can be toggled between selecting for proteins with:
+    - net charge above a give threshold (default = +4) and length less than given threshold (default = 100 aa)
+    OR
+    - ratio of number of charged residues to length of the sequence above a given threshold (default = 0.25 residue/aa)
+    and length less than given threshold (default = 100 aa)
+    OR
+    - net charge above a give threshold (default = +4), ratio of number of charged residues to length of the sequence
+    above a given threshold (default = 0.25 residue/aa), and length less than given threshold (default = 100 aa)
+
+Net charge of a sequence is calculated so that for every R or K residue the net charge increases by one, and for every
+D or E residue the net charge decreases by one. The ratio of charged residues to length is calculated in a similar manner.
+The residues R, K, D, and E each increase the number of charged residues by one, and total for the sequence is then
+divided by the length to get the ratio.
+
+Input a multi fasta file with all of the predicted protein sequences from the genome as well as a threshold
+sequence length, net charge, and charge residue to length ratio. The program outputs another fasta file.
+The output fasta file includes records for all the sequences meeting the size and charge criteria.
+
+"""
+
+from Bio import SeqIO
+import argparse
+import sys
+
+
+def disruptin_finder(
+    fasta_file, thresh_size, thresh_net_charge, thresh_charge_ratio, selection_criteria
+):
+    # Iterable variables
+    net_charge = 0
+    charge_res = 0
+
+    # Create record variable to store record information
+    total_record = []
+
+    # Parse the .fasta file and get the sequence
+    for rec in SeqIO.parse(fasta_file, "fasta"):
+        sequence = str(rec.seq)
+
+        if len(sequence) <= thresh_size:
+            for aa in sequence:
+                # For R and K residues a positive charge is given
+                if aa in "RK":
+                    net_charge += 1
+                    charge_res += 1
+                # For D and E residues a negative charge is given
+                elif aa in "DE":
+                    net_charge -= 1
+                    charge_res += 1
+
+            # Charge (total charged residues) to size ratio is calculated
+            Length = len(sequence)
+            charge_ratio = float(charge_res) / float(Length)
+
+            # Based on the user-specified selection criteria a list of records is compiled
+            if selection_criteria == "net":
+                if net_charge >= thresh_net_charge:
+                    total_record = total_record + [rec]
+            elif selection_criteria == "ratio":
+                if charge_ratio >= thresh_charge_ratio:
+                    total_record = total_record + [rec]
+            elif selection_criteria == "both":
+                if (
+                    charge_ratio >= thresh_charge_ratio
+                    and net_charge >= thresh_net_charge
+                ):
+                    total_record = total_record + [rec]
+
+            # Reset the iterable variables
+            net_charge = 0
+            charge_res = 0
+
+    # The total list of records is returned by the function
+    yield total_record
+
+
+if __name__ == "__main__":
+    # Grab all of the filters from our plugin loader
+    parser = argparse.ArgumentParser(description="Disruptin Finder")
+    parser.add_argument(
+        "fasta_file", type=argparse.FileType("r"), help="Multi-FASTA Input"
+    )
+    parser.add_argument("--thresh_net_charge", type=int, default=4)
+    parser.add_argument("--thresh_size", type=int, default=100)
+    parser.add_argument("--thresh_charge_ratio", type=float, default=0.25)
+    parser.add_argument("--selection_criteria", action="store")
+    args = parser.parse_args()
+
+    for seq in disruptin_finder(**vars(args)):
+        SeqIO.write(seq, sys.stdout, "fasta")