comparison disruptin_finder.py @ 1:b973bc75693d draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:40:49 +0000
parents
children
comparison
equal deleted inserted replaced
0:3f0d07a10405 1:b973bc75693d
1 """
2 This program is intended to find gene products that would be acceptable disruptin candidates.
3
4 The criteria can be toggled between selecting for proteins with:
5 - net charge above a give threshold (default = +4) and length less than given threshold (default = 100 aa)
6 OR
7 - ratio of number of charged residues to length of the sequence above a given threshold (default = 0.25 residue/aa)
8 and length less than given threshold (default = 100 aa)
9 OR
10 - net charge above a give threshold (default = +4), ratio of number of charged residues to length of the sequence
11 above a given threshold (default = 0.25 residue/aa), and length less than given threshold (default = 100 aa)
12
13 Net charge of a sequence is calculated so that for every R or K residue the net charge increases by one, and for every
14 D or E residue the net charge decreases by one. The ratio of charged residues to length is calculated in a similar manner.
15 The residues R, K, D, and E each increase the number of charged residues by one, and total for the sequence is then
16 divided by the length to get the ratio.
17
18 Input a multi fasta file with all of the predicted protein sequences from the genome as well as a threshold
19 sequence length, net charge, and charge residue to length ratio. The program outputs another fasta file.
20 The output fasta file includes records for all the sequences meeting the size and charge criteria.
21
22 """
23
24 from Bio import SeqIO
25 import argparse
26 import sys
27
28
29 def disruptin_finder(
30 fasta_file, thresh_size, thresh_net_charge, thresh_charge_ratio, selection_criteria
31 ):
32 # Iterable variables
33 net_charge = 0
34 charge_res = 0
35
36 # Create record variable to store record information
37 total_record = []
38
39 # Parse the .fasta file and get the sequence
40 for rec in SeqIO.parse(fasta_file, "fasta"):
41 sequence = str(rec.seq)
42
43 if len(sequence) <= thresh_size:
44 for aa in sequence:
45 # For R and K residues a positive charge is given
46 if aa in "RK":
47 net_charge += 1
48 charge_res += 1
49 # For D and E residues a negative charge is given
50 elif aa in "DE":
51 net_charge -= 1
52 charge_res += 1
53
54 # Charge (total charged residues) to size ratio is calculated
55 Length = len(sequence)
56 charge_ratio = float(charge_res) / float(Length)
57
58 # Based on the user-specified selection criteria a list of records is compiled
59 if selection_criteria == "net":
60 if net_charge >= thresh_net_charge:
61 total_record = total_record + [rec]
62 elif selection_criteria == "ratio":
63 if charge_ratio >= thresh_charge_ratio:
64 total_record = total_record + [rec]
65 elif selection_criteria == "both":
66 if (
67 charge_ratio >= thresh_charge_ratio
68 and net_charge >= thresh_net_charge
69 ):
70 total_record = total_record + [rec]
71
72 # Reset the iterable variables
73 net_charge = 0
74 charge_res = 0
75
76 # The total list of records is returned by the function
77 yield total_record
78
79
80 if __name__ == "__main__":
81 # Grab all of the filters from our plugin loader
82 parser = argparse.ArgumentParser(description="Disruptin Finder")
83 parser.add_argument(
84 "fasta_file", type=argparse.FileType("r"), help="Multi-FASTA Input"
85 )
86 parser.add_argument("--thresh_net_charge", type=int, default=4)
87 parser.add_argument("--thresh_size", type=int, default=100)
88 parser.add_argument("--thresh_charge_ratio", type=float, default=0.25)
89 parser.add_argument("--selection_criteria", action="store")
90 args = parser.parse_args()
91
92 for seq in disruptin_finder(**vars(args)):
93 SeqIO.write(seq, sys.stdout, "fasta")