Mercurial > repos > iuc > fasta_stats
comparison fasta-stats.py @ 4:0dbb995c7d35 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/fasta_stats/ commit 50f5cce5a8c11001e2c59600a2b99a4243b6d06f"
author | iuc |
---|---|
date | Thu, 18 Nov 2021 20:56:57 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:56022eb50bbd | 4:0dbb995c7d35 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 | |
4 # python version of fasta-stats with some extra features | |
5 # written by anmol.kiran@gmail.com | |
6 # git: @codemeleon | |
7 # date: 10/11/2021 | |
8 | |
9 import argparse | |
10 import re | |
11 from os import path | |
12 | |
13 import numpy as np | |
14 from Bio import SeqIO | |
15 | |
16 | |
17 def calculate_NG50(estimated_genome, total_length, sequence_lengths): | |
18 temp = 0 | |
19 teoretical_NG50 = estimated_genome / 2.0 | |
20 NG50 = 0 | |
21 for seq in sequence_lengths: | |
22 temp += seq | |
23 if teoretical_NG50 < temp: | |
24 NG50 = seq | |
25 break | |
26 return NG50 | |
27 | |
28 | |
29 def run(fasta, stats_output, gaps_output, genome_size): | |
30 """Generates scaffold statistics.""" | |
31 if not fasta: | |
32 exit("Input file not given.") | |
33 if not path.isfile(fasta): | |
34 exit(f"{fasta} path does not exist.") | |
35 | |
36 seq_len = {} | |
37 bases_global = {"A": 0, "N": 0, "T": 0, "C": 0, "G": 0} | |
38 bases_seq = {} | |
39 seq_id_Ngaprange = {} | |
40 nstart = 0 | |
41 contigs_len = [] | |
42 gap_count = 0 | |
43 for seq_record in SeqIO.parse(fasta, "fasta"): | |
44 seq = str(seq_record.seq).upper() | |
45 # print(len(seq)) | |
46 seq_len[seq_record.id] = len(seq) | |
47 | |
48 # NOTE: Nucleotide count | |
49 bases_seq[seq_record.id] = { | |
50 "A": seq.count("A"), | |
51 "N": seq.count("N"), | |
52 "T": seq.count("T"), | |
53 "C": seq.count("C"), | |
54 "G": seq.count("G"), | |
55 } | |
56 bases_global["A"] += bases_seq[seq_record.id]["A"] | |
57 bases_global["N"] += bases_seq[seq_record.id]["N"] | |
58 bases_global["T"] += bases_seq[seq_record.id]["T"] | |
59 bases_global["C"] += bases_seq[seq_record.id]["C"] | |
60 bases_global["G"] += bases_seq[seq_record.id]["G"] | |
61 | |
62 # NOTE: Gap count and their range | |
63 range_gen = re.finditer("N+", seq) | |
64 n_range = [match.span() for match in range_gen] | |
65 for n_rng in n_range: | |
66 if n_rng[0] == 0 or n_rng[1] == seq_len[seq_record.id]: | |
67 continue | |
68 else: | |
69 gap_count += 1 | |
70 | |
71 # NOTE: Contigs, their lenths from scaffold and their N gap range | |
72 seq_id_Ngaprange[seq_record.id] = n_range | |
73 n_range_len = len(n_range) | |
74 if n_range_len > 0: | |
75 n_range = ( | |
76 [(0, 0)] + n_range + [(seq_len[seq_record.id], seq_len[seq_record.id])] | |
77 ) | |
78 for idx in range(n_range_len + 1): | |
79 nstart = n_range[idx][1] | |
80 nend = n_range[idx + 1][0] | |
81 con_len = nend - nstart | |
82 if con_len: | |
83 contigs_len.append(con_len) | |
84 else: | |
85 contigs_len.append(len(seq)) | |
86 | |
87 # NOTE: Scaffold statistics | |
88 SEQ_LEN_LIST = sorted(seq_len.values(), reverse=True) | |
89 scaffold_lens = np.array(SEQ_LEN_LIST) | |
90 scaffold_lens_sum = np.cumsum(scaffold_lens) | |
91 N50_len = scaffold_lens_sum[-1] * 0.5 | |
92 N50_idx = np.where(scaffold_lens_sum > N50_len)[0][0] | |
93 N90_len = scaffold_lens_sum[-1] * 0.9 | |
94 N90_idx = np.where(scaffold_lens_sum > N90_len)[0][0] | |
95 NG50 = calculate_NG50(genome_size, scaffold_lens_sum[-1], scaffold_lens) | |
96 | |
97 # NOTE: Contig statistics | |
98 seq_len_list = sorted(contigs_len, reverse=True) | |
99 contigs_len = np.array(seq_len_list) | |
100 contigs_len_sum = np.cumsum(contigs_len) | |
101 n50_len = contigs_len_sum[-1] * 0.5 | |
102 n50_idx = np.where(contigs_len_sum > n50_len)[0][0] | |
103 n90_len = contigs_len_sum[-1] * 0.9 | |
104 n90_idx = np.where(contigs_len_sum > n90_len)[0][0] | |
105 ng50 = calculate_NG50(genome_size, contigs_len_sum[-1], contigs_len) | |
106 | |
107 with open(stats_output, "w") as soutput: | |
108 soutput.write("{}\t{}\n".format("Scaffold L50", N50_idx + 1)) | |
109 soutput.write("{}\t{}\n".format("Scaffold N50", SEQ_LEN_LIST[N50_idx])) | |
110 soutput.write("{}\t{}\n".format("Scaffold L90", N90_idx + 1)) | |
111 soutput.write("{}\t{}\n".format("Scaffold N90", SEQ_LEN_LIST[N90_idx])) | |
112 if genome_size != 0: | |
113 soutput.write("{}\t{}\n".format("Scaffold NG50", NG50)) | |
114 soutput.write("{}\t{}\n".format("Scaffold len_max", SEQ_LEN_LIST[0])) | |
115 soutput.write("{}\t{}\n".format("Scaffold len_min", SEQ_LEN_LIST[-1])) | |
116 soutput.write( | |
117 "{}\t{}\n".format("Scaffold len_mean", int(np.mean(SEQ_LEN_LIST))) | |
118 ) | |
119 soutput.write( | |
120 "{}\t{}\n".format("Scaffold len_median", int(np.median(SEQ_LEN_LIST))) | |
121 ) | |
122 soutput.write("{}\t{}\n".format("Scaffold len_std", int(np.std(SEQ_LEN_LIST)))) | |
123 soutput.write("{}\t{}\n".format("Scaffold num_A", bases_global["A"])) | |
124 soutput.write("{}\t{}\n".format("Scaffold num_T", bases_global["T"])) | |
125 soutput.write("{}\t{}\n".format("Scaffold num_C", bases_global["C"])) | |
126 soutput.write("{}\t{}\n".format("Scaffold num_G", bases_global["G"])) | |
127 soutput.write("{}\t{}\n".format("Scaffold num_N", bases_global["N"])) | |
128 soutput.write("{}\t{}\n".format("Scaffold num_bp", scaffold_lens_sum[-1])) | |
129 soutput.write( | |
130 "{}\t{}\n".format( | |
131 "Scaffold num_bp_not_N", scaffold_lens_sum[-1] - bases_global["N"] | |
132 ) | |
133 ) | |
134 soutput.write("{}\t{}\n".format("Scaffold num_seq", len(SEQ_LEN_LIST))) | |
135 soutput.write( | |
136 "{}\t{:.2f}\n".format( | |
137 "Scaffold GC content overall", | |
138 ( | |
139 (bases_global["G"] + bases_global["C"]) | |
140 * 100.0 | |
141 / scaffold_lens_sum[-1] | |
142 ), | |
143 ) | |
144 ) | |
145 | |
146 soutput.write("{}\t{}\n".format("Contig L50", n50_idx + 1)) | |
147 soutput.write("{}\t{}\n".format("Contig N50", seq_len_list[n50_idx])) | |
148 soutput.write("{}\t{}\n".format("Contig L90", n90_idx + 1)) | |
149 soutput.write("{}\t{}\n".format("Contig N90", seq_len_list[n90_idx])) | |
150 if genome_size != 0: | |
151 soutput.write("{}\t{}\n".format("Contig NG50", ng50)) | |
152 soutput.write("{}\t{}\n".format("Contig len_max", seq_len_list[0])) | |
153 soutput.write("{}\t{}\n".format("Contig len_min", seq_len_list[-1])) | |
154 soutput.write("{}\t{}\n".format("Contig len_mean", int(np.mean(seq_len_list)))) | |
155 soutput.write( | |
156 "{}\t{}\n".format("Contig len_median", int(np.median(seq_len_list))) | |
157 ) | |
158 soutput.write("{}\t{}\n".format("Contig len_std", int(np.std(seq_len_list)))) | |
159 soutput.write("{}\t{}\n".format("Contig num_bp", contigs_len_sum[-1])) | |
160 soutput.write("{}\t{}\n".format("Contig num_seq", len(contigs_len_sum))) | |
161 soutput.write("{}\t{}\n".format("Number of gaps", gap_count)) | |
162 if gaps_output is not None: | |
163 # NOTE: generate gaps statistics file | |
164 with open(gaps_output, "w") as goutput: | |
165 for key in seq_id_Ngaprange: | |
166 for rng in seq_id_Ngaprange[key]: | |
167 goutput.write("{}\t{}\t{}\n".format(key, rng[0], rng[1])) | |
168 | |
169 | |
170 if __name__ == "__main__": | |
171 parser = argparse.ArgumentParser() | |
172 parser.add_argument("-f", "--fasta", required=True, help="FASTA file") | |
173 parser.add_argument( | |
174 "-z", | |
175 "--genome_size", | |
176 required=False, | |
177 type=int, | |
178 help="If provided, the NG50 statistic will be computed", | |
179 default=0, | |
180 ) | |
181 parser.add_argument( | |
182 "-s", | |
183 "--stats_output", | |
184 required=True, | |
185 help="File to store the general statistics", | |
186 ) | |
187 parser.add_argument( | |
188 "-r", | |
189 "--gaps_output", | |
190 required=False, | |
191 help="File to store the gaps statistics", | |
192 default=None, | |
193 ) | |
194 args = parser.parse_args() | |
195 | |
196 run( | |
197 args.fasta, | |
198 args.stats_output, | |
199 args.gaps_output, | |
200 args.genome_size, | |
201 ) |