comparison vsnp_statistics.py @ 8:1becb6606626 draft

Uploaded
author greg
date Mon, 02 Aug 2021 13:34:09 +0000
parents d0fbdeaaa488
children 377c1a96aae9
comparing 7:de2af65c4633 with 8:1becb6606626
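Summary of the change, as read from the diff below: the script moves from processing a directory of samples per run to processing a single sample (one FASTQ file, or a read pair) per invocation. The csv import and the per-sample DataFrame accumulation are dropped; a Statistics value class plus get_statistics and accrue_statistics helpers are added; output_statistics now writes one tab-separated row directly; the --input_idxstats_dir, --input_metrics_dir, --input_reads_dir and --list_paired arguments are removed; and --vsnp_azc is renamed --vsnp_azc_metrics.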
--- a/vsnp_statistics.py
+++ b/vsnp_statistics.py
@@ -1,16 +1,27 @@
 #!/usr/bin/env python
 
 import argparse
-import csv
 import gzip
 import os
 from functools import partial
 
 import numpy
 import pandas
 from Bio import SeqIO
+
+
+class Statistics:
+
+    def __init__(self, reference, fastq_file, file_size, total_reads, mean_read_length, mean_read_quality, reads_passing_q30):
+        self.reference = reference
+        self.fastq_file = fastq_file
+        self.file_size = file_size
+        self.total_reads = total_reads
+        self.mean_read_length = mean_read_length
+        self.mean_read_quality = mean_read_quality
+        self.reads_passing_q30 = reads_passing_q30
 
 
 def nice_size(size):
     # Returns a readably formatted string with the size
     words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
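The Statistics class added above is a plain value holder: get_statistics builds one instance per FASTQ file, and output_statistics reads its attributes in place of the per-sample DataFrame cells used before this change. A minimal sketch of that contract (all values here are made up for illustration):

    stats = Statistics(reference='AF2122', fastq_file='sample_R1.fastq.gz', file_size='245.7 KB',
                       total_reads=9125, mean_read_length='98.4', mean_read_quality='33.2',
                       reads_passing_q30='      0.97')
    # Attribute access replaces the DataFrame .at[] lookups of the old code.
    print(stats.reference, stats.mean_read_quality)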
@@ -30,85 +41,118 @@
                 return "%s%d bytes" % (prefix, size)
             return "%s%.1f %s" % (prefix, size, word)
     return '??? bytes'
 
 
-def output_statistics(fastq_files, idxstats_files, metrics_files, output_file, gzipped, dbkey):
-    # Produce an Excel spreadsheet that
-    # contains a row for each sample.
-    columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
-               'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
-               'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count']
-    data_frames = []
-    for i, fastq_file in enumerate(fastq_files):
-        idxstats_file = idxstats_files[i]
-        metrics_file = metrics_files[i]
-        file_name_base = os.path.basename(fastq_file)
-        # Read fastq_file into a data frame.
-        _open = partial(gzip.open, mode='rt') if gzipped else open
-        with _open(fastq_file) as fh:
-            identifiers = []
-            seqs = []
-            letter_annotations = []
-            for seq_record in SeqIO.parse(fh, "fastq"):
-                identifiers.append(seq_record.id)
-                seqs.append(seq_record.seq)
-                letter_annotations.append(seq_record.letter_annotations["phred_quality"])
-        # Convert lists to Pandas series.
-        s1 = pandas.Series(identifiers, name='id')
-        s2 = pandas.Series(seqs, name='seq')
-        # Gather Series into a data frame.
-        fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
-        total_reads = int(len(fastq_df.index) / 4)
-        current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns)
-        # Reference
-        current_sample_df.at[file_name_base, 'Reference'] = dbkey
-        # File Size
-        current_sample_df.at[file_name_base, 'File Size'] = nice_size(os.path.getsize(fastq_file))
-        # Mean Read Length
-        sampling_size = 10000
-        if sampling_size > total_reads:
-            sampling_size = total_reads
-        fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
-        dict_mean = {}
-        list_length = []
-        i = 0
-        for id, seq, in fastq_df.iterrows():
-            dict_mean[id] = numpy.mean(letter_annotations[i])
-            list_length.append(len(seq.array[0]))
-            i += 1
-        current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length)
-        # Mean Read Quality
-        df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
-        current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean()
-        # Reads Passing Q30
-        reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
-        reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
-        current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30
+def get_statistics(dbkey, fastq_file, gzipped):
+    sampling_size = 10000
+    # Read fastq_file into a data frame to
+    # get the phred quality scores.
+    _open = partial(gzip.open, mode='rt') if gzipped else open
+    with _open(fastq_file) as fh:
+        identifiers = []
+        seqs = []
+        letter_annotations = []
+        for seq_record in SeqIO.parse(fh, "fastq"):
+            identifiers.append(seq_record.id)
+            seqs.append(seq_record.seq)
+            letter_annotations.append(seq_record.letter_annotations["phred_quality"])
+    # Convert lists to Pandas series.
+    s1 = pandas.Series(identifiers, name='id')
+    s2 = pandas.Series(seqs, name='seq')
+    # Gather Series into a data frame.
+    fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
+    # Starting at row 3, keep every 4th row and
+    # randomly sample the specified number of rows.
+    file_size = nice_size(os.path.getsize(fastq_file))
+    total_reads = int(len(fastq_df.index) / 4)
+    # Mean Read Length
+    if sampling_size > total_reads:
+        sampling_size = total_reads
+    fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
+    dict_mean = {}
+    list_length = []
+    i = 0
+    for id, seq, in fastq_df.iterrows():
+        dict_mean[id] = numpy.mean(letter_annotations[i])
+        list_length.append(len(seq.array[0]))
+        i += 1
+    mean_read_length = '%.1f' % numpy.mean(list_length)
+    # Mean Read Quality
+    df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
+    mean_read_quality = '%.1f' % df_mean['ave'].mean()
+    # Reads Passing Q30
+    reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
+    reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
+    stats = Statistics(dbkey, os.path.basename(fastq_file), file_size, total_reads, mean_read_length,
+                       mean_read_quality, reads_passing_q30)
+    return stats
+
+
+def accrue_statistics(dbkey, read1, read2, gzipped):
+    read1_stats = get_statistics(dbkey, read1, gzipped)
+    if read2 is None:
+        read2_stats = None
+    else:
+        read2_stats = get_statistics(dbkey, read2, gzipped)
+    return read1_stats, read2_stats
+
+
+def output_statistics(read1_stats, read2_stats, idxstats_file, metrics_file, output_file):
+    paired_reads = read2_stats is not None
+    if paired_reads:
+        columns = ['Reference', 'Read1 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
+                   'Reads Passing Q30', 'Read2 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality',
+                   'Reads Passing Q30', 'Total Reads', 'All Mapped Reads', 'Unmapped Reads',
+                   'Unmapped Reads Percentage of Total', 'Reference with Coverage', 'Average Depth of Coverage',
+                   'Good SNP Count']
+    else:
+        columns = ['Reference', 'FASTQ', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30',
+                   'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total',
+                   'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count']
+    with open(output_file, "w") as outfh:
+        outfh.write("%s\n" % "\t".join(columns))
+        line_items = []
+        # Get the current stats and associated files.
+        # Get and output the statistics.
+        line_items.append(read1_stats.reference)
+        line_items.append(read1_stats.fastq_file)
+        line_items.append(read1_stats.file_size)
+        if paired_reads:
+            line_items.append(read1_stats.total_reads)
+        line_items.append(read1_stats.mean_read_length)
+        line_items.append(read1_stats.mean_read_quality)
+        line_items.append(read1_stats.reads_passing_q30)
+        if paired_reads:
+            line_items.append(read2_stats.fastq_file)
+            line_items.append(read2_stats.file_size)
+            line_items.append(read2_stats.total_reads)
+            line_items.append(read2_stats.mean_read_length)
+            line_items.append(read2_stats.mean_read_quality)
+            line_items.append(read2_stats.reads_passing_q30)
         # Total Reads
-        current_sample_df.at[file_name_base, 'Total Reads'] = total_reads
+        if paired_reads:
+            total_reads = read1_stats.total_reads + read2_stats.total_reads
+        else:
+            total_reads = read1_stats.total_reads
+        line_items.append(total_reads)
         # All Mapped Reads
         all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file)
-        current_sample_df.at[file_name_base, 'All Mapped Reads'] = all_mapped_reads
-        # Unmapped Reads
-        current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads
+        line_items.append(all_mapped_reads)
+        line_items.append(unmapped_reads)
         # Unmapped Reads Percentage of Total
         if unmapped_reads > 0:
             unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads)
         else:
             unmapped_reads_percentage = 0
-        current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage
+        line_items.append(unmapped_reads_percentage)
         # Reference with Coverage
         ref_with_coverage, avg_depth_of_coverage, good_snp_count = process_metrics_file(metrics_file)
-        current_sample_df.at[file_name_base, 'Reference with Coverage'] = ref_with_coverage
-        # Average Depth of Coverage
-        current_sample_df.at[file_name_base, 'Average Depth of Coverage'] = avg_depth_of_coverage
-        # Good SNP Count
-        current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count
-        data_frames.append(current_sample_df)
-    output_df = pandas.concat(data_frames)
-    output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\')
+        line_items.append(ref_with_coverage)
+        line_items.append(avg_depth_of_coverage)
+        line_items.append(good_snp_count)
+        outfh.write('%s\n' % '\t'.join(str(x) for x in line_items))
 
 
 def process_idxstats_file(idxstats_file):
     all_mapped_reads = 0
     unmapped_reads = 0
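Two details of the new get_statistics are easy to misread. First, fastq_df is built from SeqIO records, one row per read, so the divide-by-four read count and the iloc[3::4] slice appear to be carried over from line-oriented FASTQ parsing rather than derived from this data frame. Second, 'Reads Passing Q30' is the fraction of the sampled reads whose mean Phred score is at least 30, not a percentage of the whole file. A self-contained sketch of that Q30 arithmetic on toy data:

    import numpy
    import pandas

    # Per-base Phred scores for three toy reads (stand-ins for letter_annotations).
    letter_annotations = [[32, 34, 31], [12, 15, 11], [30, 30, 30]]
    # Mean quality per read, as in get_statistics.
    dict_mean = {i: numpy.mean(scores) for i, scores in enumerate(letter_annotations)}
    df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
    reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
    # 2 of the 3 reads have mean quality >= 30.
    print('{:10.2f}'.format(reads_gt_q30 / len(letter_annotations)))  # '      0.67'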
@@ -148,46 +192,19 @@
 
 parser = argparse.ArgumentParser()
 
 parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey')
 parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped')
-parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory')
-parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory')
-parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory')
-parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads')
 parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file')
 parser.add_argument('--read1', action='store', dest='read1', help='Required: single read')
 parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read')
 parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats')
-parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage')
+parser.add_argument('--vsnp_azc_metrics', action='store', dest='vsnp_azc_metrics', help='Output of vsnp_add_zero_coverage')
 
 args = parser.parse_args()
 
-fastq_files = []
+stats_list = []
 idxstats_files = []
 metrics_files = []
 # Accumulate inputs.
-if args.read1 is not None:
-    # The inputs are not dataset collections, so
-    # read1, read2 (possibly) and vsnp_azc will also
-    # not be None.
-    fastq_files.append(args.read1)
-    idxstats_files.append(args.samtools_idxstats)
-    metrics_files.append(args.vsnp_azc)
-    if args.read2 is not None:
-        fastq_files.append(args.read2)
-        idxstats_files.append(args.samtools_idxstats)
-        metrics_files.append(args.vsnp_azc)
-else:
-    for file_name in sorted(os.listdir(args.input_reads_dir)):
-        fastq_files.append(os.path.join(args.input_reads_dir, file_name))
-    for file_name in sorted(os.listdir(args.input_idxstats_dir)):
-        idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
-        if args.list_paired:
-            # Add the idxstats file for reverse.
-            idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name))
-    for file_name in sorted(os.listdir(args.input_metrics_dir)):
-        metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
-        if args.list_paired:
-            # Add the metrics file for reverse.
-            metrics_files.append(os.path.join(args.input_metrics_dir, file_name))
-output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)
+read1_stats, read2_stats = accrue_statistics(args.dbkey, args.read1, args.read2, args.gzipped)
+output_statistics(read1_stats, read2_stats, args.samtools_idxstats, args.vsnp_azc_metrics, args.output)
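After this change the script handles exactly one sample (single- or paired-end) per run; the dataset-collection directory walking is gone. A minimal sketch of the new flow using the functions above, with hypothetical file names:

    # Paired-end sample; pass None as read2 for single-end data.
    read1_stats, read2_stats = accrue_statistics('AF2122', 'sample_R1.fastq.gz', 'sample_R2.fastq.gz', True)
    output_statistics(read1_stats, read2_stats, 'sample_idxstats.tabular', 'sample_metrics.tabular', 'statistics.tabular')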