# HG changeset patch # User greg # Date 1627911249 0 # Node ID 1becb66066260a5cfb925f6eeb3b05ff74eb2738 # Parent de2af65c46336340b2d785b61ba7256462781a5b Uploaded diff -r de2af65c4633 -r 1becb6606626 macros.xml --- a/macros.xml Thu Jul 22 18:05:22 2021 +0000 +++ b/macros.xml Mon Aug 02 13:34:09 2021 +0000 @@ -1,7 +1,31 @@ 1.0 - 19.09 + 21.05 + + biopython + + + numpy + + + openpyxl + + + pandas + + + pysam + + + pyyaml + + + xlrd + + + xlsxwriter + diff -r de2af65c4633 -r 1becb6606626 test-data/add_zc_metrics3.tabular --- a/test-data/add_zc_metrics3.tabular Thu Jul 22 18:05:22 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -# File Number of Good SNPs Average Coverage Genome Coverage -13-1941-6_S4_L001_R1_600000_fastq_gz 0.001252 0.13% -13-1941-6_S4_L001_R1_600000_fastq_gz 0 diff -r de2af65c4633 -r 1becb6606626 test-data/add_zc_metrics4.tabular --- a/test-data/add_zc_metrics4.tabular Thu Jul 22 18:05:22 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -# File Number of Good SNPs Average Coverage Genome Coverage -Mcap_Deer_DE_SRR650221_fastq_gz 0.439436 8.27% -Mcap_Deer_DE_SRR650221_fastq_gz 36 diff -r de2af65c4633 -r 1becb6606626 test-data/samtools_idxstats3.tabular --- a/test-data/samtools_idxstats3.tabular Thu Jul 22 18:05:22 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -NC_002945.4 4349904 24 0 -* 0 0 2 diff -r de2af65c4633 -r 1becb6606626 test-data/samtools_idxstats4.tabular --- a/test-data/samtools_idxstats4.tabular Thu Jul 22 18:05:22 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -NC_002945.4 4349904 17063 0 -* 0 0 223 diff -r de2af65c4633 -r 1becb6606626 test-data/vsnp_statistics1.tabular --- a/test-data/vsnp_statistics1.tabular Thu Jul 22 18:05:22 2021 +0000 +++ b/test-data/vsnp_statistics1.tabular Mon Aug 02 13:34:09 2021 +0000 @@ -1,2 +1,2 @@ - Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count -Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36 +Reference FASTQ File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count +89 Mcap_Deer_DE_SRR650221_fastq_gz 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36 diff -r de2af65c4633 -r 1becb6606626 test-data/vsnp_statistics2.tabular --- a/test-data/vsnp_statistics2.tabular Thu Jul 22 18:05:22 2021 +0000 +++ b/test-data/vsnp_statistics2.tabular Mon Aug 02 13:34:09 2021 +0000 @@ -1,3 +1,2 @@ - Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count -13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 45 5 0.20 98.74% 10.338671 611 -13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 45 5 0.20 98.74% 10.338671 611 +Reference Read1 FASTQ File Size Reads Mean Read Length Mean Read Quality Reads Passing Q30 Read2 FASTQ File Size Reads Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count +89 13-1941-6_S4_L001_R1_600000_fastq_gz 8.7 KB 25 100.0 65.7 1.00 13-1941-6_S4_L001_R2_600000_fastq_gz 8.5 KB 25 100.0 66.3 1.00 50 45 5 0.10 98.74% 10.338671 611 diff -r de2af65c4633 -r 1becb6606626 test-data/vsnp_statistics3.tabular --- a/test-data/vsnp_statistics3.tabular Thu Jul 22 18:05:22 2021 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ - Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count -13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 24 2 0.08 0.13% 0.001252 0 -Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36 diff -r de2af65c4633 -r 1becb6606626 test-data/vsnp_statistics4.tabular --- a/test-data/vsnp_statistics4.tabular Thu Jul 22 18:05:22 2021 +0000 +++ b/test-data/vsnp_statistics4.tabular Mon Aug 02 13:34:09 2021 +0000 @@ -1,3 +1,2 @@ - Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count -13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 46 4 0.16 0.16% 0.002146 0 -13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 46 4 0.16 0.16% 0.002146 0 +Reference Read1 FASTQ File Size Reads Mean Read Length Mean Read Quality Reads Passing Q30 Read2 FASTQ File Size Reads Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count +89 13-1941-6_S4_L001_R1_600000_fastq_gz 8.7 KB 25 100.0 65.7 1.00 13-1941-6_S4_L001_R2_600000_fastq_gz 8.5 KB 25 100.0 66.3 1.00 50 46 4 0.08 0.16% 0.002146 0 diff -r de2af65c4633 -r 1becb6606626 vsnp_statistics.py --- a/vsnp_statistics.py Thu Jul 22 18:05:22 2021 +0000 +++ b/vsnp_statistics.py Mon Aug 02 13:34:09 2021 +0000 @@ -1,7 +1,6 @@ #!/usr/bin/env python import argparse -import csv import gzip import os from functools import partial @@ -11,6 +10,18 @@ from Bio import SeqIO +class Statistics: + + def __init__(self, reference, fastq_file, file_size, total_reads, mean_read_length, mean_read_quality, reads_passing_q30): + self.reference = reference + self.fastq_file = fastq_file + self.file_size = file_size + self.total_reads = total_reads + self.mean_read_length = mean_read_length + self.mean_read_quality = mean_read_quality + self.reads_passing_q30 = reads_passing_q30 + + def nice_size(size): # Returns a readably formatted string with the size words = ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB'] @@ -32,81 +43,114 @@ return '??? bytes' -def output_statistics(fastq_files, idxstats_files, metrics_files, output_file, gzipped, dbkey): - # Produce an Excel spreadsheet that - # contains a row for each sample. - columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30', - 'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total', - 'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count'] - data_frames = [] - for i, fastq_file in enumerate(fastq_files): - idxstats_file = idxstats_files[i] - metrics_file = metrics_files[i] - file_name_base = os.path.basename(fastq_file) - # Read fastq_file into a data frame. - _open = partial(gzip.open, mode='rt') if gzipped else open - with _open(fastq_file) as fh: - identifiers = [] - seqs = [] - letter_annotations = [] - for seq_record in SeqIO.parse(fh, "fastq"): - identifiers.append(seq_record.id) - seqs.append(seq_record.seq) - letter_annotations.append(seq_record.letter_annotations["phred_quality"]) - # Convert lists to Pandas series. - s1 = pandas.Series(identifiers, name='id') - s2 = pandas.Series(seqs, name='seq') - # Gather Series into a data frame. - fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id']) - total_reads = int(len(fastq_df.index) / 4) - current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns) - # Reference - current_sample_df.at[file_name_base, 'Reference'] = dbkey - # File Size - current_sample_df.at[file_name_base, 'File Size'] = nice_size(os.path.getsize(fastq_file)) - # Mean Read Length - sampling_size = 10000 - if sampling_size > total_reads: - sampling_size = total_reads - fastq_df = fastq_df.iloc[3::4].sample(sampling_size) - dict_mean = {} - list_length = [] - i = 0 - for id, seq, in fastq_df.iterrows(): - dict_mean[id] = numpy.mean(letter_annotations[i]) - list_length.append(len(seq.array[0])) - i += 1 - current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length) - # Mean Read Quality - df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave']) - current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean() - # Reads Passing Q30 - reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30]) - reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size) - current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30 +def get_statistics(dbkey, fastq_file, gzipped): + sampling_size = 10000 + # Read fastq_file into a data fram to + # get the phred quality scores. + _open = partial(gzip.open, mode='rt') if gzipped else open + with _open(fastq_file) as fh: + identifiers = [] + seqs = [] + letter_annotations = [] + for seq_record in SeqIO.parse(fh, "fastq"): + identifiers.append(seq_record.id) + seqs.append(seq_record.seq) + letter_annotations.append(seq_record.letter_annotations["phred_quality"]) + # Convert lists to Pandas series. + s1 = pandas.Series(identifiers, name='id') + s2 = pandas.Series(seqs, name='seq') + # Gather Series into a data frame. + fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id']) + # Starting at row 3, keep every 4 row + # random sample specified number of rows. + file_size = nice_size(os.path.getsize(fastq_file)) + total_reads = int(len(fastq_df.index) / 4) + # Mean Read Length + if sampling_size > total_reads: + sampling_size = total_reads + fastq_df = fastq_df.iloc[3::4].sample(sampling_size) + dict_mean = {} + list_length = [] + i = 0 + for id, seq, in fastq_df.iterrows(): + dict_mean[id] = numpy.mean(letter_annotations[i]) + list_length.append(len(seq.array[0])) + i += 1 + mean_read_length = '%.1f' % numpy.mean(list_length) + # Mean Read Quality + df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave']) + mean_read_quality = '%.1f' % df_mean['ave'].mean() + # Reads Passing Q30 + reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30]) + reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size) + stats = Statistics(dbkey, os.path.basename(fastq_file), file_size, total_reads, mean_read_length, + mean_read_quality, reads_passing_q30) + return stats + + +def accrue_statistics(dbkey, read1, read2, gzipped): + read1_stats = get_statistics(dbkey, read1, gzipped) + if read2 is None: + read2_stats = None + else: + read2_stats = get_statistics(dbkey, read2, gzipped) + return read1_stats, read2_stats + + +def output_statistics(read1_stats, read2_stats, idxstats_file, metrics_file, output_file): + paired_reads = read2_stats is not None + if paired_reads: + columns = ['Reference', 'Read1 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality', + 'Reads Passing Q30', 'Read2 FASTQ', 'File Size', 'Reads', 'Mean Read Length', 'Mean Read Quality', + 'Reads Passing Q30', 'Total Reads', 'All Mapped Reads', 'Unmapped Reads', + 'Unmapped Reads Percentage of Total', 'Reference with Coverage', 'Average Depth of Coverage', + 'Good SNP Count'] + else: + columns = ['Reference', 'FASTQ', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30', + 'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total', + 'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count'] + with open(output_file, "w") as outfh: + outfh.write("%s\n" % "\t".join(columns)) + line_items = [] + # Get the current stats and associated files. + # Get and output the statistics. + line_items.append(read1_stats.reference) + line_items.append(read1_stats.fastq_file) + line_items.append(read1_stats.file_size) + if paired_reads: + line_items.append(read1_stats.total_reads) + line_items.append(read1_stats.mean_read_length) + line_items.append(read1_stats.mean_read_quality) + line_items.append(read1_stats.reads_passing_q30) + if paired_reads: + line_items.append(read2_stats.fastq_file) + line_items.append(read2_stats.file_size) + line_items.append(read2_stats.total_reads) + line_items.append(read2_stats.mean_read_length) + line_items.append(read2_stats.mean_read_quality) + line_items.append(read2_stats.reads_passing_q30) # Total Reads - current_sample_df.at[file_name_base, 'Total Reads'] = total_reads + if paired_reads: + total_reads = read1_stats.total_reads + read2_stats.total_reads + else: + total_reads = read1_stats.total_reads + line_items.append(total_reads) # All Mapped Reads all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file) - current_sample_df.at[file_name_base, 'All Mapped Reads'] = all_mapped_reads - # Unmapped Reads - current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads + line_items.append(all_mapped_reads) + line_items.append(unmapped_reads) # Unmapped Reads Percentage of Total if unmapped_reads > 0: unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads) else: unmapped_reads_percentage = 0 - current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage + line_items.append(unmapped_reads_percentage) # Reference with Coverage ref_with_coverage, avg_depth_of_coverage, good_snp_count = process_metrics_file(metrics_file) - current_sample_df.at[file_name_base, 'Reference with Coverage'] = ref_with_coverage - # Average Depth of Coverage - current_sample_df.at[file_name_base, 'Average Depth of Coverage'] = avg_depth_of_coverage - # Good SNP Count - current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count - data_frames.append(current_sample_df) - output_df = pandas.concat(data_frames) - output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\') + line_items.append(ref_with_coverage) + line_items.append(avg_depth_of_coverage) + line_items.append(good_snp_count) + outfh.write('%s\n' % '\t'.join(str(x) for x in line_items)) def process_idxstats_file(idxstats_file): @@ -150,44 +194,17 @@ parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey') parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped') -parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory') -parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory') -parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory') -parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads') parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file') parser.add_argument('--read1', action='store', dest='read1', help='Required: single read') parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read') parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats') -parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage') +parser.add_argument('--vsnp_azc_metrics', action='store', dest='vsnp_azc_metrics', help='Output of vsnp_add_zero_coverage') args = parser.parse_args() -fastq_files = [] +stats_list = [] idxstats_files = [] metrics_files = [] # Accumulate inputs. -if args.read1 is not None: - # The inputs are not dataset collections, so - # read1, read2 (possibly) and vsnp_azc will also - # not be None. - fastq_files.append(args.read1) - idxstats_files.append(args.samtools_idxstats) - metrics_files.append(args.vsnp_azc) - if args.read2 is not None: - fastq_files.append(args.read2) - idxstats_files.append(args.samtools_idxstats) - metrics_files.append(args.vsnp_azc) -else: - for file_name in sorted(os.listdir(args.input_reads_dir)): - fastq_files.append(os.path.join(args.input_reads_dir, file_name)) - for file_name in sorted(os.listdir(args.input_idxstats_dir)): - idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name)) - if args.list_paired: - # Add the idxstats file for reverse. - idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name)) - for file_name in sorted(os.listdir(args.input_metrics_dir)): - metrics_files.append(os.path.join(args.input_metrics_dir, file_name)) - if args.list_paired: - # Add the metrics file for reverse. - metrics_files.append(os.path.join(args.input_metrics_dir, file_name)) -output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey) +read1_stats, read2_stats = accrue_statistics(args.dbkey, args.read1, args.read2, args.gzipped) +output_statistics(read1_stats, read2_stats, args.samtools_idxstats, args.vsnp_azc_metrics, args.output) diff -r de2af65c4633 -r 1becb6606626 vsnp_statistics.xml --- a/vsnp_statistics.xml Thu Jul 22 18:05:22 2021 +0000 +++ b/vsnp_statistics.xml Mon Aug 02 13:34:09 2021 +0000 @@ -4,122 +4,68 @@ macros.xml - biopython - numpy - openpyxl - pandas - xlrd + + + + + - - + + + - - - - - - - - - - - - - - - - + + - - - - - - - - - - - - - - - + + + + + + + + @@ -127,67 +73,32 @@ - - + - + - - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - + + @@ -195,8 +106,8 @@ **What it does** Accepts associated fastq files, SAMtools idxstats files and **vSNP: add zero coverage** metrics files and extracts information from them -to produce an Excel spreadsheet containing statistics for each sample. The samples can be single or paired reads, and all associated inputs -can be either single files or collections of files. The output statistics include reference, file size, mean read length, mean read quality, +to produce an Excel spreadsheet containing statistics for each sample. The samples can be a single read, a single set of paired reads in +separate datasets or collection of paired reads. The output statistics include reference, file size, mean read length, mean read quality, reads passing Q30, total reads, all mapped reads, unmapped reads, unmapped reads percentage of total, reference with coverage, average depth of coverage and good SNP count.