# HG changeset patch # User iuc # Date 1630064739 0 # Node ID 6853676d2bae8e2c057d76cf690abd1a4684d3b7 # Parent 9b4011d5e8cb53e33d5ffd4aaad3b458b876d49d "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 92f46d4bb55b582f05ac3c4b094307f114cbf98f" diff -r 9b4011d5e8cb -r 6853676d2bae test-data/13-1941-6_S4_L001_R1_600000.fastq.gz Binary file test-data/13-1941-6_S4_L001_R1_600000.fastq.gz has changed diff -r 9b4011d5e8cb -r 6853676d2bae test-data/13-1941-6_S4_L001_R2_600000.fastq.gz Binary file test-data/13-1941-6_S4_L001_R2_600000.fastq.gz has changed diff -r 9b4011d5e8cb -r 6853676d2bae test-data/add_zc_metrics.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_zc_metrics.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,3 @@ +# File Number of Good SNPs Average Coverage Genome Coverage +MarkDuplicates on data 4_ MarkDuplicates BAM output 10.338671 98.74% +VCFfilter_ on data 7 611 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/add_zc_metrics1.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_zc_metrics1.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,3 @@ +# File Number of Good SNPs Average Coverage Genome Coverage +Mcap_Deer_DE_SRR650221_fastq_gz 0.439436 8.27% +Mcap_Deer_DE_SRR650221_fastq_gz 36 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/add_zc_metrics2.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_zc_metrics2.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,3 @@ +# File Number of Good SNPs Average Coverage Genome Coverage +MarkDuplicates on data 4_ MarkDuplicates BAM output 10.338671 98.74% +VCFfilter_ on data 7 611 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/add_zc_metrics3.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_zc_metrics3.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,3 @@ +# File Number of Good SNPs Average Coverage Genome Coverage +13-1941-6_S4_L001_R1_600000_fastq_gz 0.001252 0.13% +13-1941-6_S4_L001_R1_600000_fastq_gz 0 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/add_zc_metrics4.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_zc_metrics4.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,3 @@ +# File Number of Good SNPs Average Coverage Genome Coverage +Mcap_Deer_DE_SRR650221_fastq_gz 0.439436 8.27% +Mcap_Deer_DE_SRR650221_fastq_gz 36 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/add_zc_metrics5.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/add_zc_metrics5.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,3 @@ +# File Number of Good SNPs Average Coverage Genome Coverage +13-1941-6_S4_L001_600000_fastq 0.002146 0.16% +13-1941-6_S4_L001_600000_fastq 0 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/samtools_idxstats.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samtools_idxstats.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,2 @@ +NC_002945.4 4349904 45 4047 +* 0 0 5 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/samtools_idxstats1.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samtools_idxstats1.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,2 @@ +NC_002945.4 4349904 17063 0 +* 0 0 223 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/samtools_idxstats2.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samtools_idxstats2.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,2 @@ +NC_002945.4 4349904 45 4047 +* 0 0 5 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/samtools_idxstats3.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samtools_idxstats3.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,2 @@ +NC_002945.4 4349904 24 0 +* 0 0 2 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/samtools_idxstats4.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samtools_idxstats4.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,2 @@ +NC_002945.4 4349904 17063 0 +* 0 0 223 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/samtools_idxstats5.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/samtools_idxstats5.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,2 @@ +NC_002945.4 4349904 46 2 +* 0 0 4 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/vsnp_statistics1.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vsnp_statistics1.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,2 @@ + Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count +Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/vsnp_statistics2.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vsnp_statistics2.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,3 @@ + Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count +13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 45 5 0.20 98.74% 10.338671 611 +13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 45 5 0.20 98.74% 10.338671 611 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/vsnp_statistics3.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vsnp_statistics3.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,3 @@ + Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count +13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 24 2 0.08 0.13% 0.001252 0 +Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36 diff -r 9b4011d5e8cb -r 6853676d2bae test-data/vsnp_statistics4.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vsnp_statistics4.tabular Fri Aug 27 11:45:39 2021 +0000 @@ -0,0 +1,3 @@ + Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count +13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 46 4 0.16 0.16% 0.002146 0 +13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 46 4 0.16 0.16% 0.002146 0 diff -r 9b4011d5e8cb -r 6853676d2bae vsnp_determine_ref_from_data.xml --- a/vsnp_determine_ref_from_data.xml Thu Jan 14 09:11:43 2021 +0000 +++ b/vsnp_determine_ref_from_data.xml Fri Aug 27 11:45:39 2021 +0000 @@ -1,11 +1,11 @@ - + from input data macros.xml - biopython - pyyaml + biopython + pyyaml size: + size = size / float(1024 ** ind) + if word == 'bytes': # No decimals for bytes + return "%s%d bytes" % (prefix, size) + return "%s%.1f %s" % (prefix, size, word) + return '??? bytes' + + +def output_statistics(fastq_files, idxstats_files, metrics_files, output_file, gzipped, dbkey): + # Produce an Excel spreadsheet that + # contains a row for each sample. + columns = ['Reference', 'File Size', 'Mean Read Length', 'Mean Read Quality', 'Reads Passing Q30', + 'Total Reads', 'All Mapped Reads', 'Unmapped Reads', 'Unmapped Reads Percentage of Total', + 'Reference with Coverage', 'Average Depth of Coverage', 'Good SNP Count'] + data_frames = [] + for i, fastq_file in enumerate(fastq_files): + idxstats_file = idxstats_files[i] + metrics_file = metrics_files[i] + file_name_base = os.path.basename(fastq_file) + # Read fastq_file into a data frame. + _open = partial(gzip.open, mode='rt') if gzipped else open + with _open(fastq_file) as fh: + identifiers = [] + seqs = [] + letter_annotations = [] + for seq_record in SeqIO.parse(fh, "fastq"): + identifiers.append(seq_record.id) + seqs.append(seq_record.seq) + letter_annotations.append(seq_record.letter_annotations["phred_quality"]) + # Convert lists to Pandas series. + s1 = pandas.Series(identifiers, name='id') + s2 = pandas.Series(seqs, name='seq') + # Gather Series into a data frame. + fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id']) + total_reads = int(len(fastq_df.index) / 4) + current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns) + # Reference + current_sample_df.at[file_name_base, 'Reference'] = dbkey + # File Size + current_sample_df.at[file_name_base, 'File Size'] = nice_size(os.path.getsize(fastq_file)) + # Mean Read Length + sampling_size = 10000 + if sampling_size > total_reads: + sampling_size = total_reads + fastq_df = fastq_df.iloc[3::4].sample(sampling_size) + dict_mean = {} + list_length = [] + i = 0 + for id, seq, in fastq_df.iterrows(): + dict_mean[id] = numpy.mean(letter_annotations[i]) + list_length.append(len(seq.array[0])) + i += 1 + current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length) + # Mean Read Quality + df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave']) + current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean() + # Reads Passing Q30 + reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30]) + reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size) + current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30 + # Total Reads + current_sample_df.at[file_name_base, 'Total Reads'] = total_reads + # All Mapped Reads + all_mapped_reads, unmapped_reads = process_idxstats_file(idxstats_file) + current_sample_df.at[file_name_base, 'All Mapped Reads'] = all_mapped_reads + # Unmapped Reads + current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads + # Unmapped Reads Percentage of Total + if unmapped_reads > 0: + unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads) + else: + unmapped_reads_percentage = 0 + current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage + # Reference with Coverage + ref_with_coverage, avg_depth_of_coverage, good_snp_count = process_metrics_file(metrics_file) + current_sample_df.at[file_name_base, 'Reference with Coverage'] = ref_with_coverage + # Average Depth of Coverage + current_sample_df.at[file_name_base, 'Average Depth of Coverage'] = avg_depth_of_coverage + # Good SNP Count + current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count + data_frames.append(current_sample_df) + output_df = pandas.concat(data_frames) + output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\') + + +def process_idxstats_file(idxstats_file): + all_mapped_reads = 0 + unmapped_reads = 0 + with open(idxstats_file, "r") as fh: + for i, line in enumerate(fh): + line = line.rstrip('\r\n') + items = line.split("\t") + if i == 0: + # NC_002945.4 4349904 213570 4047 + all_mapped_reads = int(items[2]) + elif i == 1: + # * 0 0 82774 + unmapped_reads = int(items[3]) + return all_mapped_reads, unmapped_reads + + +def process_metrics_file(metrics_file): + ref_with_coverage = '0%' + avg_depth_of_coverage = 0 + good_snp_count = 0 + with open(metrics_file, "r") as ifh: + for i, line in enumerate(ifh): + if i == 0: + # Skip comments. + continue + line = line.rstrip('\r\n') + items = line.split("\t") + if i == 1: + # MarkDuplicates 10.338671 98.74% + ref_with_coverage = items[3] + avg_depth_of_coverage = items[2] + elif i == 2: + # VCFfilter 611 + good_snp_count = items[1] + return ref_with_coverage, avg_depth_of_coverage, good_snp_count + + +parser = argparse.ArgumentParser() + +parser.add_argument('--dbkey', action='store', dest='dbkey', help='Reference dbkey') +parser.add_argument('--gzipped', action='store_true', dest='gzipped', required=False, default=False, help='Input files are gzipped') +parser.add_argument('--input_idxstats_dir', action='store', dest='input_idxstats_dir', required=False, default=None, help='Samtools idxstats input directory') +parser.add_argument('--input_metrics_dir', action='store', dest='input_metrics_dir', required=False, default=None, help='vSNP add zero coverage metrics input directory') +parser.add_argument('--input_reads_dir', action='store', dest='input_reads_dir', required=False, default=None, help='Samples input directory') +parser.add_argument('--list_paired', action='store_true', dest='list_paired', required=False, default=False, help='Input samples is a list of paired reads') +parser.add_argument('--output', action='store', dest='output', help='Output Excel statistics file') +parser.add_argument('--read1', action='store', dest='read1', help='Required: single read') +parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read') +parser.add_argument('--samtools_idxstats', action='store', dest='samtools_idxstats', help='Output of samtools_idxstats') +parser.add_argument('--vsnp_azc', action='store', dest='vsnp_azc', help='Output of vsnp_add_zero_coverage') + +args = parser.parse_args() + +fastq_files = [] +idxstats_files = [] +metrics_files = [] +# Accumulate inputs. +if args.read1 is not None: + # The inputs are not dataset collections, so + # read1, read2 (possibly) and vsnp_azc will also + # not be None. + fastq_files.append(args.read1) + idxstats_files.append(args.samtools_idxstats) + metrics_files.append(args.vsnp_azc) + if args.read2 is not None: + fastq_files.append(args.read2) + idxstats_files.append(args.samtools_idxstats) + metrics_files.append(args.vsnp_azc) +else: + for file_name in sorted(os.listdir(args.input_reads_dir)): + fastq_files.append(os.path.join(args.input_reads_dir, file_name)) + for file_name in sorted(os.listdir(args.input_idxstats_dir)): + idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name)) + if args.list_paired: + # Add the idxstats file for reverse. + idxstats_files.append(os.path.join(args.input_idxstats_dir, file_name)) + for file_name in sorted(os.listdir(args.input_metrics_dir)): + metrics_files.append(os.path.join(args.input_metrics_dir, file_name)) + if args.list_paired: + # Add the metrics file for reverse. + metrics_files.append(os.path.join(args.input_metrics_dir, file_name)) +output_statistics(fastq_files, idxstats_files, metrics_files, args.output, args.gzipped, args.dbkey)