# HG changeset patch
# User greg
# Date 1623865127 0
# Node ID d0fbdeaaa48890c5fe2c87003e4a2882c4a9bd5e
# Parent 2d6c6b01319efe3ca865352d10f91a6eea61b567
"planemo upload for repository https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_statistics commit 770e89322a15829580ed9577a853660f63233f32"
diff -r 2d6c6b01319e -r d0fbdeaaa488 .shed.yml
--- a/.shed.yml Sun Jan 03 15:47:28 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-name: vsnp_statistics
-owner: greg
-description: |
- Contains a tool that produces an Excel spreadsheet containing statistics for samples and associated metrics files.
-homepage_url: https://github.com/USDA-VS/vSNP
-long_description: |
- Contains a tool that accepts a single fastqsanger sample, a set of paired read samples, or a collection of samples
- along with associated SAMtools idxstats and vSNP zero coverage metrics files and extracts information from them
- to produce an Excel spreadsheet containing statistics for each sample.
-remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_statistics
-type: unrestricted
-categories:
- - Sequence Analysis
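The removed long_description above summarizes what the tool consumes and emits: per-sample FASTQ input plus associated SAMtools idxstats and vSNP zero-coverage metrics files, combined into a single per-sample statistics table. A minimal sketch of that combination step, using the test-data fixtures added in this changeset; the helper functions and the exact parsing of the metrics file are illustrative assumptions, not the tool's actual code or command line:

# Illustrative sketch only: assemble the mapped/unmapped read counts and the
# coverage/SNP metrics that the statistics table reports, from the fixture
# files shipped with this changeset.
import pandas


def read_idxstats(path):
    """Return (all_mapped_reads, unmapped_reads) from `samtools idxstats` output."""
    mapped = unmapped = 0
    with open(path) as fh:
        for line in fh:
            ref_name, _ref_length, n_mapped, n_unmapped = line.rstrip("\r\n").split("\t")
            if ref_name == "*":
                # The unplaced-read line carries the unmapped count in the last column.
                unmapped = int(n_unmapped)
            else:
                mapped += int(n_mapped)
    return mapped, unmapped


def read_zero_coverage_metrics(path):
    """Return (average_coverage, genome_coverage, good_snp_count) from a vSNP metrics file."""
    with open(path) as fh:
        rows = [line.rstrip("\r\n").split("\t") for line in fh]
    # rows[0] is the '# File ...' header; rows[1] carries the coverage values and
    # rows[2] the good SNP count, as in the add_zc_metrics*.tabular fixtures.
    average_coverage, genome_coverage = rows[1][-2], rows[1][-1]
    good_snp_count = int(rows[2][-1])
    return average_coverage, genome_coverage, good_snp_count


mapped, unmapped = read_idxstats("test-data/samtools_idxstats1.tabular")
avg_cov, genome_cov, good_snps = read_zero_coverage_metrics("test-data/add_zc_metrics1.tabular")
stats_row = pandas.DataFrame(
    [{
        "All Mapped Reads": mapped,
        "Unmapped Reads": unmapped,
        "Reference with Coverage": genome_cov,
        "Average Depth of Coverage": avg_cov,
        "Good SNP Count": good_snps,
    }],
    index=["Mcap_Deer_DE_SRR650221_fastq_gz"],
)
print(stats_row.to_csv(sep="\t"))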
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/13-1941-6_S4_L001_R1_600000.fastq.gz
Binary file test-data/13-1941-6_S4_L001_R1_600000.fastq.gz has changed
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/13-1941-6_S4_L001_R2_600000.fastq.gz
Binary file test-data/13-1941-6_S4_L001_R2_600000.fastq.gz has changed
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/Mcap_Deer_DE_SRR650221.fastq.gz
Binary file test-data/Mcap_Deer_DE_SRR650221.fastq.gz has changed
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics.tabular
--- a/test-data/add_zc_metrics.tabular Sun Jan 03 15:47:28 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-# File Number of Good SNPs Average Coverage Genome Coverage
-MarkDuplicates on data 4_ MarkDuplicates BAM output 10.338671 98.74%
-VCFfilter_ on data 7 611
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics1.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics1.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File Number of Good SNPs Average Coverage Genome Coverage
+Mcap_Deer_DE_SRR650221_fastq_gz 0.439436 8.27%
+Mcap_Deer_DE_SRR650221_fastq_gz 36
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics2.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics2.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File Number of Good SNPs Average Coverage Genome Coverage
+MarkDuplicates on data 4_ MarkDuplicates BAM output 10.338671 98.74%
+VCFfilter_ on data 7 611
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics3.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics3.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File Number of Good SNPs Average Coverage Genome Coverage
+13-1941-6_S4_L001_R1_600000_fastq_gz 0.001252 0.13%
+13-1941-6_S4_L001_R1_600000_fastq_gz 0
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics4.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics4.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File Number of Good SNPs Average Coverage Genome Coverage
+Mcap_Deer_DE_SRR650221_fastq_gz 0.439436 8.27%
+Mcap_Deer_DE_SRR650221_fastq_gz 36
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/add_zc_metrics5.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/add_zc_metrics5.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+# File Number of Good SNPs Average Coverage Genome Coverage
+13-1941-6_S4_L001_600000_fastq 0.002146 0.16%
+13-1941-6_S4_L001_600000_fastq 0
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats.tabular
--- a/test-data/samtools_idxstats.tabular Sun Jan 03 15:47:28 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-NC_002945.4 4349904 45 4047
-* 0 0 5
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats1.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats1.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4 4349904 17063 0
+* 0 0 223
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats2.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats2.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4 4349904 45 4047
+* 0 0 5
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats3.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats3.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4 4349904 24 0
+* 0 0 2
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats4.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats4.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4 4349904 17063 0
+* 0 0 223
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/samtools_idxstats5.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/samtools_idxstats5.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+NC_002945.4 4349904 46 2
+* 0 0 4
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/vsnp_statistics.xlsx
Binary file test-data/vsnp_statistics.xlsx has changed
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/vsnp_statistics1.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics1.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,2 @@
+ Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
+Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/vsnp_statistics2.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics2.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+ Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
+13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 45 5 0.20 98.74% 10.338671 611
+13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 45 5 0.20 98.74% 10.338671 611
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/vsnp_statistics3.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics3.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+ Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
+13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 24 2 0.08 0.13% 0.001252 0
+Mcap_Deer_DE_SRR650221_fastq_gz 89 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36
diff -r 2d6c6b01319e -r d0fbdeaaa488 test-data/vsnp_statistics4.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vsnp_statistics4.tabular Wed Jun 16 17:38:47 2021 +0000
@@ -0,0 +1,3 @@
+ Reference File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count
+13-1941-6_S4_L001_R1_600000_fastq_gz 89 8.7 KB 100.0 65.7 1.00 25 46 4 0.16 0.16% 0.002146 0
+13-1941-6_S4_L001_R2_600000_fastq_gz 89 8.5 KB 100.0 66.3 1.00 25 46 4 0.16 0.16% 0.002146 0
diff -r 2d6c6b01319e -r d0fbdeaaa488 vsnp_statistics.py
--- a/vsnp_statistics.py Sun Jan 03 15:47:28 2021 +0000
+++ b/vsnp_statistics.py Wed Jun 16 17:38:47 2021 +0000
@@ -1,32 +1,14 @@
#!/usr/bin/env python
import argparse
+import csv
import gzip
import os
-import shutil
+from functools import partial
import numpy
import pandas
-
-QUALITYKEY = {'!': '0', '"': '1', '#': '2', '$': '3', '%': '4', '&': '5', "'": '6', '(': '7',
- ')': '8', '*': '9', '+': '10', ',': '11', '-': '12', '.': '13', '/': '14', '0': '15',
- '1': '16', '2': '17', '3': '18', '4': '19', '5': '20', '6': '21', '7': '22',
- '8': '23', '9': '24', ':': '25', ';': '26', '<': '27', '=': '28', '>': '29',
- '?': '30', '@': '31', 'A': '32', 'B': '33', 'C': '34', 'D': '35', 'E': '36',
- 'F': '37', 'G': '38', 'H': '39', 'I': '40', 'J': '41', 'K': '42', 'L': '43',
- 'M': '44', 'N': '45', 'O': '46', 'P': '47', 'Q': '48', 'R': '49', 'S': '50',
- 'T': '51', 'U': '52', 'V': '53', 'W': '54', 'X': '55', 'Y': '56', 'Z': '57',
- '_': '1', ']': '1', '[': '1', '\\': '1', '\n': '1', '`': '1', 'a': '1', 'b': '1',
- 'c': '1', 'd': '1', 'e': '1', 'f': '1', 'g': '1', 'h': '1', 'i': '1', 'j': '1',
- 'k': '1', 'l': '1', 'm': '1', 'n': '1', 'o': '1', 'p': '1', 'q': '1', 'r': '1',
- 's': '1', 't': '1', 'u': '1', 'v': '1', 'w': '1', 'x': '1', 'y': '1', 'z': '1',
- ' ': '1'}
-
-
-def fastq_to_df(fastq_file, gzipped):
- if gzipped:
- return pandas.read_csv(gzip.open(fastq_file, "r"), header=None, sep="^")
- return pandas.read_csv(open(fastq_file, "r"), header=None, sep="^")
+from Bio import SeqIO
def nice_size(size):
@@ -62,7 +44,20 @@
metrics_file = metrics_files[i]
file_name_base = os.path.basename(fastq_file)
# Read fastq_file into a data frame.
- fastq_df = fastq_to_df(fastq_file, gzipped)
+ _open = partial(gzip.open, mode='rt') if gzipped else open
+ with _open(fastq_file) as fh:
+ identifiers = []
+ seqs = []
+ letter_annotations = []
+ for seq_record in SeqIO.parse(fh, "fastq"):
+ identifiers.append(seq_record.id)
+ seqs.append(seq_record.seq)
+ letter_annotations.append(seq_record.letter_annotations["phred_quality"])
+ # Convert lists to Pandas series.
+ s1 = pandas.Series(identifiers, name='id')
+ s2 = pandas.Series(seqs, name='seq')
+ # Gather Series into a data frame.
+ fastq_df = pandas.DataFrame(dict(id=s1, seq=s2)).set_index(['id'])
total_reads = int(len(fastq_df.index) / 4)
current_sample_df = pandas.DataFrame(index=[file_name_base], columns=columns)
# Reference
@@ -76,19 +71,18 @@
fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
dict_mean = {}
list_length = []
- for index, row in fastq_df.iterrows():
- base_qualities = []
- for base in list(row.array[0]):
- base_qualities.append(int(QUALITYKEY[base]))
- dict_mean[index] = numpy.mean(base_qualities)
- list_length.append(len(row.array[0]))
- current_sample_df.at[file_name_base, 'Mean Read Length'] = "%.1f" % numpy.mean(list_length)
+ i = 0
+ for id, seq, in fastq_df.iterrows():
+ dict_mean[id] = numpy.mean(letter_annotations[i])
+ list_length.append(len(seq.array[0]))
+ i += 1
+ current_sample_df.at[file_name_base, 'Mean Read Length'] = '%.1f' % numpy.mean(list_length)
# Mean Read Quality
df_mean = pandas.DataFrame.from_dict(dict_mean, orient='index', columns=['ave'])
- current_sample_df.at[file_name_base, 'Mean Read Quality'] = "%.1f" % df_mean['ave'].mean()
+ current_sample_df.at[file_name_base, 'Mean Read Quality'] = '%.1f' % df_mean['ave'].mean()
# Reads Passing Q30
reads_gt_q30 = len(df_mean[df_mean['ave'] >= 30])
- reads_passing_q30 = "{:10.2f}".format(reads_gt_q30 / sampling_size)
+ reads_passing_q30 = '{:10.2f}'.format(reads_gt_q30 / sampling_size)
current_sample_df.at[file_name_base, 'Reads Passing Q30'] = reads_passing_q30
# Total Reads
current_sample_df.at[file_name_base, 'Total Reads'] = total_reads
@@ -99,7 +93,7 @@
current_sample_df.at[file_name_base, 'Unmapped Reads'] = unmapped_reads
# Unmapped Reads Percentage of Total
if unmapped_reads > 0:
- unmapped_reads_percentage = "{:10.2f}".format(unmapped_reads / total_reads)
+ unmapped_reads_percentage = '{:10.2f}'.format(unmapped_reads / total_reads)
else:
unmapped_reads_percentage = 0
current_sample_df.at[file_name_base, 'Unmapped Reads Percentage of Total'] = unmapped_reads_percentage
@@ -111,12 +105,8 @@
# Good SNP Count
current_sample_df.at[file_name_base, 'Good SNP Count'] = good_snp_count
data_frames.append(current_sample_df)
- excel_df = pandas.concat(data_frames)
- excel_file_name = "output.xlsx"
- writer = pandas.ExcelWriter(excel_file_name, engine='xlsxwriter')
- excel_df.to_excel(writer, sheet_name='Sheet1')
- writer.save()
- shutil.move(excel_file_name, output_file)
+ output_df = pandas.concat(data_frames)
+ output_df.to_csv(output_file, sep='\t', quoting=csv.QUOTE_NONE, escapechar='\\')
def process_idxstats_file(idxstats_file):
@@ -124,6 +114,7 @@
unmapped_reads = 0
with open(idxstats_file, "r") as fh:
for i, line in enumerate(fh):
+ line = line.rstrip('\r\n')
items = line.split("\t")
if i == 0:
# NC_002945.4 4349904 213570 4047
@@ -143,6 +134,7 @@
if i == 0:
# Skip comments.
continue
+ line = line.rstrip('\r\n')
items = line.split("\t")
if i == 1:
# MarkDuplicates 10.338671 98.74%
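The main change to vsnp_statistics.py above drops the hand-rolled QUALITYKEY lookup table and the raw pandas.read_csv parse in favour of Biopython's FASTQ parser, and writes a tabular file instead of an Excel workbook. A minimal standalone sketch of the SeqIO-based quality summary follows; the input file name is hypothetical, and the tool's subsampling of reads before the quality calculation is omitted here:

# Sketch of the Biopython-based read statistics introduced above; not the full tool.
import gzip
from functools import partial

import numpy
from Bio import SeqIO

fastq_file = "reads.fastq.gz"  # hypothetical input
_open = partial(gzip.open, mode="rt") if fastq_file.endswith(".gz") else open

read_lengths = []
mean_read_qualities = []
with _open(fastq_file) as fh:
    for record in SeqIO.parse(fh, "fastq"):
        read_lengths.append(len(record.seq))
        # Per-base Phred scores come straight from the parser, so no
        # hand-maintained ASCII-to-quality lookup table is needed.
        mean_read_qualities.append(numpy.mean(record.letter_annotations["phred_quality"]))

total_reads = len(read_lengths)
mean_read_length = numpy.mean(read_lengths)
mean_read_quality = numpy.mean(mean_read_qualities)
# Fraction of reads whose mean base quality is at least Q30.
reads_passing_q30 = sum(q >= 30 for q in mean_read_qualities) / total_reads
print(total_reads, "%.1f" % mean_read_length, "%.1f" % mean_read_quality, "%.2f" % reads_passing_q30)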
diff -r 2d6c6b01319e -r d0fbdeaaa488 vsnp_statistics.xml
--- a/vsnp_statistics.xml Sun Jan 03 15:47:28 2021 +0000
+++ b/vsnp_statistics.xml Wed Jun 16 17:38:47 2021 +0000
@@ -4,10 +4,10 @@
macros.xml
+ biopython
numpy
pandas
xlrd
- xlsxwriter
-
+
@@ -131,7 +131,7 @@
-
+
@@ -141,7 +141,7 @@
-
+
@@ -165,7 +165,7 @@
-
+
@@ -187,7 +187,7 @@
-
+