# HG changeset patch
# User greg
# Date 1631753767 0
# Node ID b908bb18008ad8cfdd041cd703022099572bf82e
# Parent  39ab5405b509401bb420bd1019df8492c692fb9b
Uploaded

diff -r 39ab5405b509 -r b908bb18008a .shed.yml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Thu Sep 16 00:56:07 2021 +0000
@@ -0,0 +1,13 @@
+name: vsnp_statistics
+owner: greg
+description: |
+  Contains a tool that produces an Excel spreadsheet containing statistics for samples and associated metrics files.
+homepage_url: https://github.com/USDA-VS/vSNP
+long_description: |
+  Contains a tool that accepts a single fastqsanger sample, a set of paired read samples, or a collection of samples
+  along with associated SAMtools idxstats and vSNP zero coverage metrics files and extracts information from them
+  to produce an Excel spreadsheet containing statistics for each sample.
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_statistics
+type: unrestricted
+categories:
+  - Sequence Analysis
diff -r 39ab5405b509 -r b908bb18008a test-data/vsnp_statistics1.tabular
--- a/test-data/vsnp_statistics1.tabular	Fri Aug 27 20:04:26 2021 +0000
+++ b/test-data/vsnp_statistics1.tabular	Thu Sep 16 00:56:07 2021 +0000
@@ -1,2 +1,2 @@
 FASTQ	File Size	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count	Reference
-Mcap_Deer_DE_SRR650221_fastq_gz	1.6 MB	121.0	29.7	      0.53	4317	17063	223	      0.05	8.27%	0.439436	36	89
+Mcap_Deer_DE_SRR650221_fastq_gz	1.6 MB
diff -r 39ab5405b509 -r b908bb18008a test-data/vsnp_statistics2.tabular
--- a/test-data/vsnp_statistics2.tabular	Fri Aug 27 20:04:26 2021 +0000
+++ b/test-data/vsnp_statistics2.tabular	Thu Sep 16 00:56:07 2021 +0000
@@ -1,2 +1,2 @@
 Read1 FASTQ	File Size	Reads	Mean Read Length	Mean Read Quality	Reads Passing Q30	Read2 FASTQ	File Size	Reads	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count	Reference
-13-1941-6_S4_L001_R1_600000_fastq_gz	8.7 KB	25	100.0	65.7	      1.00	13-1941-6_S4_L001_R2_600000_fastq_gz	8.5 KB	25	100.0	66.3	      1.00	50	45	5	      0.10	98.74%	10.338671	611	89
+13-1941-6_S4_L001_R1_600000_fastq_gz	8.7 KB	100
diff -r 39ab5405b509 -r b908bb18008a test-data/vsnp_statistics4.tabular
--- a/test-data/vsnp_statistics4.tabular	Fri Aug 27 20:04:26 2021 +0000
+++ b/test-data/vsnp_statistics4.tabular	Thu Sep 16 00:56:07 2021 +0000
@@ -1,2 +1,2 @@
 Read1 FASTQ	File Size	Reads	Mean Read Length	Mean Read Quality	Reads Passing Q30	Read2 FASTQ	File Size	Reads	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count	Reference
-Unnamed Collection_R1	8.7 KB	25	100.0	65.7	      1.00	Unnamed Collection_R2	8.5 KB	25	100.0	66.3	      1.00	50	46	4	      0.08	0.16%	0.002146	0	89
+Unnamed Collection_R1	8.7 KB	100
diff -r 39ab5405b509 -r b908bb18008a vsnp_statistics.py
--- a/vsnp_statistics.py	Fri Aug 27 20:04:26 2021 +0000
+++ b/vsnp_statistics.py	Thu Sep 16 00:56:07 2021 +0000
@@ -64,11 +64,14 @@
     # Starting at row 3, keep every 4 row
     # random sample specified number of rows.
     file_size = nice_size(os.path.getsize(fastq_file))
-    total_reads = int(len(fastq_df.index) / 4)
+    total_reads = len(seqs)
     # Mean Read Length
     if sampling_size > total_reads:
         sampling_size = total_reads
-    fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
+    try:
+        fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
+    except ValueError:
+        fastq_df = fastq_df.iloc[3::4].sample(sampling_size, replace=True)
     dict_mean = {}
     list_length = []
     i = 0
diff -r 39ab5405b509 -r b908bb18008a vsnp_statistics.xml
--- a/vsnp_statistics.xml	Fri Aug 27 20:04:26 2021 +0000
+++ b/vsnp_statistics.xml	Thu Sep 16 00:56:07 2021 +0000
@@ -76,7 +76,7 @@
             <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
             <param name="samtools_idxstats" value="samtools_idxstats1.tabular" ftype="tabular" dbkey="89"/>
             <param name="vsnp_azc_metrics" value="add_zc_metrics1.tabular" ftype="tabular" dbkey="89"/>
-            <output name="output" file="vsnp_statistics1.tabular" ftype="tabular"/>
+            <output name="output" file="vsnp_statistics1.tabular" ftype="tabular" compare="contains"/>
         </test>
         <!-- A set of paired fastq files -->
         <test expect_num_outputs="1">
@@ -85,7 +85,7 @@
             <param name="read2" value="13-1941-6_S4_L001_R2_600000.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
             <param name="samtools_idxstats" value="samtools_idxstats2.tabular" ftype="tabular" dbkey="89"/>
             <param name="vsnp_azc_metrics" value="add_zc_metrics2.tabular" ftype="tabular" dbkey="89"/>
-            <output name="output" file="vsnp_statistics2.tabular" ftype="tabular"/>
+            <output name="output" file="vsnp_statistics2.tabular" ftype="tabular" compare="contains"/>
         </test>
         <!-- A collection of paired fastq files -->
         <test expect_num_outputs="1">
@@ -98,7 +98,7 @@
             </param>
             <param name="samtools_idxstats" value="samtools_idxstats5.tabular" ftype="tabular" dbkey="89"/>
             <param name="vsnp_azc_metrics" value="add_zc_metrics5.tabular" ftype="tabular" dbkey="89"/>
-            <output name="output" file="vsnp_statistics4.tabular" ftype="tabular"/>
+            <output name="output" file="vsnp_statistics4.tabular" ftype="tabular" compare="contains"/>
         </test>
     </tests>
     <help>