Mercurial > repos > greg > vsnp_statistics

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Thu Sep 16 00:56:07 2021 +0000
@@ -0,0 +1,13 @@
+name: vsnp_statistics
+owner: greg
+description: |
+  Contains a tool that produces an Excel spreadsheet containing statistics for samples and associated metrics files.
+homepage_url: https://github.com/USDA-VS/vSNP
+long_description: |
+  Contains a tool that accepts a single fastqsanger sample, a set of paired read samples, or a collection of samples
+  along with associated SAMtools idxstats and vSNP zero coverage metrics files and extracts information from them
+  to produce an Excel spreadsheet containing statistics for each sample.
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_statistics
+type: unrestricted
+categories:
+  - Sequence Analysis
--- a/test-data/vsnp_statistics1.tabular	Fri Aug 27 20:04:26 2021 +0000
+++ b/test-data/vsnp_statistics1.tabular	Thu Sep 16 00:56:07 2021 +0000
@@ -1,2 +1,2 @@
 FASTQ	File Size	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count	Reference
-Mcap_Deer_DE_SRR650221_fastq_gz	1.6 MB	121.0	29.7	      0.53	4317	17063	223	      0.05	8.27%	0.439436	36	89
+Mcap_Deer_DE_SRR650221_fastq_gz	1.6 MB
--- a/test-data/vsnp_statistics2.tabular	Fri Aug 27 20:04:26 2021 +0000
+++ b/test-data/vsnp_statistics2.tabular	Thu Sep 16 00:56:07 2021 +0000
@@ -1,2 +1,2 @@
 Read1 FASTQ	File Size	Reads	Mean Read Length	Mean Read Quality	Reads Passing Q30	Read2 FASTQ	File Size	Reads	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count	Reference
-13-1941-6_S4_L001_R1_600000_fastq_gz	8.7 KB	25	100.0	65.7	      1.00	13-1941-6_S4_L001_R2_600000_fastq_gz	8.5 KB	25	100.0	66.3	      1.00	50	45	5	      0.10	98.74%	10.338671	611	89
+13-1941-6_S4_L001_R1_600000_fastq_gz	8.7 KB	100
--- a/test-data/vsnp_statistics4.tabular	Fri Aug 27 20:04:26 2021 +0000
+++ b/test-data/vsnp_statistics4.tabular	Thu Sep 16 00:56:07 2021 +0000
@@ -1,2 +1,2 @@
 Read1 FASTQ	File Size	Reads	Mean Read Length	Mean Read Quality	Reads Passing Q30	Read2 FASTQ	File Size	Reads	Mean Read Length	Mean Read Quality	Reads Passing Q30	Total Reads	All Mapped Reads	Unmapped Reads	Unmapped Reads Percentage of Total	Reference with Coverage	Average Depth of Coverage	Good SNP Count	Reference
-Unnamed Collection_R1	8.7 KB	25	100.0	65.7	      1.00	Unnamed Collection_R2	8.5 KB	25	100.0	66.3	      1.00	50	46	4	      0.08	0.16%	0.002146	0	89
+Unnamed Collection_R1	8.7 KB	100
--- a/vsnp_statistics.py	Fri Aug 27 20:04:26 2021 +0000
+++ b/vsnp_statistics.py	Thu Sep 16 00:56:07 2021 +0000
@@ -64,11 +64,14 @@
     # Starting at row 3, keep every 4 row
     # random sample specified number of rows.
     file_size = nice_size(os.path.getsize(fastq_file))
-    total_reads = int(len(fastq_df.index) / 4)
+    total_reads = len(seqs)
     # Mean Read Length
     if sampling_size > total_reads:
         sampling_size = total_reads
-    fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
+    try:
+        fastq_df = fastq_df.iloc[3::4].sample(sampling_size)
+    except ValueError:
+        fastq_df = fastq_df.iloc[3::4].sample(sampling_size, replace=True)
     dict_mean = {}
     list_length = []
     i = 0
--- a/vsnp_statistics.xml	Fri Aug 27 20:04:26 2021 +0000
+++ b/vsnp_statistics.xml	Thu Sep 16 00:56:07 2021 +0000
@@ -76,7 +76,7 @@
             <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
             <param name="samtools_idxstats" value="samtools_idxstats1.tabular" ftype="tabular" dbkey="89"/>
             <param name="vsnp_azc_metrics" value="add_zc_metrics1.tabular" ftype="tabular" dbkey="89"/>
-            <output name="output" file="vsnp_statistics1.tabular" ftype="tabular"/>
+            <output name="output" file="vsnp_statistics1.tabular" ftype="tabular" compare="contains"/>
         </test>
         <!-- A set of paired fastq files -->
         <test expect_num_outputs="1">
@@ -85,7 +85,7 @@
             <param name="read2" value="13-1941-6_S4_L001_R2_600000.fastq.gz" ftype="fastqsanger.gz" dbkey="89"/>
             <param name="samtools_idxstats" value="samtools_idxstats2.tabular" ftype="tabular" dbkey="89"/>
             <param name="vsnp_azc_metrics" value="add_zc_metrics2.tabular" ftype="tabular" dbkey="89"/>
-            <output name="output" file="vsnp_statistics2.tabular" ftype="tabular"/>
+            <output name="output" file="vsnp_statistics2.tabular" ftype="tabular" compare="contains"/>
         </test>
         <!-- A collection of paired fastq files -->
         <test expect_num_outputs="1">
@@ -98,7 +98,7 @@
             </param>
             <param name="samtools_idxstats" value="samtools_idxstats5.tabular" ftype="tabular" dbkey="89"/>
             <param name="vsnp_azc_metrics" value="add_zc_metrics5.tabular" ftype="tabular" dbkey="89"/>
-            <output name="output" file="vsnp_statistics4.tabular" ftype="tabular"/>
+            <output name="output" file="vsnp_statistics4.tabular" ftype="tabular" compare="contains"/>
         </test>
     </tests>
     <help>