# HG changeset patch # User greg # Date 1631753767 0 # Node ID b908bb18008ad8cfdd041cd703022099572bf82e # Parent 39ab5405b509401bb420bd1019df8492c692fb9b Uploaded diff -r 39ab5405b509 -r b908bb18008a .shed.yml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.shed.yml Thu Sep 16 00:56:07 2021 +0000 @@ -0,0 +1,13 @@ +name: vsnp_statistics +owner: greg +description: | + Contains a tool that produces an Excel spreadsheet containing statistics for samples and associated metrics files. +homepage_url: https://github.com/USDA-VS/vSNP +long_description: | + Contains a tool that accepts a single fastqsanger sample, a set of paired read samples, or a collection of samples + along with associated SAMtools idxstats and vSNP zero coverage metrics files and extracts information from them + to produce an Excel spreadsheet containing statistics for each sample. +remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/sequence_analysis/vsnp/vsnp_statistics +type: unrestricted +categories: + - Sequence Analysis diff -r 39ab5405b509 -r b908bb18008a test-data/vsnp_statistics1.tabular --- a/test-data/vsnp_statistics1.tabular Fri Aug 27 20:04:26 2021 +0000 +++ b/test-data/vsnp_statistics1.tabular Thu Sep 16 00:56:07 2021 +0000 @@ -1,2 +1,2 @@ FASTQ File Size Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count Reference -Mcap_Deer_DE_SRR650221_fastq_gz 1.6 MB 121.0 29.7 0.53 4317 17063 223 0.05 8.27% 0.439436 36 89 +Mcap_Deer_DE_SRR650221_fastq_gz 1.6 MB diff -r 39ab5405b509 -r b908bb18008a test-data/vsnp_statistics2.tabular --- a/test-data/vsnp_statistics2.tabular Fri Aug 27 20:04:26 2021 +0000 +++ b/test-data/vsnp_statistics2.tabular Thu Sep 16 00:56:07 2021 +0000 @@ -1,2 +1,2 @@ Read1 FASTQ File Size Reads Mean Read Length Mean Read Quality Reads Passing Q30 Read2 FASTQ File Size Reads Mean Read Length Mean Read 
Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count Reference -13-1941-6_S4_L001_R1_600000_fastq_gz 8.7 KB 25 100.0 65.7 1.00 13-1941-6_S4_L001_R2_600000_fastq_gz 8.5 KB 25 100.0 66.3 1.00 50 45 5 0.10 98.74% 10.338671 611 89 +13-1941-6_S4_L001_R1_600000_fastq_gz 8.7 KB 100 diff -r 39ab5405b509 -r b908bb18008a test-data/vsnp_statistics4.tabular --- a/test-data/vsnp_statistics4.tabular Fri Aug 27 20:04:26 2021 +0000 +++ b/test-data/vsnp_statistics4.tabular Thu Sep 16 00:56:07 2021 +0000 @@ -1,2 +1,2 @@ Read1 FASTQ File Size Reads Mean Read Length Mean Read Quality Reads Passing Q30 Read2 FASTQ File Size Reads Mean Read Length Mean Read Quality Reads Passing Q30 Total Reads All Mapped Reads Unmapped Reads Unmapped Reads Percentage of Total Reference with Coverage Average Depth of Coverage Good SNP Count Reference -Unnamed Collection_R1 8.7 KB 25 100.0 65.7 1.00 Unnamed Collection_R2 8.5 KB 25 100.0 66.3 1.00 50 46 4 0.08 0.16% 0.002146 0 89 +Unnamed Collection_R1 8.7 KB 100 diff -r 39ab5405b509 -r b908bb18008a vsnp_statistics.py --- a/vsnp_statistics.py Fri Aug 27 20:04:26 2021 +0000 +++ b/vsnp_statistics.py Thu Sep 16 00:56:07 2021 +0000 @@ -64,11 +64,14 @@ # Starting at row 3, keep every 4 row # random sample specified number of rows. 
file_size = nice_size(os.path.getsize(fastq_file)) - total_reads = int(len(fastq_df.index) / 4) + total_reads = len(seqs) # Mean Read Length if sampling_size > total_reads: sampling_size = total_reads - fastq_df = fastq_df.iloc[3::4].sample(sampling_size) + try: + fastq_df = fastq_df.iloc[3::4].sample(sampling_size) + except ValueError: + fastq_df = fastq_df.iloc[3::4].sample(sampling_size, replace=True) dict_mean = {} list_length = [] i = 0 diff -r 39ab5405b509 -r b908bb18008a vsnp_statistics.xml --- a/vsnp_statistics.xml Fri Aug 27 20:04:26 2021 +0000 +++ b/vsnp_statistics.xml Thu Sep 16 00:56:07 2021 +0000 @@ -76,7 +76,7 @@ - + @@ -85,7 +85,7 @@ - + @@ -98,7 +98,7 @@ - +