Mercurial > repos > iuc > snpfreqplot

--- a/heatmap_for_variants.R	Thu Dec 10 13:41:29 2020 +0000
+++ b/heatmap_for_variants.R	Fri Dec 18 23:48:01 2020 +0000
@@ -116,7 +116,7 @@
                                         # colormanagement
 my_colors <- colorRampPalette(c("grey93", "brown", "black")) #heatmap
 count <- length(unique(ann_final$gene))                     #annotations (genes)
-gene_color <- c(brewer.pal(brewer_color_gene_annotation, n = count))
+gene_color <- rep(c(brewer.pal(brewer_color_gene_annotation, n = count)), length.out = count)
 names(gene_color) <- unique(ann_final$gene)

                                         # colormanagement annotations (effect)
@@ -161,7 +161,7 @@
     pos_ref_alt <- cols[1:3]
     rest <- ""
     if (length(cols) > 3) {
-        rest <- paste0(" :: ", paste(cols[4:length(cols)], sep = " "))
+        rest <- paste0(" :: ", paste0(cols[4:length(cols)], collapse = " "))
     }
     ## Trim the REF or ALT if too long
     if (str_length(pos_ref_alt[2]) > 3) {
@@ -175,7 +175,7 @@
                        pos_ref_alt[2], " > ",
                        pos_ref_alt[3])
     ## Join rest
-    new_name <- paste0(new_name, " ", paste(rest))
+    new_name <- paste0(new_name, " ", rest)
 }

 colnames(final) <- sapply(colnames(final), fix_label)
--- a/helperFunctions.R	Thu Dec 10 13:41:29 2020 +0000
+++ b/helperFunctions.R	Fri Dec 18 23:48:01 2020 +0000
@@ -54,22 +54,28 @@
     group_ind <- tab %>% group_by(POS, REF, ALT) %>% select(POS, REF, ALT) # nolint
     nlines <- nrow(tab)
     groups <- list()
-    groups[[1]] <- c(1, 1)
-    last_pa <- paste(group_ind[1, ])
-    for (r in 2:nlines) {
-        curr_pa <- paste(group_ind[r, ])
-        group_ind_diff_between_lines <- !all(last_pa == curr_pa)
-        if (group_ind_diff_between_lines) {
-            ## end of current group, start of new
-            groups[[length(groups)]][2] <- r - 1     ## change prev end
-            groups[[length(groups) + 1]] <- c(r, r)  ## set (start, end)
-        } else if (r == nlines) {
-            ## i.e. if the very last line shares
-            ## the same POS REF ALT as the one before,
-            ## close current group.
-            groups[[length(groups)]][2] <- r
+    if (nlines) {
+        groups[[1]] <- c(1, 1)
+    } else {
+        groups[[1]] <- c(0, 0)
+    }
+    if (nlines >= 2) {
+        last_pa <- paste(group_ind[1, ])
+        for (r in 2:nlines) {
+            curr_pa <- paste(group_ind[r, ])
+            group_ind_diff_between_lines <- !all(last_pa == curr_pa)
+            if (group_ind_diff_between_lines) {
+                ## end of current group, start of new
+                groups[[length(groups)]][2] <- r - 1     ## change prev end
+                groups[[length(groups) + 1]] <- c(r, r)  ## set (start, end)
+            } else if (r == nlines) {
+                ## i.e. if the very last line shares
+                ## the same POS REF ALT as the one before,
+                ## close current group.
+                groups[[length(groups)]][2] <- r
+            }
+            last_pa <- curr_pa
         }
-        last_pa <- curr_pa
     }
     as_tibble(do.call(
         "rbind",
@@ -82,7 +88,8 @@

 read_and_process <- function(id) {
     file <- (samples %>% filter(ids == id))$files    # nolint
-    variants <- read.table(file, header = T, sep = "\t")
+    variants <- read.table(file, header = T, sep = "\t", colClasses = "character")
+    variants["AF"]  <- lapply(variants["AF"], as.numeric)
     uniq_ids <- split_table_and_process(variants)
     if (nrow(variants) != nrow(uniq_ids)) {
         stop(paste0(id, " '", file, "' failed: ", file, "\"",
--- a/snpEffExtract.R	Thu Dec 10 13:41:29 2020 +0000
+++ b/snpEffExtract.R	Fri Dec 18 23:48:01 2020 +0000
@@ -5,6 +5,12 @@

 tsv_eff_from_vcf <- function(input_vcf, output_tab) {
     read_vcf <- readVcf(input_vcf)  # nolint
+    if (!nrow(read_vcf@fixed)) {
+        # no variants in file -> just write a valid header line
+        write(c("CHROM", "POS", "REF", "ALT", "AF", "EFF[*].GENE", "EFF[*].EFFECT"),
+              ncolumns = 7, file = output_tab, sep = "\t")
+        return()
+    }
     chrom_pos <- data.frame(read_vcf@rowRanges)[, c("seqnames", "start")]
     ref_alt_filter <- read_vcf@fixed[, c("REF", "ALT", "FILTER")]
     dp_af <- read_vcf@info[c("DP", "AF")]
--- a/snpfreqplot.xml	Thu Dec 10 13:41:29 2020 +0000
+++ b/snpfreqplot.xml	Fri Dec 18 23:48:01 2020 +0000
@@ -3,7 +3,7 @@
     <description>Generates a heatmap of allele frequencies grouped by variant type for SnpEff-annotated SARS-CoV-2 data</description>
     <macros>
         <token name="@VERSION@">1.0</token>
-        <token name="@GALAXY_VERSION@">1</token>
+        <token name="@GALAXY_VERSION@">2</token>
     </macros>
     <requirements>
         <requirement type="package" version="4.0">r-base</requirement>
@@ -179,6 +179,14 @@
             <output name="outfile" ftype="pdf" value="heatmap.default.pdf" compare="sim_size" delta="250" />
         </test>
         <test expect_num_outputs="1">
+            <!-- PDF, tabular inputs, short color palette -->
+            <param name="sinputs" ftype="tabular" value="input436.tabular,input437.tabular,input438.tabular,input439.tabular,input440.tabular,input441.tabular,input442.tabular,input443.tabular,input444.tabular" />
+            <section name="advanced" >
+                <param name="color" value="Set2" />
+            </section>
+            <output name="outfile" ftype="pdf" value="heatmap.default.pdf" compare="sim_size" delta="250" />
+        </test>
+        <test expect_num_outputs="1">
             <!-- PNG, multiple inputs, non-numeric IDS -->
             <param name="sinputs" ftype="tabular" value="input436.tabular,input437.tabular,input443.tabular,input444.tabular" />
             <param name="varfreq" value="0.5" />
@@ -242,15 +250,16 @@
             </output>
         </test>
         <test expect_num_outputs="1">
-            <!-- SVG, Vcf test with problematic splice+syn at snpeff789.vcf for threshold = 0.0222 -->
-            <param name="sinputs" ftype="vcf" value="snpeff.123.vcf,snpeff.456.vcf,snpeff.789.vcf" />
+            <!-- SVG, Vcf test with problematic splice+syn at snpeff789.vcf for threshold = 0.0222
+            and an "empty" file (no variants) and a file with just one variant among its inputs -->
+            <param name="sinputs" ftype="vcf" value="snpeff.123.vcf,snpeff.456.vcf,snpeff.789.vcf,no_variants.vcf,single_variant.vcf" />
             <param name="varfreq" value="0.0222" />
             <section name="advanced" >
                 <param name="output_type" value="svg" />
             </section>
             <output name="outfile" ftype="svg">
                 <assert_contents>
-                    <has_text text="viewBox=&quot;0 0 3101 879&quot;" />
+                    <has_text text="viewBox=&quot;0 0 1962 546&quot;" />
                 </assert_contents>
             </output>
         </test>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/no_variants.vcf	Fri Dec 18 23:48:01 2020 +0000
@@ -0,0 +1,24 @@
+##fileformat=VCFv4.0
+##fileDate=20200707
+##source=lofreq call --verbose --ref reference.fa --call-indels --min-cov 5 --max-depth 1000000 --min-bq 30 --min-alt-bq 30 --min-mq 20 --max-mq 255 --min-jq 0 --min-alt-jq 0 --def-alt-jq 0 --sig 0.0005 --bonf dynamic --no-default-filter --no-default-filter -r NC_045512.2:1-14951 -o /data/dnb02/galaxy_db/job_working_directory/009/430/9430166/working/pp-tmp/lofreq2_call_parallelpfou9vt9/0.vcf.gz reads.bam
+##reference=reference.fa
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth">
+##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">
+##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position">
+##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
+##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
+##INFO=<ID=CONSVAR,Number=0,Type=Flag,Description="Indicates that the variant is a consensus variant (as opposed to a low frequency variant).">
+##INFO=<ID=HRUN,Number=1,Type=Integer,Description="Homopolymer length to the right of report indel position">
+##FILTER=<ID=min_snvqual_66,Description="Minimum SNV Quality (Phred) 66">
+##FILTER=<ID=min_indelqual_52,Description="Minimum Indel Quality (Phred) 52">
+##FILTER=<ID=min_snvqual_70,Description="Minimum SNV Quality (Phred) 70">
+##FILTER=<ID=min_indelqual_57,Description="Minimum Indel Quality (Phred) 57">
+##FILTER=<ID=min_af_0.050000,Description="Minimum allele frequency 0.050000">
+##FILTER=<ID=min_dp_5,Description="Minimum Coverage 5">
+##FILTER=<ID=sb_fdr,Description="Strand-Bias Multiple Testing Correction: fdr corr. pvalue > 0.001000">
+##SnpEffVersion="4.5covid19 (build 2020-04-15 22:26), by Pablo Cingolani"
+##SnpEffCmd="SnpEff  -i vcf -o vcf -formatEff -classic -no-downstream -no-intergenic -no-upstream -no-utr -stats /data/dnb02/galaxy_db/job_working_directory/009/430/9430172/galaxy_dataset_24243789.dat NC_045512.2 /data/dnb02/galaxy_db/files/022/094/dataset_22094491.dat "
+##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank  | Genotype [ | ERRORS | WARNINGS ] )' ">
+##INFO=<ID=LOF,Number=.,Type=String,Description="Predicted loss of function effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'">
+##INFO=<ID=NMD,Number=.,Type=String,Description="Predicted nonsense mediated decay effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/single_variant.vcf	Fri Dec 18 23:48:01 2020 +0000
@@ -0,0 +1,26 @@
+##fileformat=VCFv4.0
+##fileDate=20200707
+##source=lofreq call --verbose --ref reference.fa --call-indels --min-cov 5 --max-depth 1000000 --min-bq 30 --min-alt-bq 30 --min-mq 20 --max-mq 255 --min-jq 0 --min-alt-jq 0 --def-alt-jq 0 --sig 0.0005 --bonf dynamic --no-default-filter --no-default-filter -r NC_045512.2:1-14951 -o /data/dnb02/galaxy_db/job_working_directory/009/430/9430166/working/pp-tmp/lofreq2_call_parallelpfou9vt9/0.vcf.gz reads.bam
+##reference=reference.fa
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth">
+##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">
+##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position">
+##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
+##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
+##INFO=<ID=CONSVAR,Number=0,Type=Flag,Description="Indicates that the variant is a consensus variant (as opposed to a low frequency variant).">
+##INFO=<ID=HRUN,Number=1,Type=Integer,Description="Homopolymer length to the right of report indel position">
+##FILTER=<ID=min_snvqual_66,Description="Minimum SNV Quality (Phred) 66">
+##FILTER=<ID=min_indelqual_52,Description="Minimum Indel Quality (Phred) 52">
+##FILTER=<ID=min_snvqual_70,Description="Minimum SNV Quality (Phred) 70">
+##FILTER=<ID=min_indelqual_57,Description="Minimum Indel Quality (Phred) 57">
+##FILTER=<ID=min_af_0.050000,Description="Minimum allele frequency 0.050000">
+##FILTER=<ID=min_dp_5,Description="Minimum Coverage 5">
+##FILTER=<ID=sb_fdr,Description="Strand-Bias Multiple Testing Correction: fdr corr. pvalue > 0.001000">
+##SnpEffVersion="4.5covid19 (build 2020-04-15 22:26), by Pablo Cingolani"
+##SnpEffCmd="SnpEff  -i vcf -o vcf -formatEff -classic -no-downstream -no-intergenic -no-upstream -no-utr -stats /data/dnb02/galaxy_db/job_working_directory/009/430/9430172/galaxy_dataset_24243789.dat NC_045512.2 /data/dnb02/galaxy_db/files/022/094/dataset_22094491.dat "
+##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank  | Genotype [ | ERRORS | WARNINGS ] )' ">
+##INFO=<ID=LOF,Number=.,Type=String,Description="Predicted loss of function effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'">
+##INFO=<ID=NMD,Number=.,Type=String,Description="Predicted nonsense mediated decay effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+NC_045512.2	241	.	C	T	13172.0	PASS	DP=363;AF=0.980716;SB=0;DP4=1,1,167,193
+