Mercurial > repos > iuc > snpfreqplot
changeset 2:dc51db22310c draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit d1c54d077cfc0eeb9699719760e668948cb9bbbc"
author | iuc |
---|---|
date | Fri, 18 Dec 2020 23:48:01 +0000 |
parents | e362b3143cde |
children | 3d0adeee3f2b |
files | heatmap_for_variants.R helperFunctions.R snpEffExtract.R snpfreqplot.xml test-data/no_variants.vcf test-data/single_variant.vcf |
diffstat | 6 files changed, 95 insertions(+), 23 deletions(-) [+] |
line wrap: on
line diff
--- a/heatmap_for_variants.R Thu Dec 10 13:41:29 2020 +0000 +++ b/heatmap_for_variants.R Fri Dec 18 23:48:01 2020 +0000 @@ -116,7 +116,7 @@ # colormanagement my_colors <- colorRampPalette(c("grey93", "brown", "black")) #heatmap count <- length(unique(ann_final$gene)) #annotations (genes) -gene_color <- c(brewer.pal(brewer_color_gene_annotation, n = count)) +gene_color <- rep(c(brewer.pal(brewer_color_gene_annotation, n = count)), length.out = count) names(gene_color) <- unique(ann_final$gene) # colormanagement annotations (effect) @@ -161,7 +161,7 @@ pos_ref_alt <- cols[1:3] rest <- "" if (length(cols) > 3) { - rest <- paste0(" :: ", paste(cols[4:length(cols)], sep = " ")) + rest <- paste0(" :: ", paste0(cols[4:length(cols)], collapse = " ")) } ## Trim the REF or ALT if too long if (str_length(pos_ref_alt[2]) > 3) { @@ -175,7 +175,7 @@ pos_ref_alt[2], " > ", pos_ref_alt[3]) ## Join rest - new_name <- paste0(new_name, " ", paste(rest)) + new_name <- paste0(new_name, " ", rest) } colnames(final) <- sapply(colnames(final), fix_label)
--- a/helperFunctions.R Thu Dec 10 13:41:29 2020 +0000 +++ b/helperFunctions.R Fri Dec 18 23:48:01 2020 +0000 @@ -54,22 +54,28 @@ group_ind <- tab %>% group_by(POS, REF, ALT) %>% select(POS, REF, ALT) # nolint nlines <- nrow(tab) groups <- list() - groups[[1]] <- c(1, 1) - last_pa <- paste(group_ind[1, ]) - for (r in 2:nlines) { - curr_pa <- paste(group_ind[r, ]) - group_ind_diff_between_lines <- !all(last_pa == curr_pa) - if (group_ind_diff_between_lines) { - ## end of current group, start of new - groups[[length(groups)]][2] <- r - 1 ## change prev end - groups[[length(groups) + 1]] <- c(r, r) ## set (start, end) - } else if (r == nlines) { - ## i.e. if the very last line shares - ## the same POS REF ALT as the one before, - ## close current group. - groups[[length(groups)]][2] <- r + if (nlines) { + groups[[1]] <- c(1, 1) + } else { + groups[[1]] <- c(0, 0) + } + if (nlines >= 2) { + last_pa <- paste(group_ind[1, ]) + for (r in 2:nlines) { + curr_pa <- paste(group_ind[r, ]) + group_ind_diff_between_lines <- !all(last_pa == curr_pa) + if (group_ind_diff_between_lines) { + ## end of current group, start of new + groups[[length(groups)]][2] <- r - 1 ## change prev end + groups[[length(groups) + 1]] <- c(r, r) ## set (start, end) + } else if (r == nlines) { + ## i.e. if the very last line shares + ## the same POS REF ALT as the one before, + ## close current group. + groups[[length(groups)]][2] <- r + } + last_pa <- curr_pa } - last_pa <- curr_pa } as_tibble(do.call( "rbind", @@ -82,7 +88,8 @@ read_and_process <- function(id) { file <- (samples %>% filter(ids == id))$files # nolint - variants <- read.table(file, header = T, sep = "\t") + variants <- read.table(file, header = T, sep = "\t", colClasses = "character") + variants["AF"] <- lapply(variants["AF"], as.numeric) uniq_ids <- split_table_and_process(variants) if (nrow(variants) != nrow(uniq_ids)) { stop(paste0(id, " '", file, "' failed: ", file, "\"",
--- a/snpEffExtract.R Thu Dec 10 13:41:29 2020 +0000 +++ b/snpEffExtract.R Fri Dec 18 23:48:01 2020 +0000 @@ -5,6 +5,12 @@ tsv_eff_from_vcf <- function(input_vcf, output_tab) { read_vcf <- readVcf(input_vcf) # nolint + if (!nrow(read_vcf@fixed)) { + # no variants in file -> just write a valid header line + write(c("CHROM", "POS", "REF", "ALT", "AF", "EFF[*].GENE", "EFF[*].EFFECT"), + ncolumns = 7, file = output_tab, sep = "\t") + return() + } chrom_pos <- data.frame(read_vcf@rowRanges)[, c("seqnames", "start")] ref_alt_filter <- read_vcf@fixed[, c("REF", "ALT", "FILTER")] dp_af <- read_vcf@info[c("DP", "AF")]
--- a/snpfreqplot.xml Thu Dec 10 13:41:29 2020 +0000 +++ b/snpfreqplot.xml Fri Dec 18 23:48:01 2020 +0000 @@ -3,7 +3,7 @@ <description>Generates a heatmap of allele frequencies grouped by variant type for SnpEff-annotated SARS-CoV-2 data</description> <macros> <token name="@VERSION@">1.0</token> - <token name="@GALAXY_VERSION@">1</token> + <token name="@GALAXY_VERSION@">2</token> </macros> <requirements> <requirement type="package" version="4.0">r-base</requirement> @@ -179,6 +179,14 @@ <output name="outfile" ftype="pdf" value="heatmap.default.pdf" compare="sim_size" delta="250" /> </test> <test expect_num_outputs="1"> + <!-- PDF, tabular inputs, short color palette --> + <param name="sinputs" ftype="tabular" value="input436.tabular,input437.tabular,input438.tabular,input439.tabular,input440.tabular,input441.tabular,input442.tabular,input443.tabular,input444.tabular" /> + <section name="advanced" > + <param name="color" value="Set2" /> + </section> + <output name="outfile" ftype="pdf" value="heatmap.default.pdf" compare="sim_size" delta="250" /> + </test> + <test expect_num_outputs="1"> <!-- PNG, multiple inputs, non-numeric IDS --> <param name="sinputs" ftype="tabular" value="input436.tabular,input437.tabular,input443.tabular,input444.tabular" /> <param name="varfreq" value="0.5" /> @@ -242,15 +250,16 @@ </output> </test> <test expect_num_outputs="1"> - <!-- SVG, Vcf test with problematic splice+syn at snpeff789.vcf for threshold = 0.0222 --> - <param name="sinputs" ftype="vcf" value="snpeff.123.vcf,snpeff.456.vcf,snpeff.789.vcf" /> + <!-- SVG, Vcf test with problematic splice+syn at snpeff789.vcf for threshold = 0.0222 + and an "empty" file (no variants) and a file with just one variant among its inputs --> + <param name="sinputs" ftype="vcf" value="snpeff.123.vcf,snpeff.456.vcf,snpeff.789.vcf,no_variants.vcf,single_variant.vcf" /> <param name="varfreq" value="0.0222" /> <section name="advanced" > <param name="output_type" value="svg" /> </section> <output name="outfile" ftype="svg"> <assert_contents> - <has_text text="viewBox="0 0 3101 879"" /> + <has_text text="viewBox="0 0 1962 546"" /> </assert_contents> </output> </test>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/no_variants.vcf Fri Dec 18 23:48:01 2020 +0000 @@ -0,0 +1,24 @@ +##fileformat=VCFv4.0 +##fileDate=20200707 +##source=lofreq call --verbose --ref reference.fa --call-indels --min-cov 5 --max-depth 1000000 --min-bq 30 --min-alt-bq 30 --min-mq 20 --max-mq 255 --min-jq 0 --min-alt-jq 0 --def-alt-jq 0 --sig 0.0005 --bonf dynamic --no-default-filter --no-default-filter -r NC_045512.2:1-14951 -o /data/dnb02/galaxy_db/job_working_directory/009/430/9430166/working/pp-tmp/lofreq2_call_parallelpfou9vt9/0.vcf.gz reads.bam +##reference=reference.fa +##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth"> +##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency"> +##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position"> +##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"> +##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL."> +##INFO=<ID=CONSVAR,Number=0,Type=Flag,Description="Indicates that the variant is a consensus variant (as opposed to a low frequency variant)."> +##INFO=<ID=HRUN,Number=1,Type=Integer,Description="Homopolymer length to the right of report indel position"> +##FILTER=<ID=min_snvqual_66,Description="Minimum SNV Quality (Phred) 66"> +##FILTER=<ID=min_indelqual_52,Description="Minimum Indel Quality (Phred) 52"> +##FILTER=<ID=min_snvqual_70,Description="Minimum SNV Quality (Phred) 70"> +##FILTER=<ID=min_indelqual_57,Description="Minimum Indel Quality (Phred) 57"> +##FILTER=<ID=min_af_0.050000,Description="Minimum allele frequency 0.050000"> +##FILTER=<ID=min_dp_5,Description="Minimum Coverage 5"> +##FILTER=<ID=sb_fdr,Description="Strand-Bias Multiple Testing Correction: fdr corr. pvalue > 0.001000"> +##SnpEffVersion="4.5covid19 (build 2020-04-15 22:26), by Pablo Cingolani" +##SnpEffCmd="SnpEff -i vcf -o vcf -formatEff -classic -no-downstream -no-intergenic -no-upstream -no-utr -stats /data/dnb02/galaxy_db/job_working_directory/009/430/9430172/galaxy_dataset_24243789.dat NC_045512.2 /data/dnb02/galaxy_db/files/022/094/dataset_22094491.dat " +##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype [ | ERRORS | WARNINGS ] )' "> +##INFO=<ID=LOF,Number=.,Type=String,Description="Predicted loss of function effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'"> +##INFO=<ID=NMD,Number=.,Type=String,Description="Predicted nonsense mediated decay effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'"> +#CHROM POS ID REF ALT QUAL FILTER INFO
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/single_variant.vcf Fri Dec 18 23:48:01 2020 +0000 @@ -0,0 +1,26 @@ +##fileformat=VCFv4.0 +##fileDate=20200707 +##source=lofreq call --verbose --ref reference.fa --call-indels --min-cov 5 --max-depth 1000000 --min-bq 30 --min-alt-bq 30 --min-mq 20 --max-mq 255 --min-jq 0 --min-alt-jq 0 --def-alt-jq 0 --sig 0.0005 --bonf dynamic --no-default-filter --no-default-filter -r NC_045512.2:1-14951 -o /data/dnb02/galaxy_db/job_working_directory/009/430/9430166/working/pp-tmp/lofreq2_call_parallelpfou9vt9/0.vcf.gz reads.bam +##reference=reference.fa +##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth"> +##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency"> +##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position"> +##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"> +##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL."> +##INFO=<ID=CONSVAR,Number=0,Type=Flag,Description="Indicates that the variant is a consensus variant (as opposed to a low frequency variant)."> +##INFO=<ID=HRUN,Number=1,Type=Integer,Description="Homopolymer length to the right of report indel position"> +##FILTER=<ID=min_snvqual_66,Description="Minimum SNV Quality (Phred) 66"> +##FILTER=<ID=min_indelqual_52,Description="Minimum Indel Quality (Phred) 52"> +##FILTER=<ID=min_snvqual_70,Description="Minimum SNV Quality (Phred) 70"> +##FILTER=<ID=min_indelqual_57,Description="Minimum Indel Quality (Phred) 57"> +##FILTER=<ID=min_af_0.050000,Description="Minimum allele frequency 0.050000"> +##FILTER=<ID=min_dp_5,Description="Minimum Coverage 5"> +##FILTER=<ID=sb_fdr,Description="Strand-Bias Multiple Testing Correction: fdr corr. pvalue > 0.001000"> +##SnpEffVersion="4.5covid19 (build 2020-04-15 22:26), by Pablo Cingolani" +##SnpEffCmd="SnpEff -i vcf -o vcf -formatEff -classic -no-downstream -no-intergenic -no-upstream -no-utr -stats /data/dnb02/galaxy_db/job_working_directory/009/430/9430172/galaxy_dataset_24243789.dat NC_045512.2 /data/dnb02/galaxy_db/files/022/094/dataset_22094491.dat " +##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_Change| Amino_Acid_length | Gene_Name | Transcript_BioType | Gene_Coding | Transcript_ID | Exon_Rank | Genotype [ | ERRORS | WARNINGS ] )' "> +##INFO=<ID=LOF,Number=.,Type=String,Description="Predicted loss of function effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'"> +##INFO=<ID=NMD,Number=.,Type=String,Description="Predicted nonsense mediated decay effects for this variant. Format: 'Gene_Name | Gene_ID | Number_of_transcripts_in_gene | Percent_of_transcripts_affected'"> +#CHROM POS ID REF ALT QUAL FILTER INFO +NC_045512.2 241 . C T 13172.0 PASS DP=363;AF=0.980716;SB=0;DP4=1,1,167,193 +