changeset 0:fb00fb7cb201 draft

"planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 7770c3ea0fab9df0f39d3d73d10c9e282b77c60f-dirty"
author galaxy-australia
date Wed, 27 Apr 2022 06:34:22 +0000
parents
children 271eb7f4b8bc
files hapcut2.xml test-data/input.bam test-data/input.vcf test-data/output_frag.dat test-data/output_haplotype.out test-data/output_haplotype.out.phased.vcf
diffstat 4 files changed, 238 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hapcut2.xml	Wed Apr 27 06:34:22 2022 +0000
@@ -0,0 +1,174 @@
+<tool id="hapcut2" name="Hapcut2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@+ga@GA_VERSION_SUFFIX@">
+  <description> - haplotype assembly for diploid organisms</description>
+  <xrefs>
+    <xref type="bio.tools">hapcut2</xref>
+  </xrefs>
+  <macros> <!-- tokens substituted into the tool version attribute and the package requirement -->
+    <token name="@TOOL_VERSION@">1.3.3</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@GA_VERSION_SUFFIX@">1</token>
+  </macros>
+  <requirements> <!-- package version follows @TOOL_VERSION@ so the two cannot drift apart -->
+    <requirement type="package" version="@TOOL_VERSION@">hapcut2</requirement>
+  </requirements>
+
+  <command detect_errors="exit_code"><![CDATA[
+
+## Prep inputs: link the datasets to fixed file names in the working directory
+## =====================================================================
+ln -s '$input_bam' input.bam
+&& ln -s '$input_vcf' input.vcf
+
+
+## Run program
+## =====================================================================
+## Extract variant fragments from alignment
+&& extractHAIRS --bam input.bam --VCF input.vcf --out frags.dat
+#if $optimization.choice == 'pacbio':
+  --pacbio 1
+  --ref '$optimization.reference_fasta'
+#elif $optimization.choice == 'ont':
+  --ont 1
+  --ref '$optimization.reference_fasta'
+#elif $optimization.choice == 'hic':
+  --HiC 1
+#end if
+## Optional integers: compare against '' so an explicit 0 is passed and only a blank field is skipped
+#if str($advanced.minIS) != '':
+  --minIS $advanced.minIS
+#end if
+
+#if str($advanced.maxIS) != '':
+  --maxIS $advanced.maxIS
+#end if
+
+## Create haplotype.out and haplotype.out.phased.VCF
+&& HAPCUT2 --fragments frags.dat --VCF input.vcf --output haplotype.out
+#if $optimization.choice == 'hic':
+  --HiC 1
+#end if
+
+  ]]></command>
+
+  <inputs>
+    <param name="input_bam" type="data" format="bam" label="Input BAM file"/>
+    <param name="input_vcf" type="data" format="vcf" label="Input VCF file"/>
+
+    <conditional name="optimization">
+      <!-- TODO: include 10X (requires extra processing step) -->
+      <param name="choice" type="select" display="radio" label="Optimization">
+        <option value="default" selected="true">Default</option>
+        <option value="pacbio">Pacbio</option>
+        <option value="ont">Oxford Nanopore</option>
+        <option value="hic">Hi-C</option>
+      </param>
+      <when value="default"/> <!-- no extra inputs; every option gets an explicit when block -->
+      <when value="pacbio">
+        <param name="reference_fasta" type="data" format="fasta"
+          label="Reference genome fasta file"
+          help="The reference genome is required for long-read optimization."
+        />
+      </when>
+      <when value="ont">
+        <param name="reference_fasta" type="data" format="fasta"
+          label="Reference genome fasta file"
+          help="The reference genome is required for long-read optimization."
+        />
+      </when>
+      <when value="hic"/> <!-- no extra inputs; only adds the HiC flag on the command line -->
+    </conditional>
+
+    <param name="output_phased" type="boolean" label="Output phased VCF file?"
+      checked="true"
+      help="Output variant calls on the haplotype assembly"
+    />
+    <param name="output_fragments" type="boolean" label="Output fragments file?"
+      help="Output fragments collected by extractHAIRS"
+    />
+
+    <!-- Insert-size limits forwarded to extractHAIRS; a blank field omits the flag -->
+    <section name="advanced" title="Advanced parameters">
+      <param name="maxIS" type="integer" label="Maximum insert size"
+        optional="true" value="1000"
+        help="Maximum insert size for a paired-end read to be considered as a single fragment for phasing."
+      />
+
+      <param name="minIS" type="integer" label="Minimum insert size"
+        optional="true" value="0"
+        help="Minimum insert size for a paired-end read to be considered as a single fragment for phasing."
+      />
+    </section>
+  </inputs>
+
+  <outputs>
+    <!-- Haplotype block file written by HAPCUT2; always collected -->
+    <data name="haplotype" format="txt" from_work_dir="haplotype.out"
+      label="${tool.name} on ${on_string}: Haplotype block"/>
+    <!-- Phased VCF; collected only when the output_phased checkbox is set -->
+    <data name="haplotype_phased" format="vcf" from_work_dir="haplotype.out.phased.VCF"
+      label="${tool.name} on ${on_string}: Phased haplotype VCF">
+      <filter>output_phased</filter>
+    </data>
+    <!-- Fragment file written by extractHAIRS; collected only when output_fragments is set -->
+    <data name="frags" format="txt" from_work_dir="frags.dat"
+      label="${tool.name} on ${on_string}: Fragments">
+      <filter>output_fragments</filter>
+    </data>
+  </outputs>
+
+  <tests>
+    <test expect_num_outputs="3"> <!-- default optimization with both optional outputs enabled -->
+      <param name="input_bam" ftype="bam" value="input.bam"/>
+      <param name="input_vcf" ftype="vcf" value="input.vcf"/>
+      <param name="output_fragments" value="1"/>
+      <param name="output_phased" value="1"/>
+      <conditional name="optimization"><param name="choice" value="default"/></conditional>
+      <output name="frags" ftype="txt" file="output_frag.dat"/>
+      <output name="haplotype" ftype="txt" file="output_haplotype.out"/>
+      <output name="haplotype_phased" ftype="vcf" file="output_haplotype.out.phased.vcf"/>
+    </test>
+  </tests>
+
+  <help><![CDATA[
+.. class:: infomark
+
+*NOTE: At this time HapCUT2 is for diploid organisms only and can assemble haplotypes for one individual at a time. VCF input should contain variants and genotypes for a single diploid individual.*
+
+.. class:: infomark
+
+*NOTE: At this time HapCUT2 on Galaxy cannot be used for 10X Genomics sequencing data.*
+
+
+**What it does**
+
+HapCUT2 is a maximum-likelihood-based tool for assembling haplotypes from DNA sequence reads, designed to "just work" with excellent speed and accuracy. Previously described haplotype assembly methods are specialized for specific read technologies or protocols, with slow or inaccurate performance on others. With this in mind, HapCUT2 is designed for speed and accuracy across diverse sequencing technologies, including but not limited to:
+
+- NGS short reads (Illumina HiSeq)
+- single-molecule long reads (PacBio and Oxford Nanopore)
+- Linked-Reads (e.g. 10X Genomics, stLFR or TELL-seq)
+- proximity-ligation (Hi-C) reads
+- high-coverage sequencing (>40x coverage-per-SNP) using above technologies
+- combinations of the above technologies (e.g. scaffold long reads with Hi-C reads)
+
+
+**Inputs**
+
+Input data should reference a single diploid individual mapped to a reference genome.
+
+1. BAM file with reads mapped to reference genome
+
+2. VCF file with variant calls against reference genome
+
+*Using linked reads (10X Genomics, stLFR etc)?* Additional preparation is required: `see here <https://github.com/vibansal/HapCUT2/blob/master/linkedreads.md>`_
+
+
+**Outputs**
+
+- ``haplotype.out``: `phased block file <https://github.com/vibansal/HapCUT2/blob/master/outputformat.md>`_
+- ``haplotype.out.phased.vcf``: (optional) phased VCF file
+- ``Fragments``: (optional) An intermediate file containing alignment fragments with haplotype information
+
+See `HapCUT2 on GitHub <https://github.com/vibansal/HapCUT2>`_ for more detailed information.
+
+  ]]></help>
+</tool>
Binary file test-data/input.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.vcf	Wed Apr 27 06:34:22 2022 +0000
@@ -0,0 +1,30 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##fileDate=20220204
+##source=freeBayes v1.3.2-dirty
+##reference=reference/ref.fa
+##contig=<ID=NC_045512.2,length=29903>
+##phasing=none
+##commandline="freebayes -p 2 -P 0 -C 2 -F 0.05 --min-coverage 10 --min-repeat-entropy 1.0 -q 13 -m 60 --strict-vcf -f reference/ref.fa snps.bam --region NC_045512.2:0-2772"
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
+##INFO=<ID=RO,Number=1,Type=Integer,Description="Count of full observations of the reference haplotype.">
+##INFO=<ID=AO,Number=A,Type=Integer,Description="Count of full observations of this alternate haplotype.">
+##INFO=<ID=QR,Number=1,Type=Integer,Description="Reference allele quality sum in phred">
+##INFO=<ID=QA,Number=A,Type=Integer,Description="Alternate allele quality sum in phred">
+##INFO=<ID=AB,Number=A,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">
+##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count">
+##FORMAT=<ID=QR,Number=1,Type=Integer,Description="Sum of quality of the reference observations">
+##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observation count">
+##FORMAT=<ID=QA,Number=A,Type=Integer,Description="Sum of quality of the alternate observations">
+##bcftools_viewVersion=1.13+htslib-1.13
+##bcftools_viewCommand=view --include 'FMT/GT="1/1" && QUAL>=100.0 && FMT/DP>=10 && (FMT/AO)/(FMT/DP)>=0.9' snps.raw.vcf; Date=Fri Feb  4 01:38:26 2022
+##bcftools_annotateVersion=1.13+htslib-1.13
+##bcftools_annotateCommand=annotate --remove ^INFO/TYPE,^INFO/DP,^INFO/RO,^INFO/AO,^INFO/AB,^FORMAT/GT,^FORMAT/DP,^FORMAT/RO,^FORMAT/AO,^FORMAT/QR,^FORMAT/QA,^FORMAT/GL; Date=Fri Feb  4 01:38:27 2022
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sars_cov_2_omicron_SRR17309642_R1_25k_fastq_gz
+NC_045512.2	3037	.	C	T	338.161	.	AB=0;AO=11;DP=11;QA=418;QR=0;RO=0;TYPE=snp	GT:DP:RO:QR:AO:QA:GL	1/1:11:0:0:11:418:-37.9727,-3.31133,0
+NC_045512.2	14408	.	C	T	494.13	.	AB=0;AO=15;DP=15;QA=570;QR=0;RO=0;TYPE=snp	GT:DP:RO:QR:AO:QA:GL	1/1:15:0:0:15:570:-51.6429,-4.51545,0
+NC_045512.2	23403	.	A	G	716.256	.	AB=0;AO=22;DP=22;QA=836;QR=0;RO=0;TYPE=snp	GT:DP:RO:QR:AO:QA:GL	1/1:22:0:0:22:836:-75.5656,-6.62266,0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_haplotype.out.phased.vcf	Wed Apr 27 06:34:22 2022 +0000
@@ -0,0 +1,34 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##fileDate=20220204
+##source=freeBayes v1.3.2-dirty
+##reference=reference/ref.fa
+##contig=<ID=NC_045512.2,length=29903>
+##phasing=none
+##commandline="freebayes -p 2 -P 0 -C 2 -F 0.05 --min-coverage 10 --min-repeat-entropy 1.0 -q 13 -m 60 --strict-vcf -f reference/ref.fa snps.bam --region NC_045512.2:0-2772"
+##INFO=<ID=hapcut2,Number=1,Type=Integer,Description="phased by HapCUT2 or not">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
+##INFO=<ID=RO,Number=1,Type=Integer,Description="Count of full observations of the reference haplotype.">
+##INFO=<ID=AO,Number=A,Type=Integer,Description="Count of full observations of this alternate haplotype.">
+##INFO=<ID=QR,Number=1,Type=Integer,Description="Reference allele quality sum in phred">
+##INFO=<ID=QA,Number=A,Type=Integer,Description="Alternate allele quality sum in phred">
+##INFO=<ID=AB,Number=A,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">
+##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count">
+##FORMAT=<ID=QR,Number=1,Type=Integer,Description="Sum of quality of the reference observations">
+##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observation count">
+##FORMAT=<ID=QA,Number=A,Type=Integer,Description="Sum of quality of the alternate observations">
+##bcftools_viewVersion=1.13+htslib-1.13
+##bcftools_viewCommand=view --include 'FMT/GT="1/1" && QUAL>=100.0 && FMT/DP>=10 && (FMT/AO)/(FMT/DP)>=0.9' snps.raw.vcf; Date=Fri Feb  4 01:38:26 2022
+##bcftools_annotateVersion=1.13+htslib-1.13
+##bcftools_annotateCommand=annotate --remove ^INFO/TYPE,^INFO/DP,^INFO/RO,^INFO/AO,^INFO/AB,^FORMAT/GT,^FORMAT/DP,^FORMAT/RO,^FORMAT/AO,^FORMAT/QR,^FORMAT/QA,^FORMAT/GL; Date=Fri Feb  4 01:38:27 2022
+##FORMAT=<ID=PS,Number=1,Type=Integer,Description="ID of Phase Set for Variant">
+##FORMAT=<ID=PQ,Number=1,Type=Integer,Description="Phred QV indicating probability that this variant is incorrectly phased relative to the haplotype">
+##FORMAT=<ID=PD,Number=1,Type=Integer,Description="phased Read Depth">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sars_cov_2_omicron_SRR17309642_R1_25k_fastq_gz
+NC_045512.2	3037	.	C	T	338.161	.	AB=0;AO=11;DP=11;QA=418;QR=0;RO=0;TYPE=snp	GT:DP:RO:QR:AO:QA:GL:PS	1/1:11:0:0:11:418:-37.9727,-3.31133,0:.
+NC_045512.2	14408	.	C	T	494.13	.	AB=0;AO=15;DP=15;QA=570;QR=0;RO=0;TYPE=snp	GT:DP:RO:QR:AO:QA:GL:PS	1/1:15:0:0:15:570:-51.6429,-4.51545,0:.
+NC_045512.2	23403	.	A	G	716.256	.	AB=0;AO=22;DP=22;QA=836;QR=0;RO=0;TYPE=snp	GT:DP:RO:QR:AO:QA:GL:PS	1/1:22:0:0:22:836:-75.5656,-6.62266,0:.