Repository 'hapcut2'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxy-australia/hapcut2

Changeset 0:fb00fb7cb201 (2022-04-27)
Next changeset 1:271eb7f4b8bc (2022-05-08)
Commit message:
"planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 7770c3ea0fab9df0f39d3d73d10c9e282b77c60f-dirty"
added:
hapcut2.xml
test-data/input.bam
test-data/input.vcf
test-data/output_frag.dat
test-data/output_haplotype.out
test-data/output_haplotype.out.phased.vcf
b
diff -r 000000000000 -r fb00fb7cb201 hapcut2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hapcut2.xml Wed Apr 27 06:34:22 2022 +0000
[
@@ -0,0 +1,174 @@
+<tool id="hapcut2" name="Hapcut2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@+ga@GA_VERSION_SUFFIX@">
+  <description> - haplotype assembly for diploid organisms</description>
+  <!-- Version tokens are declared first so that later elements can reuse them. -->
+  <macros>
+    <token name="@TOOL_VERSION@">1.3.3</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@GA_VERSION_SUFFIX@">1</token>
+  </macros>
+  <xrefs>
+    <xref type="bio.tools">hapcut2</xref>
+  </xrefs>
+  <requirements>
+    <!-- Pinned through the version token so the package and the tool version cannot drift apart. -->
+    <requirement type="package" version="@TOOL_VERSION@">hapcut2</requirement>
+  </requirements>
+
+  <command detect_errors="exit_code"><![CDATA[
+
+## Prep inputs
+## =====================================================================
+## Link the input datasets to fixed names in the job working directory.
+ln -s '$input_bam' input.bam
+&& ln -s '$input_vcf' input.vcf
+
+
+## Run program
+## =====================================================================
+## Extract variant fragments from alignment
+&& extractHAIRS --bam input.bam --VCF input.vcf --out frags.dat
+#if $optimization.choice == 'pacbio':
+  --pacbio 1
+  --ref '$optimization.reference_fasta'
+#elif $optimization.choice == 'ont':
+  --ont 1
+  --ref '$optimization.reference_fasta'
+#elif $optimization.choice == 'hic':
+  --HiC 1
+#end if
+
+## Optional insert-size bounds. Compare against the empty string instead of
+## relying on truthiness, so that an explicit 0 is still passed on.
+## NOTE(review): the form default for maxIS (1000) is passed in every mode,
+## including Hi-C; confirm it does not override a larger Hi-C default.
+#if str($advanced.minIS) != '':
+  --minIS $advanced.minIS
+#end if
+
+#if str($advanced.maxIS) != '':
+  --maxIS $advanced.maxIS
+#end if
+
+## Create haplotype.out and haplotype.out.phased.VCF
+&& HAPCUT2 --fragments frags.dat --VCF input.vcf --output haplotype.out
+#if $optimization.choice == 'hic':
+  --HiC 1
+#end if
+
+  ]]></command>
+
+  <inputs>
+    <!-- Alignment and variant calls to phase. -->
+    <param name="input_bam" type="data" format="bam" label="Input BAM file"/>
+    <param name="input_vcf" type="data" format="vcf" label="Input VCF file"/>
+
+    <!-- Selects the technology-specific flags added to the command line. -->
+    <conditional name="optimization">
+      <!-- TODO: include 10X (requires extra processing step) -->
+      <param name="choice" type="select" display="radio" label="Optimization">
+        <option value="default" selected="true">Default</option>
+        <option value="pacbio">Pacbio</option>
+        <option value="ont">Oxford Nanopore</option>
+        <option value="hic">Hi-C</option>
+      </param>
+
+      <!-- Every select option gets a "when" block, even when it has no extra inputs. -->
+      <when value="default"/>
+
+      <when value="pacbio">
+        <param name="reference_fasta" type="data" format="fasta"
+          label="Reference genome fasta file"
+          help="The reference genome is required for long-read optimization."
+        />
+      </when>
+
+      <when value="ont">
+        <param name="reference_fasta" type="data" format="fasta"
+          label="Reference genome fasta file"
+          help="The reference genome is required for long-read optimization."
+        />
+      </when>
+
+      <!-- Hi-C only toggles a command-line flag; it needs no extra inputs. -->
+      <when value="hic"/>
+    </conditional>
+
+    <!-- These two booleans drive the output filters of the optional datasets. -->
+    <param name="output_phased" type="boolean" label="Output phased VCF file?"
+      checked="true"
+      help="Output variant calls on the haplotype assembly"
+    />
+    <param name="output_fragments" type="boolean" label="Output fragments file?"
+      checked="false"
+      help="Output fragments collected by extractHAIRS"
+    />
+
+
+    <section name="advanced" title="Advanced parameters">
+      <param name="maxIS" type="integer" label="Maximum insert size"
+        optional="true" value="1000"
+        help="Maximum insert size for a paired-end read to be considered as a single fragment for phasing."
+      />
+
+      <param name="minIS" type="integer" label="Minimum insert size"
+        optional="true" value="0"
+        help="Minimum insert size for a paired-end read to be considered as a single fragment for phasing."
+      />
+    </section>
+  </inputs>
+
+  <outputs>
+    <!-- Always collected: the file named by the HAPCUT2 output option. -->
+    <data name="haplotype"
+          format="txt"
+          from_work_dir="haplotype.out"
+          label="${tool.name} on ${on_string}: Haplotype block"/>
+
+    <!-- Collected only when the "output_phased" input is enabled. -->
+    <data name="haplotype_phased"
+          format="vcf"
+          from_work_dir="haplotype.out.phased.VCF"
+          label="${tool.name} on ${on_string}: Phased haplotype VCF">
+      <filter>output_phased</filter>
+    </data>
+
+    <!-- Collected only when the "output_fragments" input is enabled: the extractHAIRS output file. -->
+    <data name="frags"
+          format="txt"
+          from_work_dir="frags.dat"
+          label="${tool.name} on ${on_string}: Fragments">
+      <filter>output_fragments</filter>
+    </data>
+  </outputs>
+
+  <tests>
+    <!-- Default optimization with both optional outputs enabled. -->
+    <test expect_num_outputs="3">
+      <param name="input_bam" ftype="bam" value="input.bam"/>
+      <param name="input_vcf" ftype="vcf" value="input.vcf"/>
+      <param name="output_fragments" value="true"/>
+      <param name="output_phased" value="true"/>
+      <!-- "optimization" is the conditional; the select inside it is named "choice". -->
+      <conditional name="optimization">
+        <param name="choice" value="default"/>
+      </conditional>
+      <output name="frags" ftype="txt" file="output_frag.dat"/>
+      <output name="haplotype" ftype="txt" file="output_haplotype.out"/>
+      <output name="haplotype_phased" ftype="vcf" file="output_haplotype.out.phased.vcf"/>
+    </test>
+  </tests>
+
+  <help><![CDATA[
+.. class:: infomark
+
+*NOTE: At this time HapCUT2 is for diploid organisms only and can assemble haplotypes for one individual at a time. VCF input should contain variants and genotypes for a single diploid individual.*
+
+.. class:: infomark
+
+*NOTE: At this time HapCUT2 on Galaxy cannot be used for 10X Genomics sequencing data.*
+
+
+**What it does**
+
+HapCUT2 is a maximum-likelihood-based tool for assembling haplotypes from DNA sequence reads, designed to "just work" with excellent speed and accuracy. Previously described haplotype assembly methods are specialized for specific read technologies or protocols, with slow or inaccurate performance on others. With this in mind, HapCUT2 is designed for speed and accuracy across diverse sequencing technologies, including but not limited to:
+
+- NGS short reads (Illumina HiSeq)
+- single-molecule long reads (PacBio and Oxford Nanopore)
+- Linked-Reads (e.g. 10X Genomics, stLFR or TELL-seq)
+- proximity-ligation (Hi-C) reads
+- high-coverage sequencing (>40x coverage-per-SNP) using above technologies
+- combinations of the above technologies (e.g. scaffold long reads with Hi-C reads)
+
+
+**Inputs**
+
+Input data should reference a single diploid individual mapped to a reference genome.
+
+1. BAM file with reads mapped to reference genome
+
+2. VCF file with variant calls against reference genome
+
+*Using linked reads (10X Genomics, stLFR etc)?* Additional preparation is required: `see here <https://github.com/vibansal/HapCUT2/blob/master/linkedreads.md>`_
+
+
+**Outputs**
+
+- ``haplotype.out``: `phased block file <https://github.com/vibansal/HapCUT2/blob/master/outputformat.md>`_
+- ``haplotype.out.phased.vcf``: (optional) phased VCF file
+- ``Fragments``: (optional) An intermediate file containing alignment fragments with haplotype information
+
+See `HapCUT2 on GitHub <https://github.com/vibansal/HapCUT2>`_ for more detailed information.
+
+  ]]></help>
+</tool>
b
diff -r 000000000000 -r fb00fb7cb201 test-data/input.bam
b
Binary file test-data/input.bam has changed
b
diff -r 000000000000 -r fb00fb7cb201 test-data/input.vcf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.vcf Wed Apr 27 06:34:22 2022 +0000
b
@@ -0,0 +1,30 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##fileDate=20220204
+##source=freeBayes v1.3.2-dirty
+##reference=reference/ref.fa
+##contig=<ID=NC_045512.2,length=29903>
+##phasing=none
+##commandline="freebayes -p 2 -P 0 -C 2 -F 0.05 --min-coverage 10 --min-repeat-entropy 1.0 -q 13 -m 60 --strict-vcf -f reference/ref.fa snps.bam --region NC_045512.2:0-2772"
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
+##INFO=<ID=RO,Number=1,Type=Integer,Description="Count of full observations of the reference haplotype.">
+##INFO=<ID=AO,Number=A,Type=Integer,Description="Count of full observations of this alternate haplotype.">
+##INFO=<ID=QR,Number=1,Type=Integer,Description="Reference allele quality sum in phred">
+##INFO=<ID=QA,Number=A,Type=Integer,Description="Alternate allele quality sum in phred">
+##INFO=<ID=AB,Number=A,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">
+##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count">
+##FORMAT=<ID=QR,Number=1,Type=Integer,Description="Sum of quality of the reference observations">
+##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observation count">
+##FORMAT=<ID=QA,Number=A,Type=Integer,Description="Sum of quality of the alternate observations">
+##bcftools_viewVersion=1.13+htslib-1.13
+##bcftools_viewCommand=view --include 'FMT/GT="1/1" && QUAL>=100.0 && FMT/DP>=10 && (FMT/AO)/(FMT/DP)>=0.9' snps.raw.vcf; Date=Fri Feb  4 01:38:26 2022
+##bcftools_annotateVersion=1.13+htslib-1.13
+##bcftools_annotateCommand=annotate --remove ^INFO/TYPE,^INFO/DP,^INFO/RO,^INFO/AO,^INFO/AB,^FORMAT/GT,^FORMAT/DP,^FORMAT/RO,^FORMAT/AO,^FORMAT/QR,^FORMAT/QA,^FORMAT/GL; Date=Fri Feb  4 01:38:27 2022
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sars_cov_2_omicron_SRR17309642_R1_25k_fastq_gz
+NC_045512.2 3037 . C T 338.161 . AB=0;AO=11;DP=11;QA=418;QR=0;RO=0;TYPE=snp GT:DP:RO:QR:AO:QA:GL 1/1:11:0:0:11:418:-37.9727,-3.31133,0
+NC_045512.2 14408 . C T 494.13 . AB=0;AO=15;DP=15;QA=570;QR=0;RO=0;TYPE=snp GT:DP:RO:QR:AO:QA:GL 1/1:15:0:0:15:570:-51.6429,-4.51545,0
+NC_045512.2 23403 . A G 716.256 . AB=0;AO=22;DP=22;QA=836;QR=0;RO=0;TYPE=snp GT:DP:RO:QR:AO:QA:GL 1/1:22:0:0:22:836:-75.5656,-6.62266,0
b
diff -r 000000000000 -r fb00fb7cb201 test-data/output_haplotype.out.phased.vcf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_haplotype.out.phased.vcf Wed Apr 27 06:34:22 2022 +0000
b
@@ -0,0 +1,34 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##fileDate=20220204
+##source=freeBayes v1.3.2-dirty
+##reference=reference/ref.fa
+##contig=<ID=NC_045512.2,length=29903>
+##phasing=none
+##commandline="freebayes -p 2 -P 0 -C 2 -F 0.05 --min-coverage 10 --min-repeat-entropy 1.0 -q 13 -m 60 --strict-vcf -f reference/ref.fa snps.bam --region NC_045512.2:0-2772"
+##INFO=<ID=hapcut2,Number=1,Type=Integer,Description="phased by HapCUT2 or not">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
+##INFO=<ID=RO,Number=1,Type=Integer,Description="Count of full observations of the reference haplotype.">
+##INFO=<ID=AO,Number=A,Type=Integer,Description="Count of full observations of this alternate haplotype.">
+##INFO=<ID=QR,Number=1,Type=Integer,Description="Reference allele quality sum in phred">
+##INFO=<ID=QA,Number=A,Type=Integer,Description="Alternate allele quality sum in phred">
+##INFO=<ID=AB,Number=A,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">
+##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count">
+##FORMAT=<ID=QR,Number=1,Type=Integer,Description="Sum of quality of the reference observations">
+##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observation count">
+##FORMAT=<ID=QA,Number=A,Type=Integer,Description="Sum of quality of the alternate observations">
+##bcftools_viewVersion=1.13+htslib-1.13
+##bcftools_viewCommand=view --include 'FMT/GT="1/1" && QUAL>=100.0 && FMT/DP>=10 && (FMT/AO)/(FMT/DP)>=0.9' snps.raw.vcf; Date=Fri Feb  4 01:38:26 2022
+##bcftools_annotateVersion=1.13+htslib-1.13
+##bcftools_annotateCommand=annotate --remove ^INFO/TYPE,^INFO/DP,^INFO/RO,^INFO/AO,^INFO/AB,^FORMAT/GT,^FORMAT/DP,^FORMAT/RO,^FORMAT/AO,^FORMAT/QR,^FORMAT/QA,^FORMAT/GL; Date=Fri Feb  4 01:38:27 2022
+##FORMAT=<ID=PS,Number=1,Type=Integer,Description="ID of Phase Set for Variant">
+##FORMAT=<ID=PQ,Number=1,Type=Integer,Description="Phred QV indicating probability that this variant is incorrectly phased relative to the haplotype">
+##FORMAT=<ID=PD,Number=1,Type=Integer,Description="phased Read Depth">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sars_cov_2_omicron_SRR17309642_R1_25k_fastq_gz
+NC_045512.2 3037 . C T 338.161 . AB=0;AO=11;DP=11;QA=418;QR=0;RO=0;TYPE=snp GT:DP:RO:QR:AO:QA:GL:PS 1/1:11:0:0:11:418:-37.9727,-3.31133,0:.
+NC_045512.2 14408 . C T 494.13 . AB=0;AO=15;DP=15;QA=570;QR=0;RO=0;TYPE=snp GT:DP:RO:QR:AO:QA:GL:PS 1/1:15:0:0:15:570:-51.6429,-4.51545,0:.
+NC_045512.2 23403 . A G 716.256 . AB=0;AO=22;DP=22;QA=836;QR=0;RO=0;TYPE=snp GT:DP:RO:QR:AO:QA:GL:PS 1/1:22:0:0:22:836:-75.5656,-6.62266,0:.