Mercurial > repos > iuc > ucsc_fatovcf

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fatovcf.xml	Wed Jan 11 13:14:29 2023 +0000
@@ -0,0 +1,143 @@
+<tool id="fatovcf" name="faToVcf" version="@TOOL_VERSION@+galaxy0" profile="21.05" license="MIT">
+    <description>
+        Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs
+    </description>
+    <macros>
+        <token name="@TOOL_VERSION@">426</token>
+    </macros>
+    <xrefs>
+        <xref type="bio.tools">UCSC_Genome_Browser_Utilities</xref>
+    </xrefs>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">ucsc-fatovcf</requirement>
+    </requirements>
+    <version_command><![CDATA[ echo "@TOOL_VERSION@" ]]></version_command>
+    <command detect_errors="exit_code"><![CDATA[
+    #if $in_fasta
+        ln -s '$in_fasta' in.fa &&
+    #end if
+    faToVcf
+        in.fa
+        '$out'
+        #if $ref_seq.refSeq == "customRef"
+            -ref=$ref_seq.ref
+        #end if
+        $ambiguous
+        #if $excl_seq.excludeFile
+            -excludeFile='$excl_seq.excludeFile'
+        #end if
+        -maxDiff=$excl_seq.maxDiff
+        #if $mask_sites.maskSites
+            -maskSites='$mask_sites.maskSites'
+        #end if
+        #if $mask_sites.windowSize > 0
+            -windowSize=$mask_sites.windowSize
+            -minAmbigInWindow=$mask_sites.minAmbigInWindow
+        #end if
+        $includeNoAltN
+        -minAc=$minAc
+        -minAf=$minAf
+        #if $output.startOffset > 0
+            -startOffset=$output.startOffset
+        #end if
+        $output.includeRef
+        $output.noGenotypes
+        #if $output.vcfChrom
+            -vcfChrom='$output.vcfChrom'
+        #end if
+    ]]></command>
+    <inputs>
+        <param name="in_fasta" format="fasta" type="data" label="FASTA Alignment" help="Must contain a series of sequences with different names and the same length. Both N and - are treated as missing information." />
+
+        <conditional name="ref_seq">
+            <param name="refSeq" type="select" label="Determine reference sequence" help="Which sequence from the FASTA file should be used as the reference sequence.">
+                <option value="" selected="true">Use the first sequence as reference</option>
+                <option value="customRef">Use a different sequence as reference</option>
+            </param>
+            <when value="customRef">
+                <param argument="-ref" type="text" label="Name of sequence that should be used as reference sequence:" help="Must be present in the FASTA file." />
+            </when>
+            <when value="" />
+        </conditional>
+
+        <param name="ambiguous" type="select" label="Treat ambiguous bases" help="If 1: Treat ambiguous bases as N, ambiguous bases (N, R, V etc.) are treated as N (no call). If 2: Resolve ambiguous characters, if the character represents two bases and one is the reference base, convert it to the non-reference base. Otherwise convert it to N. Default: 0: Don't treat ambiguous bases">
+            <option value="" selected="true">0: Don't treat ambiguous bases</option>
+            <option value="-ambiguousToN">1: Treat ambiguous bases as N (no call)</option>
+            <option value="-resolveAmbiguous">2: Resolve ambiguous characters (convert)</option>
+        </param>
+
+        <section name="excl_seq" title="Exclude sequences" expanded="true">
+            <param argument="-excludeFile" format="txt" type="data" optional="true" label="Exclude sequences from text file" help="Exclude sequences named in file which has one sequence name per line." />
+            <param argument="-maxDiff" type="integer" min="0" value="0" label="Maximum number of mismatches" help="Exclude sequences with more than N mismatches with the reference sequence. If -windowSize is used, sequences are masked accordingly before the mismatches are counted. Default: 0" />
+        </section>
+
+        <section name="mask_sites" title="Mask sites" expanded="true">
+            <param argument="-maskSites" format="vcf" type="data" optional="true" label="Mask sites at given positions (VCF file)" help="Exclude variants in positions recommended for masking in file. Typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf" />
+            <param argument="-windowSize" type="integer" min="0" value="0" label="Window size to mask bases" help="Mask any base for which there are at least -minAmbigWindow bases in a window of +-N bases around the base. Masking approach adapted from https://github.com/roblanf/sarscov2phylo/blob/master/scripts/mask_seq.py Use -windowSize=7 for same results. Default: 0" />
+            <param argument="-minAmbigInWindow" type="integer" min="1" value="2" label="Minimum of ambiguous characters within the window given above" help="When -windowSize is provided, mask any base for which there are at least this many N, ambiguous or gap characters within the window. Default: 2" />
+        </section>
+
+        <param argument="-includeNoAltN" type="boolean" truevalue="-includeNoAltN" falsevalue="" label="Include positions without defined ALT allele" help="Include base positions with no alternate alleles observed, but at least one N (missing base/no-call). Default: false" />
+        <param argument="-minAc" type="integer" min="0" value="0" label="Minimum allele count" help="Ignore alternate alleles observed fewer than N times. Default: 0" />
+        <param argument="-minAf" type="float" min="0.0" max="1.0" value="0.0" label="Minimum allele frequency" help="Ignore alternate alleles observed in less than F of non-N bases. Default: 0.0" />
+
+        <section name="output" title="Output VCF options" expanded="true">
+            <param argument="-startOffset" type="integer" min="0" value="0" label="Start offset" help="Add N bases to each position, e.g. for trimmed alignments. Default: 0" />
+            <param argument="-includeRef" type="boolean" truevalue="-includeRef" falsevalue="" label="Include the reference in the genotype columns" help="Default: omitted as redundant (false)" />
+            <param argument="-noGenotypes" type="boolean" truevalue="-noGenotypes" falsevalue="" label="Output 8-column VCF" help="VCF without the sample genotype columns. Default: false" />
+            <param argument="-vcfChrom" type="text" optional="true" label="Use this sequence for the CHROM column in the VCF" help="Default: name of the reference sequence." />
+        </section>
+    </inputs>
+    <outputs>
+        <data name="out" format="vcf" />
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1"> <!-- default params -->
+            <param name="in_fasta" value="input.fa" />
+            <output name="out" ftype="vcf" file="out1.vcf" lines_diff="4" />
+        </test>
+        <test expect_num_outputs="1"> <!-- set a value for every input parameter-->
+            <param name="in_fasta" value="input.fa" />
+
+            <conditional name="ref_seq">
+                <param name="refSeq" value="customRef" />
+                <param name="ref" value="sample3" />
+            </conditional>
+
+            <param name="ambiguous" value="-ambiguousToN" />
+
+            <section name="excl_seq">
+                <param name="excludeFile" value="excl.txt" />
+                <param name="maxDiff" value="3" />
+            </section>
+
+            <section name="mask_sites">
+                <param name="maskSites" value="mask.vcf" />
+                <param name="windowSize" value="7" />
+                <param name="minAmbigInWindow" value="3" />
+            </section>
+
+            <param name="includeNoAltN" value="true" />
+            <param name="minAc" value="1" />
+            <param name="minAf" value="0.1" />
+
+            <section name="output">
+                <param name="startOffset" value="1" />
+                <param name="includeRef" value="true" />
+                <param name="noGenotypes" value="true" />
+                <param name="vcfChrom" value="sample1" />
+            </section>
+
+            <output name="out" ftype="vcf" file="out2.vcf" lines_diff="4" />
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+`faToVcf`_ is a tool to extract a VCF from a multi-sequence FASTA alignment.
+
+.. _faToVcf: http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/FOOTER.txt
+    ]]>    </help>
+    <citations>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/excl.txt	Wed Jan 11 13:14:29 2023 +0000
@@ -0,0 +1,1 @@
+sample2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input.fa	Wed Jan 11 13:14:29 2023 +0000
@@ -0,0 +1,6 @@
+>sample1
+ANRGACACAGTCAC
+>sample2
+ARNGACAC----AC
+>sample3
+ARRAACGCATTCAN
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mask.vcf	Wed Jan 11 13:14:29 2023 +0000
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.3
+##INFO=<ID=GENE,Number=1,Type=String,Description="Position falls into range of this gene">
+##INFO=<ID=AA_POS,Number=1,Type=Integer,Description="Position of amino acid residue within gene">
+##INFO=<ID=AA_REF,Number=1,Type=String,Description="Reference amino acid residue">
+##INFO=<ID=AA_ALT,Number=.,Type=String,Description="List of alternative amino acid residues (IUPAC ambiguity code)">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+sample3	3	.	T	.	.	mask	SUB=NDM;EXC=seq_end
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out1.vcf	Wed Jan 11 13:14:29 2023 +0000
@@ -0,0 +1,7 @@
+##fileformat=VCFv4.2
+##reference=test-data/input.fa:sample1
+##source=faToVcf test-data/input.fa test-data/out1.vcf
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample2	sample3
+sample1	4	G4A	G	A	.	.	AC=1;AN=2	GT	0	1
+sample1	7	A7G	A	G	.	.	AC=1;AN=2	GT	0	1
+sample1	10	G10T	G	T	.	.	AC=1;AN=1	GT	.	1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out2.vcf	Wed Jan 11 13:14:29 2023 +0000
@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##reference=test-data/input.fa:sample3
+##source=faToVcf test-data/input.fa out-test.vcf
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+sample1	3	R3R	R	*	.	.	AC=0;AN=0
+sample1	5	A5G	A	G	.	.	AC=1;AN=2
+sample1	8	G8A	G	A	.	.	AC=1;AN=2
+sample1	11	T11G	T	G	.	.	AC=1;AN=2