Mercurial > repos > iuc > ucsc_fatovcf
changeset 0:78df8fc2b3ab draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ucsc-tools/fatovcf commit 16910aa7b33e5ff73430be1ca9d7727f4ea5786a
author | iuc |
---|---|
date | Wed, 11 Jan 2023 13:14:29 +0000 |
parents | |
children | d9965e143053 |
files | fatovcf.xml test-data/excl.txt test-data/input.fa test-data/mask.vcf test-data/out1.vcf test-data/out2.vcf |
diffstat | 6 files changed, 172 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fatovcf.xml Wed Jan 11 13:14:29 2023 +0000 @@ -0,0 +1,143 @@ +<tool id="fatovcf" name="faToVcf" version="@TOOL_VERSION@+galaxy0" profile="21.05" license="MIT"> + <description> + Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs + </description> + <macros> + <token name="@TOOL_VERSION@">426</token> + </macros> + <xrefs> + <xref type="bio.tools">UCSC_Genome_Browser_Utilities</xref> + </xrefs> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">ucsc-fatovcf</requirement> + </requirements> + <version_command><![CDATA[ echo "@TOOL_VERSION@" ]]></version_command> + <command detect_errors="exit_code"><![CDATA[ + #if $in_fasta + ln -s '$in_fasta' in.fa && + #end if + faToVcf + in.fa + '$out' + #if $ref_seq.refSeq == "customRef" + -ref=$ref_seq.ref + #end if + $ambiguous + #if $excl_seq.excludeFile + -excludeFile='$excl_seq.excludeFile' + #end if + -maxDiff=$excl_seq.maxDiff + #if $mask_sites.maskSites + -maskSites='$mask_sites.maskSites' + #end if + #if $mask_sites.windowSize > 0 + -windowSize=$mask_sites.windowSize + -minAmbigInWindow=$mask_sites.minAmbigInWindow + #end if + $includeNoAltN + -minAc=$minAc + -minAf=$minAf + #if $output.startOffset > 0 + -startOffset=$output.startOffset + #end if + $output.includeRef + $output.noGenotypes + #if $output.vcfChrom + -vcfChrom='$output.vcfChrom' + #end if + ]]></command> + <inputs> + <param name="in_fasta" format="fasta" type="data" label="FASTA Alignment" help="Must contain a series of sequences with different names and the same length. Both N and - are treated as missing information." /> + + <conditional name="ref_seq"> + <param name="refSeq" type="select" label="Determine reference sequence" help="Which sequence from the FASTA file should be used as the reference sequence."> + <option value="" selected="true">Use the first sequence as reference</option> + <option value="customRef">Use a different sequence as reference</option> + </param> + <when value="customRef"> + <param argument="-ref" type="text" label="Name of sequence that should be used as reference sequence:" help="Must be present in the FASTA file." /> + </when> + <when value="" /> + </conditional> + + <param name="ambiguous" type="select" label="Treat ambiguous bases" help="If 1: Treat ambiguous bases as N, ambiguous bases (N, R, V etc.) are treated as N (no call). If 2: Resolve ambiguous characters, if the character represents two bases and one is the reference base, convert it to the non-reference base. Otherwise convert it to N. Default: 0: Don't treat ambiguous bases"> + <option value="" selected="true">0: Don't treat ambiguous bases</option> + <option value="-ambiguousToN">1: Treat ambiguous bases as N (no call)</option> + <option value="-resolveAmbiguous">2: Resolve ambiguous characters (convert)</option> + </param> + + <section name="excl_seq" title="Exclude sequences" expanded="true"> + <param argument="-excludeFile" format="txt" type="data" optional="true" label="Exclude sequences from text file" help="Exclude sequences named in file which has one sequence name per line." /> + <param argument="-maxDiff" type="integer" min="0" value="0" label="Maximum number of mismatches" help="Exclude sequences with more than N mismatches with the reference sequence. If -windowSize is used, sequences are masked accordingly before the mismatches are counted. Default: 0" /> + </section> + + <section name="mask_sites" title="Mask sites" expanded="true"> + <param argument="-maskSites" format="vcf" type="data" optional="true" label="Mask sites at given positions (VCF file)" help="Exclude variants in positions recommended for masking in file. Typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf" /> + <param argument="-windowSize" type="integer" min="0" value="0" label="Window size to mask bases" help="Mask any base for which there are at least -minAmbigWindow bases in a window of +-N bases around the base. Masking approach adapted from https://github.com/roblanf/sarscov2phylo/blob/master/scripts/mask_seq.py Use -windowSize=7 for same results. Default: 0" /> + <param argument="-minAmbigInWindow" type="integer" min="1" value="2" label="Minimum of ambiguous characters within the window given above" help="When -windowSize is provided, mask any base for which there are at least this many N, ambiguous or gap characters within the window. Default: 2" /> + </section> + + <param argument="-includeNoAltN" type="boolean" truevalue="-includeNoAltN" falsevalue="" label="Include positions without defined ALT allele" help="Include base positions with no alternate alleles observed, but at least one N (missing base/no-call). Default: false" /> + <param argument="-minAc" type="integer" min="0" value="0" label="Minimum allele count" help="Ignore alternate alleles observed fewer than N times. Default: 0" /> + <param argument="-minAf" type="float" min="0.0" max="1.0" value="0.0" label="Minimum allele frequency" help="Ignore alternate alleles observed in less than F of non-N bases. Default: 0.0" /> + + <section name="output" title="Output VCF options" expanded="true"> + <param argument="-startOffset" type="integer" min="0" value="0" label="Start offset" help="Add N bases to each position, e.g. for trimmed alignments. Default: 0" /> + <param argument="-includeRef" type="boolean" truevalue="-includeRef" falsevalue="" label="Include the reference in the genotype columns" help="Default: omitted as redundant (false)" /> + <param argument="-noGenotypes" type="boolean" truevalue="-noGenotypes" falsevalue="" label="Output 8-column VCF" help="VCF without the sample genotype columns. Default: false" /> + <param argument="-vcfChrom" type="text" optional="true" label="Use this sequence for the CHROM column in the VCF" help="Default: name of the reference sequence." /> + </section> + </inputs> + <outputs> + <data name="out" format="vcf" /> + </outputs> + <tests> + <test expect_num_outputs="1"> <!-- default params --> + <param name="in_fasta" value="input.fa" /> + <output name="out" ftype="vcf" file="out1.vcf" lines_diff="4" /> + </test> + <test expect_num_outputs="1"> <!-- set a value for every input parameter--> + <param name="in_fasta" value="input.fa" /> + + <conditional name="ref_seq"> + <param name="refSeq" value="customRef" /> + <param name="ref" value="sample3" /> + </conditional> + + <param name="ambiguous" value="-ambiguousToN" /> + + <section name="excl_seq"> + <param name="excludeFile" value="excl.txt" /> + <param name="maxDiff" value="3" /> + </section> + + <section name="mask_sites"> + <param name="maskSites" value="mask.vcf" /> + <param name="windowSize" value="7" /> + <param name="minAmbigInWindow" value="3" /> + </section> + + <param name="includeNoAltN" value="true" /> + <param name="minAc" value="1" /> + <param name="minAf" value="0.1" /> + + <section name="output"> + <param name="startOffset" value="1" /> + <param name="includeRef" value="true" /> + <param name="noGenotypes" value="true" /> + <param name="vcfChrom" value="sample1" /> + </section> + + <output name="out" ftype="vcf" file="out2.vcf" lines_diff="4" /> + </test> + </tests> + <help><![CDATA[ +**What it does** + +`faToVcf`_ is a tool to extract a VCF from a multi-sequence FASTA alignment. + +.. _faToVcf: http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/FOOTER.txt + ]]> </help> + <citations> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/excl.txt Wed Jan 11 13:14:29 2023 +0000 @@ -0,0 +1,1 @@ +sample2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input.fa Wed Jan 11 13:14:29 2023 +0000 @@ -0,0 +1,6 @@ +>sample1 +ANRGACACAGTCAC +>sample2 +ARNGACAC----AC +>sample3 +ARRAACGCATTCAN
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mask.vcf Wed Jan 11 13:14:29 2023 +0000 @@ -0,0 +1,7 @@ +##fileformat=VCFv4.3 +##INFO=<ID=GENE,Number=1,Type=String,Description="Position falls into range of this gene"> +##INFO=<ID=AA_POS,Number=1,Type=Integer,Description="Position of amino acid residue within gene"> +##INFO=<ID=AA_REF,Number=1,Type=String,Description="Reference amino acid residue"> +##INFO=<ID=AA_ALT,Number=.,Type=String,Description="List of alternative amino acid residues (IUPAC ambiguity code)"> +#CHROM POS ID REF ALT QUAL FILTER INFO +sample3 3 . T . . mask SUB=NDM;EXC=seq_end
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out1.vcf Wed Jan 11 13:14:29 2023 +0000 @@ -0,0 +1,7 @@ +##fileformat=VCFv4.2 +##reference=test-data/input.fa:sample1 +##source=faToVcf test-data/input.fa test-data/out1.vcf +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample2 sample3 +sample1 4 G4A G A . . AC=1;AN=2 GT 0 1 +sample1 7 A7G A G . . AC=1;AN=2 GT 0 1 +sample1 10 G10T G T . . AC=1;AN=1 GT . 1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out2.vcf Wed Jan 11 13:14:29 2023 +0000 @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##reference=test-data/input.fa:sample3 +##source=faToVcf test-data/input.fa out-test.vcf +#CHROM POS ID REF ALT QUAL FILTER INFO +sample1 3 R3R R * . . AC=0;AN=0 +sample1 5 A5G A G . . AC=1;AN=2 +sample1 8 G8A G A . . AC=1;AN=2 +sample1 11 T11G T G . . AC=1;AN=2