Mercurial > repos > cafletezbrant > kmersvm
diff kmersvm/nullseq.xml @ 0:7fe1103032f7 draft
Uploaded
author | cafletezbrant |
---|---|
date | Mon, 20 Aug 2012 18:07:22 -0400 |
parents | |
children | fd740d515502 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmersvm/nullseq.xml Mon Aug 20 18:07:22 2012 -0400 @@ -0,0 +1,77 @@ +<tool id="kmersvm_nullseq" name="Generate Null Sequence"> + <description>using random sampling from genomic DNA</description> + <command interpreter="python">scripts/nullseq_generate.py -q + #if str($excluded) !="None": + -e $excluded + #end if + -x $fold -r $rseed -g $gc_err -t $rpt_err $input $dbkey ${indices_path.fields.path} + </command> + <inputs> + <param name="fold" type="integer" value="1" label="# of Fold-Increase" /> + <param name="gc_err" type="float" value="0.02" label="Allowable GC Error" /> + <param name="rpt_err" type="float" value="0.02" label="Allowable Repeat Error" /> + <param name="rseed" type="integer" value="1" label="Random Number Seed" /> + <param format="interval" name="input" type="data" label="BED File of Positive Regions" /> + <validator type="unspecified_build" /> + <validator type="dataset_metadata_in_file" filename="nullseq_indices.loc" metadata_name="dbkey" metadata_column="0" message="Sequences are currently unavailable for the specified build." /> + <param name="excluded" optional="true" format="interval" type="data" value="None" label="Excluded Regions (optional)" /> + <param name="indices_path" type="select" label="Available Datasets"> + <options from_file="nullseq_indices.loc"> + <column name="dbkey" index="0"/> + <column name="value" index="0"/> + <column name="name" index="1"/> + <column name="path" index="2"/> + <!--filter type="data_meta" ref="input" key="dbkey" column="0" /--> + </options> + </param> + </inputs> + <outputs> + <data format="interval" name="nullseq_output" from_work_dir="nullseq_output.bed" /> + </outputs> + <tests> + <test> + <param name="input" value="nullseq_test.bed" /> + <param name="fold" value="1" /> + <param name="gc_err" value="0.02" /> + <param name="rpt_err" value="0.02" /> + <param name="rseed" value="1" /> + <param name="indices_path" value="hg19" /> + <output name="output" file="nullseq_output.bed" /> + </test> + </tests> + <help> + +**What it does** + +Takes an input BED file and generates a set of sequences for use as negative data (null sequences) in Train SVM similar in length, GC content and repeat fraction. Uses random sampling for efficiency. + +**Parameters** + +Fold-Increase: Size of desired null sequence data set expressed as multiple of the size of the input data set. + +GC Error, Repeat Error: Acceptable difference between a positive sequence and its corresponding null sequence in terms of GC content, repeat content. + +Random Number Seed: Seed for random number generator. + +Excluded Regions: Submitted regions will be excluded from null sequence generation. + +---- + +**Example** + +Given a BED file containing:: + + chr1 10212203 10212303 + chr1 103584748 103584848 + chr1 105299130 105299230 + chr1 106367772 106367872 + +Tool will output BED file matched in length, GC content and repeat content:: + + chr1 3089935 3090035 + chr1 5031335 5031435 + chr1 5103742 5103842 + chr1 5650372 5650472 + + </help> +</tool>