# HG changeset patch # User devteam # Date 1380121722 14400 # Node ID e942fd3a76a51ebd04974f84cfb36febad86f86e Uploaded tool tarball. diff -r 000000000000 -r e942fd3a76a5 fastx_collapser.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastx_collapser.xml Wed Sep 25 11:08:42 2013 -0400 @@ -0,0 +1,90 @@ + + sequences + + fastx_toolkit + + zcat -f '$input' | fastx_collapser -v -o '$output' +#if $input.ext == "fastqsanger": +-Q 33 +#end if + + + + + + + + + + + + +**What it does** + +This tool collapses identical sequences in a FASTA file into a single sequence. + +-------- + +**Example** + +Example Input File (Sequence "ATAT" appears multiple times):: + + >CSHL_2_FC0042AGLLOO_1_1_605_414 + TGCG + >CSHL_2_FC0042AGLLOO_1_1_537_759 + ATAT + >CSHL_2_FC0042AGLLOO_1_1_774_520 + TGGC + >CSHL_2_FC0042AGLLOO_1_1_742_502 + ATAT + >CSHL_2_FC0042AGLLOO_1_1_781_514 + TGAG + >CSHL_2_FC0042AGLLOO_1_1_757_487 + TTCA + >CSHL_2_FC0042AGLLOO_1_1_903_769 + ATAT + >CSHL_2_FC0042AGLLOO_1_1_724_499 + ATAT + +Example Output file:: + + >1-1 + TGCG + >2-4 + ATAT + >3-1 + TGGC + >4-1 + TGAG + >5-1 + TTCA + +.. class:: infomark + +Original Sequence Names / Lane descriptions (e.g. "CSHL_2_FC0042AGLLOO_1_1_742_502") are discarded. + +The output sequence name is composed of two numbers: the first is the sequence's number, the second is the multiplicity value. + +The following output:: + + >2-4 + ATAT + +means that the sequence "ATAT" is the second sequence in the file, and it appeared 4 times in the input FASTA file. + + +------ + +This tool is based on `FASTX-toolkit`__ by Assaf Gordon. + + .. __: http://hannonlab.cshl.edu/fastx_toolkit/ + + + diff -r 000000000000 -r e942fd3a76a5 test-data/fasta_collapser1.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_collapser1.fasta Wed Sep 25 11:08:42 2013 -0400 @@ -0,0 +1,84 @@ +>1 +TGTATTTACAATGACTAGAAA +>2 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>3 +AGTACAAGGACATGC +>4 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>5 +AGTACAAGGACATGC +>6 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>7 +AGTACAAGGACATGC +>8 +AGTACAAGGACATGC +>9 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>10 +AGTACAAGGACATGC +>11 +AGTACAAGGACATGC +>12 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>13 +CGATTGCCGAAGTCTACCA +>14 +AGTACAAGGACATGC +>15 +CCTTGTAGTGGATTCTGATGA +>16 +AGTACAAGGACATGC +>17 +AGTACAAGGACATGC +>18 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>19 +AGTACAAGGACATGC +>20 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>21 +AGTACAAGGACATGC +>22 +AGTACAAGGACATGC +>23 +CTGCTGCGATCGGTGTGC +>24 +AGTACAAGGACATGC +>25 +ACCATTCGAGCATAC +>26 +AGTACAAGGACATGC +>27 +TCAAATTCTAGATTTTTACGG +>28 +AGTACAAGGACATGC +>29 +TGATTTCCAGAGCCAAT +>30 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>31 +TTACCTCACGATATTGTAATA +>32 +ATGACTTCATCGTCCACCCTTTAGAACT +>33 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>34 +TTCAACGCCGCCGTGAAC +>35 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>36 +CTGCTGCGATCGGTGTGC +>37 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>38 +TTCAACGCCGCCGTGAAC +>39 +TTCAACGCCGCCGTGAAC +>40 +CTGCTGCGATCGGTGTGC +>41 +TTCAACGCCGCCGTGAAC +>42 +TTCAACGCCGCCGTGAAC diff -r 000000000000 -r e942fd3a76a5 test-data/fasta_collapser1.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_collapser1.out Wed Sep 25 11:08:42 2013 -0400 @@ -0,0 +1,24 @@ +>1-15 +AGTACAAGGACATGC +>2-11 +ATTGCTGCTCGGATGGTCCGGCTGTGCACAC +>3-5 +TTCAACGCCGCCGTGAAC +>4-3 +CTGCTGCGATCGGTGTGC +>5-1 +TCAAATTCTAGATTTTTACGG +>6-1 +ACCATTCGAGCATAC +>7-1 +TGATTTCCAGAGCCAAT +>8-1 +TTACCTCACGATATTGTAATA +>9-1 +TGTATTTACAATGACTAGAAA +>10-1 +CCTTGTAGTGGATTCTGATGA +>11-1 +CGATTGCCGAAGTCTACCA +>12-1 +ATGACTTCATCGTCCACCCTTTAGAACT \ No newline at end of file diff -r 000000000000 -r e942fd3a76a5 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Wed Sep 25 11:08:42 2013 -0400 @@ -0,0 +1,6 @@ + + + + + +