comparison fetch_fasta_from_NCBI.xml @ 4:c667d0ee39f5 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/fetch_fasta_from_ncbi commit ca3070e85c370b914ffa0562afe12b363e05aea4
author artbio
date Wed, 29 Nov 2017 17:38:52 -0500
parents 8be88084f89c
children 706fe8139955
comparison
equal deleted inserted replaced
3:8be88084f89c 4:c667d0ee39f5
1 <tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.2.1"> 1 <tool id="retrieve_fasta_from_NCBI" name="Retrieve FASTA from NCBI" version="2.3.0">
2 <description></description> 2 <description></description>
3 <command><![CDATA[ 3 <command><![CDATA[
4 python '$__tool_directory__'/fetch_fasta_from_NCBI.py 4 python '$__tool_directory__'/fetch_fasta_from_NCBI.py
5 -i "$queryString" 5 -i "$queryString"
6 -d $dbname 6 -d $dbname
7 -l '$logfile' 7 -l '$logfile'
8 $dry_run 8 -c
9 -o '$outfile' 9 -o '$outfile';
10 #if $dry_run == ""
11 number_UIDs=\$(tail -n 2 $logfile | perl -ne '/Found (\d+) UID/ && print \$1');
12 python '$__tool_directory__'/fetch_fasta_from_NCBI.py
13 -i "$queryString"
14 -d $dbname
15 -u
16 -l '$logfile'
17 -o 'uid_outfile';
18 UID_array=( \$(head uid_outfile) );
19 array_len=\${#UID_array[@]};
20 counter=0;
21 number_of_groups=\$((array_len / 200000));
22 modulo=\$((array_len % 200000));
23 if [ "\$modulo" -gt 0 ];then
24 number_of_groups=\$((number_of_groups + 1));
25 fi;
26 group_number=1;
27 echo "----- Number of groups of batches: \$number_of_groups -----" >> $logfile;
28 for ((i=0; i+200000<array_len;i+=200000)); do
29 echo "----- Group number: \$group_number -----" >> $logfile;
30 echo "\${UID_array[@]:\$i:99999}" > uid_list_1.txt;
31 echo "\${UID_array[@]:\$((i+100000)):99999}" > uid_list_2.txt;
32 python '$__tool_directory__'/fetch_fasta_from_NCBI.py
33 -d $dbname
34 -l '$logfile'
35 -o 'tmp1_outfile'
36 --UID_list uid_list_1.txt&
37 python '$__tool_directory__'/fetch_fasta_from_NCBI.py
38 -d $dbname
39 -l 'tmp1_logfile'
40 -o 'tmp2_outfile'
41 --UID_list uid_list_2.txt&
42 wait;
43 cat tmp1_outfile tmp2_outfile>> $outfile;
44 rm tmp1_outfile tmp2_outfile;
45 cat tmp1_logfile >> $logfile;
46 rm tmp1_logfile;
47 rm uid_list_1.txt uid_list_2.txt;
48 group_number=\$((group_number + 1));
49 counter=\$(( counter + 200000 ));
50 done;
51 echo "----- Group number: \$group_number -----" >> $logfile;
52 echo "----- Last group -----" >> $logfile;
53 if [ "\$counter" -lt "\$array_len" ]; then
54 echo "\${UID_array[@]:\$counter:\$((array_len - counter + 1))}" > uid_list.txt;
55 python '$__tool_directory__'/fetch_fasta_from_NCBI.py
56 -d $dbname
57 -l '$logfile'
58 -o 'tmp_outfile'
59 --UID_list uid_list.txt;
60 rm uid_list.txt;
61 cat tmp_outfile >> $outfile;
62 rm tmp_outfile;
63 fi;
64 #end if
10 ]]></command> 65 ]]></command>
11 66
12 <inputs> 67 <inputs>
13 <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple:'Drosophila melanogaster[Organism] AND Gcn5[Title]"> 68 <param name="queryString" type="text" size="5x80" area="True" value="txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" label="Query to NCBI in entrez format" help="exemple: Drosophila melanogaster[Organism] AND Gcn5[Title]">
14 <sanitizer> 69 <sanitizer>
15 <valid initial="string.printable"> 70 <valid initial="string.printable">
16 <remove value="&quot;"/> 71 <remove value="&quot;"/>
17 <remove value="\"/> 72 <remove value="\"/>
18 </valid> 73 </valid>
24 </param> 79 </param>
25 <param name="dbname" type="select" label="NCBI database"> 80 <param name="dbname" type="select" label="NCBI database">
26 <option value="nuccore">Nucleotide</option> 81 <option value="nuccore">Nucleotide</option>
27 <option value="protein">Protein</option> 82 <option value="protein">Protein</option>
28 </param> 83 </param>
29 <param name="dry_run" type="boolean" label="Dry run to get the number of sequences?" truevalue="--count" falsevalue="" checked="false"/> 84 <param name="dry_run" type="boolean" label="Get only the number of sequences" truevalue="--count" falsevalue="" checked="false"/>
30 </inputs> 85 </inputs>
31 <outputs> 86 <outputs>
32 <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" > 87 <data name="outfile" format="fasta" label="${tool.name} (${dbname.value_label}) with queryString '${queryString.value}'" >
33 <filter> dry_run == False</filter> 88 <filter> dry_run == False</filter>
34 </data> 89 </data>
35 <data format="txt" name="logfile" label="${tool.name}: log"/> 90 <data format="txt" name="logfile" label="${tool.name}: log"/>
36 </outputs> 91 </outputs>
37 <tests> 92 <tests>
38 <test> 93 <test>
39 <param name="queryString" value="9629650[gi]" /> 94 <param name="queryString" value="9629650[gi]" />
40 <param name="dbname" value="nuccore" /> 95 <param name="dbname" value="nuccore" />
41 <output name="outfilename" ftype="fasta" file="output.fa" /> 96 <output name="outfilename" ftype="fasta" file="output.fa" />
42 </test> 97 </test>
43 <test> 98 <test>
44 <param name="queryString" value="CU929326[Accession]" /> 99 <param name="queryString" value="CU929326[Accession]" />
45 <param name="dbname" value="nuccore" /> 100 <param name="dbname" value="nuccore" />
46 <param name="date_filter" value="1"/> 101 <param name="date_filter" value="1"/>
47 <param name="dry_run" value="True"/> 102 <param name="dry_run" value="True"/>
48 <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/> 103 <output name="logfile" ftype="txt" file="dry_run.log" compare="sim_size"/>
49 </test> 104 </test>
105 <test>
106 <param name="queryString" value="Drosophila[Organism] AND 2014[PDAT] AND virus" />
107 <output name="outfilename" ftype="fasta" >
108 <metadata name="sequences" value="13" />
109 </output>
110 </test>
50 </tests> 111 </tests>
51 <help> 112 <help>
52 **What it does** 113 **What it does**
53 114
54 This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database for a given entrez query. 115 This tool retrieves nucleotide/peptide sequences from the corresponding NCBI database (nuccore or protein) for a given entrez query.
55 116
56 The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" for use with metaVisitor. 117 The tool is preset with "txid10239[orgn] NOT txid131567[orgn] AND complete[all] NOT partial[title] NOT phage[title]" for use with metaVisitor.
57 118
58 See `Entrez help`_ for explanation of query formats 119 See `Entrez help`_ for explanation of query formats
59 120
60 Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields. 121 Be sure to use the appropriate NCBI query syntax. Always use [] to specify the search fields.
61 122
123 By checking the "Get only the number of sequences" checkbox, you can run your query without retrieving any sequences and obtain only the number of sequences it would fetch.
124
62 Note that the tool may fail if the connection to the NCBI database is interrupted (see the log dataset). 125 Note that the tool may fail if the connection to the NCBI database is interrupted (see the log dataset).
126
127 Retrieval progress is reported in the log dataset.
63 128
64 **Acknowledgments** 129 **Acknowledgments**
65 130
66 This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_. 131 This Galaxy tool has been adapted from the galaxy tool `get_fasta_from_taxon`_.
67 132