changeset 0:854be3d51221 draft

Uploaded 20171204
author fabio
date Mon, 04 Dec 2017 16:05:45 -0500
parents
children ef234e9679ea
files ._.DS_Store ._retrieve.py ._retrieve.xml ._search.py ._search.xml .shed.yml retrieve.py retrieve.xml search.py search.xml
diffstat 10 files changed, 389 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file ._.DS_Store has changed
Binary file ._retrieve.py has changed
Binary file ._retrieve.xml has changed
Binary file ._search.py has changed
Binary file ._search.xml has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Mon Dec 04 16:05:45 2017 -0500
@@ -0,0 +1,21 @@
+name: srase
+owner: iuc
+categories:
+  - Web Services
+  - Data Source
+description: Sequence Read Archive Search Engine
+long_description: |
+  A fast querying tool to search on the Sequence Read Archive repository
+  using Bloom Filters.
+remote_repository_url: https://github.com/fabio-cumbo/sequence-read-archive-search-engine
+homepage_url: https://github.com/fabio-cumbo/sequence-read-archive-search-engine
+type: unrestricted
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  descriptor_template: "Wrapper for the Sequence Read Archive Search Engine application: {{ tool_name }}."
+suite:
+  name: "srase_suite"
+  description: "A suite of Galaxy tools designed to work with the query and extract data from the Sequence Read Archive repository."
+  long_description: |
+    A fast querying tool to search on the Sequence Read Archive repository
+    using Bloom Filters.
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/retrieve.py	Mon Dec 04 16:05:45 2017 -0500
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+# NCBI SRA Tools
+# https://galaxyproject.org/tutorials/upload/
+
+import os
+import optparse
+from subprocess import Popen, PIPE
+
# dbkey appended to discovered dataset names; "?" means "unknown genome build"
db_key = "?";
# base FTP URL of the NCBI sra-instant mirror, organised ByRun/sra/SRR
sra_instant_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/";
+
def convertSRA(tmp_dir, accession_number, data_format, limit=10):
    """Convert a previously downloaded .sra file with fastq-dump.

    Parameters:
        tmp_dir: working directory that contains <accession_number>.sra
        accession_number: SRA run accession (e.g. SRR000001)
        data_format: one of ".fasta", ".fasta.gz", ".fastq", ".fastq.gz"
        limit: maximum number of conversion retries on failure

    Returns the path of the converted file, or "" when the .sra file is
    missing, the format is unknown, or the conversion keeps failing.
    """
    absolute_tmp_dir = os.path.abspath(tmp_dir)
    sra_file_path = os.path.join(absolute_tmp_dir, accession_number + ".sra")
    if not (os.path.isdir(absolute_tmp_dir) and os.path.exists(sra_file_path)):
        return ""
    # extra fastq-dump switches for each supported output format
    format_flags = {
        ".fasta.gz": ["--fasta", "--gzip"],
        ".fastq.gz": ["--gzip"],
        ".fasta": ["--fasta"],
        ".fastq": [],
    }
    if data_format not in format_flags:
        return ""
    process = Popen(
        ["fastq-dump"] + format_flags[data_format] + [sra_file_path, "--outdir", absolute_tmp_dir],
        stdout=PIPE)
    (output, err) = process.communicate()
    if err:
        # conversion failed: remove any partial output file
        an_file_path = os.path.join(tmp_dir, accession_number + data_format)
        if os.path.exists(an_file_path):
            os.unlink(an_file_path)
        # bug fix: the original recursed into downloadAccessionData with the
        # wrong arguments (3 positionals for a 4-argument function); retry
        # the conversion itself instead, at most `limit` more times
        if limit > 0:
            return convertSRA(tmp_dir, accession_number, data_format, limit - 1)
        return ""
    return os.path.join(tmp_dir, accession_number + data_format)
+
def downloadAccessionData(accession_number, accession_path, appdata_path, data_format, limit=10):
    """Download one SRA run over FTP with wget and convert it.

    Parameters:
        accession_number: SRA run accession (e.g. SRR1234567)
        accession_path: final path the converted file is renamed to
        appdata_path: working directory for the .sra and converted files
        data_format: extension handed to convertSRA
        limit: maximum number of download retries on failure

    Returns 0 on success (or when nothing was downloaded to convert)
    and -1 when the download keeps failing.
    """
    # runs are sharded on the mirror by the first six accession characters
    split = accession_number[:6]
    srr_path = sra_instant_url + split + "/" + accession_number + "/" + accession_number + ".sra"
    sra_file_path = os.path.join(appdata_path, accession_number + ".sra")
    process = Popen(["wget", srr_path, "--output-document=" + sra_file_path], stdout=PIPE)
    (output, err) = process.communicate()
    if err:
        # bug fix: the original referenced the undefined name an_file_path
        # here (NameError); remove the partial download instead
        if os.path.exists(sra_file_path):
            os.unlink(sra_file_path)
        # try again, at most `limit` more times
        if limit > 0:
            return downloadAccessionData(accession_number, accession_path, appdata_path, data_format, limit - 1)
        return -1
    if os.path.exists(sra_file_path):
        converted_file_path = convertSRA(appdata_path, accession_number, data_format)
        if os.path.exists(converted_file_path):
            # move the converted file to its Galaxy-discoverable name
            os.rename(converted_file_path, accession_path)
        # the raw .sra archive is no longer needed
        os.unlink(sra_file_path)
    return 0
+
def process_accessions(options, args):
    """Fetch the data for every accession number listed in the input files.

    Each input file (options.files, comma separated, paired with
    options.names) holds one accession number per line; ">" header lines
    and blank lines are skipped.  Downloads land in options.appdata under
    the name <file_name>_<accession>_<format>_<db_key>.  Returns 0.
    """
    # make sure the working directory exists
    appdata_path = options.appdata
    if not os.path.exists(appdata_path):
        os.makedirs(appdata_path)
    data_format = options.dataformat
    comma_sep_file_paths = options.files
    # nothing to do without at least one input file
    if comma_sep_file_paths is None:
        return 0
    file_paths = comma_sep_file_paths.split(",")
    file_names = str(options.names).split(",")
    for idx, file_path in enumerate(file_paths):
        file_name = file_names[idx]
        with open(file_path) as accessions:
            for line in accessions:
                stripped = line.strip()
                if stripped == "" or line.startswith(">"):
                    continue
                # compose the Galaxy-discoverable file name for this run
                target_name = "_".join([file_name, stripped, data_format[1:], db_key])
                accession_path = os.path.join(appdata_path, target_name)
                # download the fastq file related to this accession number
                downloadAccessionData(stripped, accession_path, appdata_path, data_format)
    return 0
+
+def __main__():
+    # Parse the command line options
+    usage = "Usage: retrieve.py --files comma_sep_file_paths --names comma_seq_file_names --format data_format --appdata folder_name";
+    parser = optparse.OptionParser(usage = usage);
+    parser.add_option("-f", "--files", type="string",
+                    action="store", dest="files", help="comma separated files path");
+    parser.add_option("-n", "--names", type="string",
+                    action="store", dest="names", help="comma separated names associated to the files specified in --files");
+    parser.add_option("-e", "--format", type="string",
+                    action="store", dest="dataformat", help="data format");
+    parser.add_option("-a", "--appdata", type="string",
+                    action="store", dest="appdata", help="appdata folder name");
+    parser.add_option("-v", "--version", action="store_true", dest="version",
+                    default=False, help="display version and exit");
+    (options, args) = parser.parse_args();
+    if options.version:
+        print __version__;
+    else:
+        return process_accessions( options, args );
+
+if __name__ == "__main__": __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/retrieve.xml	Mon Dec 04 16:05:45 2017 -0500
@@ -0,0 +1,43 @@
<?xml version="1.0"?>
<!-- Galaxy tool wrapper for retrieve.py: downloads the SRA runs listed in
     the selected accession files and publishes them as a nested collection. -->
<tool name="Retrieve" id="srase_retrieve" version="1.0.0">
    <description>data from SRA</description>
    <requirements>
        <requirement type="package" version="2.7.10">python</requirement>
        <!-- sra-tools supplies the fastq-dump binary invoked by retrieve.py -->
        <requirement type="package" version="2.8.2">sra-tools</requirement>
    </requirements>
    <command detect_errors="exit_code">
<![CDATA[
    python '$__tool_directory__/retrieve.py'
    #set file_paths = ','.join( [ str( $f ) for $f in $files ] )
    --files '${file_paths}'
    #set file_names = ','.join( [ str( $f.name ) for $f in $files ] )
        --names '${file_names}'
    --format '${dataformat}'
    --appdata 'tmp'
    > ${stdouterr}
]]>
    </command>
    <inputs>
        <param format="txt" name="files" type="data" label="Select input files" multiple="true" optional="false" help="Select one or more txt files containing a list of accession numbers." />
        <param name="dataformat" type="select" label="Select a data format" help="Select a data format for the accession numbers related files that will be downloaded">
            <option value=".fastq">.fastq</option>
            <option value=".fastq.gz">.fastq.gz</option>
            <option value=".fasta">.fasta</option>
            <option value=".fasta.gz">.fasta.gz</option>
        </param>
    </inputs>
    <outputs>
        <!-- retrieve.py writes each download into 'tmp' as
             <collection>_<accession>_<format>_<dbkey>; the pattern below maps
             those four underscore-separated fields back onto a nested
             (list:list) collection. -->
        <collection name="list_output" type="list:list" label="${tool.name} Accessions: Output Collection">
            <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_(?P&lt;identifier_1&gt;[^_]+)_(?P&lt;ext&gt;[^_]+)_(?P&lt;dbkey&gt;[^_]+)" ext="auto" visible="False" directory="tmp" />
        </collection>
        <data format="txt" name="stdouterr" />
    </outputs>

    <help><![CDATA[
Authors: Fabio Cumbo, Robert S. Harris, Chen Sun, Paul Medvedev, and Anton Nekrutenko

This tool will retrieve fastq files associated to the accession numbers listed in the input files.

Help section
    ]]></help>
</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search.py	Mon Dec 04 16:05:45 2017 -0500
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+
+# https://github.com/ross/requests-futures
+# http://docs.python-requests.org/en/master/user/quickstart/#more-complicated-post-requests
+
+import os, uuid
+import optparse
+import requests
+from requests_futures.sessions import FuturesSession
+
# proxy to uv0 -- base address of the SRASE web service
service_url = "http://deputy.bx.psu.edu/";
# url to query page (the endpoint async_request POSTs payloads to)
query_url = service_url+"query.php";
# url to echo page: just return 'it works!'
#echo_url = service_url+"echo.php";

# NOTE(review): the triple-quoted block below is disabled code kept for
# reference (a synchronous variant of the request); as a bare string
# literal it has no runtime effect.
'''
# synchronous
def echo( options, args ):
    # create a session
    session = requests.Session()
    # make a sync get request
    resp = session.get(echo_url)
    # check for response status code
    resp_code = resp.status_code;
    if resp_code == requests.codes.ok:
        # get output file path
        output_file_path = options.output;
        # write response on the output file
        with open(output_file_path, 'w') as out:
            #out.write(resp.data);
            out.write(resp.content);
        return 0;
    else:
        return resp_code;
'''
+
+# asynchronous
def async_request(options, args, payload):
    """POST the payload to the SRASE query endpoint and save the reply.

    The search settings from the command line are merged into the payload
    before sending.  The HTTP status code and the raw response body are
    written to options.output as "<status>\n<body>".  Returns 0 when the
    server answers 200 OK, otherwise the status code itself.
    """
    # merge the command-line search settings into the request payload
    payload.update({
        "search_mode": str(options.search),
        "exact_algorithm": str(options.exact),
        "search_threshold": str(options.sthreshold),
    })
    # fire the POST asynchronously, then block until it completes
    session = FuturesSession()
    response = session.post(query_url, data=payload).result()
    status = response.status_code
    # persist status and body for the downstream Galaxy dataset
    with open(options.output, 'w') as out:
        out.write(str(status) + "\n" + str(response.content))
    return 0 if status == requests.codes.ok else status
+
def srase_query(options, args):
    """Build the query payload and submit it to the search service.

    Two input modes: (1) options.files/options.names name files whose
    whole contents become payload entries; (2) options.sequences holds
    raw sequences separated by Galaxy's "__cn__" newline escape.
    Returns async_request's result, or -1 when there is nothing to send.
    """
    multiple_files = {}
    comma_sep_file_paths = options.files
    if comma_sep_file_paths is not None:
        # mode 1: one payload entry per input file, keyed by its name
        file_names = str(options.names).split(",")
        for idx, file_path in enumerate(comma_sep_file_paths.split(",")):
            with open(file_path, 'r') as content_file:
                multiple_files[file_names[idx]] = content_file.read()
        if multiple_files:
            return async_request(options, args, multiple_files)
    else:
        # mode 2: sequences pasted in the tool's text area, one per row
        sequences_text = options.sequences
        if sequences_text is not None:
            sequences_text = str(sequences_text)
            if not sequences_text.strip():
                return -1
            for counter, seq in enumerate(sequences_text.split("__cn__")):
                multiple_files["sequence" + str(counter)] = seq
            return async_request(options, args, multiple_files)
    return -1
+
+def __main__():
+    # Parse the command line options
+    usage = "Usage: search.py --files comma_sep_file_paths --names comma_seq_file_names --sequences sequences_text --search search_mode --exact exact_alg --sthreshold threshold --output output_file_path";
+    parser = optparse.OptionParser(usage = usage);
+    parser.add_option("-f", "--files", type="string",
+                    action="store", dest="files", help="comma separated files path");
+    parser.add_option("-n", "--names", type="string",
+                    action="store", dest="names", help="comma separated names associated to the files specified in --files");
+    parser.add_option("-s", "--sequences", type="string",
+                    action="store", dest="sequences", help="optional filed, contains a list of sequences (one for each row)");
+    parser.add_option("-x", "--search", type="int", default=0,
+                    action="store", dest="search", help="search mode");
+    parser.add_option("-e", "--exact", type="int", default=0,
+                    action="store", dest="exact", help="exact algorithm (required if search is 1 only)");
+    parser.add_option("-t", "--sthreshold", type="string",
+                    action="store", dest="sthreshold", help="threshold applied to the search algrithm");
+    parser.add_option("-o", "--output", type="string",
+                    action="store", dest="output", help="output file path");
+    parser.add_option("-v", "--version", action="store_true", dest="version",
+                    default=False, help="display version and exit");
+    (options, args) = parser.parse_args();
+    if options.version:
+        print __version__;
+    else:
+        srase_query( options, args );
+
+if __name__ == "__main__": __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search.xml	Mon Dec 04 16:05:45 2017 -0500
@@ -0,0 +1,69 @@
<?xml version="1.0"?>
<!-- Galaxy tool wrapper for search.py: submits sequences to the SRASE web
     service and saves the server's reply as a text dataset. -->
<tool name="Search" id="srase_search" version="1.0.0">
    <description>your sequences in the big SRA data lake</description>
    <requirements>
        <requirement type="package" version="2.7.10">python</requirement>
        <requirement type="package" version="2.18.4">requests</requirement>
        <requirement type="package" version="0.9.7">requests-futures</requirement>
    </requirements>
    <command detect_errors="exit_code">
<![CDATA[
    python '$__tool_directory__/search.py'
    
    #if $search_condition.sequences:
        --sequences '${search_condition.sequences}'
    #end if
    --search ${search_condition.search}
    
    ## bug fix: select-param values are strings, so the original identity
    ## tests "is 0" / "is 1" never matched and --files/--names were never
    ## emitted; compare string values with == / != instead
    #if str($search_condition.search) == '0':
        #set file_paths = ','.join( [ str( $f ) for $f in $search_condition.txtfiles ] )
        #if $file_paths != 'None':
            --files '${file_paths}'
            #set file_names = ','.join( [ str( $f.name ) for $f in $search_condition.txtfiles ] )
            --names '${file_names}'
        #end if
    #elif str($search_condition.search) == '1':
        #set file_paths = ','.join( [ str( $f ) for $f in $search_condition.fastafiles ] )
        #if $file_paths != 'None':
            --files '${file_paths}'
            #set file_names = ','.join( [ str( $f.name ) for $f in $search_condition.fastafiles ] )
            --names '${file_names}'
        #end if
        --exact ${search_condition.exact}
    #end if

    --sthreshold '${sthreshold}'
    --output '${output}'
]]>
    </command>
    <inputs>
        <conditional name="search_condition">
            <param name="search" type="select" label="Search mode" help="Select a search mode between normal (slower but returns an optimal solution) and fast (faster but it is heuristic)">
                <option value="0" selected="true">Normal</option>
                <option value="1">Fast</option>
            </param>
            <when value="0">
                <param format="txt" name="txtfiles" type="data" label="Select sequences" multiple="true" optional="true" help="Select one or more txt files containing a sequence. A single file can contain more sequences, one for each row. Every file will represent a query to the Sequence Read Archive Search Engine that will return a list of accession numbers in which your sequences occur on." />
                <param name="sequences" type="text" area="True" size="5x25" label="Manually insert sequence" optional="true" help="Optionally you can put a list of sequences (one for each row) in this text field." />
            </when>
            <when value="1">
                <param format="fasta" name="fastafiles" type="data" label="Select fasta files" multiple="true" optional="false" help="Select one or more fasta files containing sequences. These files represent queries to the Sequence Read Archive Search Engine that will return a list of accession numbers in which your sequences occur on." />
                <param name="exact" type="boolean" truevalue="1" falsevalue="0" checked="False" label="Use exact algorithm" help="" />
            </when>
        </conditional>
        <param name="sthreshold" size="3" type="float" value="0.5" min="0.0" max="1.0" label="Threshold applied to the search algorithm" />
    </inputs>
    <outputs>
        <data name="output" format="txt" label="${tool.name} on ${on_string}: SRA-SE Search Result" />
    </outputs>

    <help><![CDATA[
Authors: Fabio Cumbo, Robert S. Harris, Chen Sun, Paul Medvedev, and Anton Nekrutenko

Help section
    ]]></help>

    <citations>
        <citation type="doi">10.1101/090464</citation>
    </citations>
</tool>