view retrieve.py @ 2:4291c9d1ff07 draft

Uploaded 20180124
author fabio
date Wed, 24 Jan 2018 11:26:24 -0500
parents 00d6e82d74e9
children
line wrap: on
line source

#!/usr/bin/env python

# NCBI SRA Tools
# https://galaxyproject.org/tutorials/upload/

import os
import optparse
from subprocess import Popen, PIPE

db_key = "?";
sra_instant_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/";

def convertSRA(tmp_dir, accession_number, data_format):
    absolute_tmp_dir = os.path.abspath(tmp_dir);
    sra_file_path = os.path.join(absolute_tmp_dir, accession_number+".sra");
    if os.path.isdir(absolute_tmp_dir) and os.path.exists(sra_file_path):
        process = None;
        if data_format == ".fasta.gz":
            process = Popen(["fastq-dump", "--fasta", "--gzip", sra_file_path, "--outdir", absolute_tmp_dir], stdout=PIPE);
        elif data_format == ".fastq.gz":
            process = Popen(["fastq-dump", "--gzip", sra_file_path, "--outdir", absolute_tmp_dir], stdout=PIPE);
        elif data_format == ".fasta":
            process = Popen(["fastq-dump", "--fasta", sra_file_path, "--outdir", absolute_tmp_dir], stdout=PIPE);
        elif data_format == ".fastq":
            process = Popen(["fastq-dump", sra_file_path, "--outdir", absolute_tmp_dir], stdout=PIPE);
        else:
            process = None;
        if process is not None:
            (output, err) = process.communicate();
            if err:
                # kill the process
                # kill_process(process.pid);
                # remove any trace of the output file
                an_file_path = os.path.join(tmp_dir, accession_number+data_format);
                if os.path.exists(an_file_path):
                    os.unlink(an_file_path);
                # try to restart the process
                return downloadAccessionData(tmp_dir, accession_number, data_format);
            #exit_code = process.wait();
            return os.path.join(tmp_dir, accession_number+data_format);
    return "";

def downloadAccessionData(accession_number, accession_path, appdata_path, data_format, limit=10):
    split = accession_number[:6];
    srr_path = sra_instant_url+split+"/"+accession_number+"/"+accession_number+".sra";
    sra_file_path = os.path.join(appdata_path, accession_number+".sra");
    process = Popen(['wget', srr_path, "--output-document="+sra_file_path], stdout=PIPE);
    (output, err) = process.communicate();
    if err:
        # remove any trace of the output file
        if os.path.exists(an_file_path):
            os.unlink(an_file_path);
        # try to restart the process
        if limit > 0:
            return downloadAccessionData(accession_number, accession_path, appdata_path, data_format, limit-1);
        return -1;
    if os.path.exists(sra_file_path):
        converted_file_path = convertSRA(appdata_path, accession_number, data_format);
        if os.path.exists(converted_file_path):
            os.rename(converted_file_path, accession_path);
        os.unlink(sra_file_path);
    return 0;

def process_accessions( options, args ):
    # create appdata dir if it does not exist
    appdata_path = options.appdata;
    if not os.path.exists(appdata_path):
        os.makedirs(appdata_path);
    data_format = options.dataformat;
    '''
    # Collection test
    test_file_name = "Test Collection" + "_" + "SRRtest" + "_" + data_format[1:] + "_" + db_key;
    test_file_path = os.path.join(appdata_path, test_file_name);
    file = open(test_file_path, "w");
    file.write("Hello World");
    file.close();
    '''
    # read inputs
    comma_sep_file_paths = options.files;
    #print("files: "+str(comma_sep_file_paths)+" - "+str(type(comma_sep_file_paths)));
    # check if options.files contains at least one file path
    if comma_sep_file_paths is not None:
        # split file paths
        file_paths = comma_sep_file_paths.split(",");
        # split file names
        comma_sep_file_names = str(options.names);
        #print("names: "+str(comma_sep_file_names));
        file_names = comma_sep_file_names.split(",");
        # populate a dictionary with the files containing the sequences to query
        for idx, file_path in enumerate(file_paths):
            file_name = file_names[idx];
            #print(file_name + ": " + file_path);
            with open(file_path) as accessions:
                for line in accessions:
                    if line.strip() != "" and not line.startswith(">"):
                        accession_number = line.strip();
                        filename_with_collection_prefix = file_name + "_" + accession_number + "_" + data_format[1:] + "_" + db_key;
                        accession_path = os.path.join(appdata_path, filename_with_collection_prefix)
                        # download fastq filte related to accession_number
                        downloadAccessionData( accession_number, accession_path, appdata_path, data_format );
    return 0;

def __main__():
    # Parse the command line options
    usage = "Usage: retrieve.py --files comma_sep_file_paths --names comma_seq_file_names --format data_format --appdata folder_name";
    parser = optparse.OptionParser(usage = usage);
    parser.add_option("-f", "--files", type="string",
                    action="store", dest="files", help="comma separated files path");
    parser.add_option("-n", "--names", type="string",
                    action="store", dest="names", help="comma separated names associated to the files specified in --files");
    parser.add_option("-e", "--format", type="string",
                    action="store", dest="dataformat", help="data format");
    parser.add_option("-a", "--appdata", type="string",
                    action="store", dest="appdata", help="appdata folder name");
    (options, args) = parser.parse_args();
    return process_accessions( options, args );

if __name__ == "__main__": __main__()