Mercurial > repos > yufei-luo > s_mart
comparison SMART/DiffExpAnal/fastq_groomer_parallel_unSQL.py @ 18:94ab73e8a190
Uploaded
| author | m-zytnicki |
|---|---|
| date | Mon, 29 Apr 2013 03:20:15 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:b0e8584489e6 | 18:94ab73e8a190 |
|---|---|
| 1 import sys, os, optparse,shutil, random | |
| 2 from commons.core.launcher.Launcher import Launcher | |
| 3 from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory | |
| 4 from commons.core.utils.FileUtils import FileUtils | |
| 5 | |
| 6 def _map(iLauncher, cmd, cmdStart, cmdFinish ): | |
| 7 lCmds = [] | |
| 8 lCmds.extend(cmd) | |
| 9 lCmdStart = [] | |
| 10 lCmdStart.extend(cmdStart) | |
| 11 lCmdFinish = [] | |
| 12 lCmdFinish.extend(cmdFinish) | |
| 13 return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)) | |
| 14 | |
| 15 def splitFastQ(fileName, nbOfSeqPerBatch): | |
| 16 nbOfLinesPerFile = nbOfSeqPerBatch * 4 | |
| 17 lOutput = [] | |
| 18 filePrefix, fileExt = os.path.splitext(os.path.basename(fileName)) | |
| 19 resDir = os.path.dirname(fileName) | |
| 20 with open(fileName) as inF: | |
| 21 fileNb = 1 | |
| 22 line = inF.readline() | |
| 23 if not line or nbOfLinesPerFile == 0: | |
| 24 outFileName = "%s/%s-%s%s" %(resDir, filePrefix, fileNb, fileExt) | |
| 25 lOutput.append(outFileName) | |
| 26 f = open(outFileName, "wb") | |
| 27 shutil.copyfileobj(open(fileName, "rb"), f) | |
| 28 f.close() | |
| 29 else: | |
| 30 while line: | |
| 31 outFileName = "%s/%s-%s%s" %(resDir, filePrefix, fileNb, fileExt) | |
| 32 lOutput.append(outFileName) | |
| 33 with open(outFileName, "w") as outF: | |
| 34 lineNb = 1 | |
| 35 while lineNb <= nbOfLinesPerFile and line: | |
| 36 outF.write(line) | |
| 37 line = inF.readline() | |
| 38 lineNb += 1 | |
| 39 fileNb += 1 | |
| 40 return lOutput | |
| 41 | |
| 42 def joinFastQ(dCutOut2Out): | |
| 43 for key in dCutOut2Out.keys(): | |
| 44 FileUtils.catFilesFromList(dCutOut2Out[key],key, False) | |
| 45 | |
| 46 def _createFastqGroomerCode(outGroomerNames, inputFileNames, input_type, output_type, force_quality_encoding, summarize_input): | |
| 47 cmd2Launch = [] | |
| 48 cmd2Launch.append("log = 0") | |
| 49 cmd2Launch.append("from galaxy_utils.sequence.fastq import fastqReader, fastqVerboseErrorReader, fastqAggregator, fastqWriter") | |
| 50 cmd2Launch.append("aggregator = fastqAggregator()") | |
| 51 cmd2Launch.append("out = fastqWriter( open( '%s', 'wb' ), format = '%s', force_quality_encoding = '%s')" % (outGroomerNames,output_type,force_quality_encoding)) | |
| 52 cmd2Launch.append("read_count = None") | |
| 53 if summarize_input: | |
| 54 cmd2Launch.append("reader = fastqVerboseErrorReader") | |
| 55 else: | |
| 56 cmd2Launch.append("reader = fastqReader") | |
| 57 cmd2Launch.append("for read_count, fastq_read in enumerate( reader( open( '%s' ), format = '%s', apply_galaxy_conventions = True ) ):" % (inputFileNames, input_type)) | |
| 58 if summarize_input: | |
| 59 cmd2Launch.append("\taggregator.consume_read( fastq_read )") | |
| 60 cmd2Launch.append("\tout.write( fastq_read )") | |
| 61 cmd2Launch.append("out.close()") | |
| 62 cmd2Launch.append("if read_count is not None:") | |
| 63 #cmd2Launch.append("\tprint 'Groomed %s %s reads into %s reads.' % ( read_count + 1, %s, %s )" % ('%i', '%s', '%s', input_type,output_type)) | |
| 64 cmd2Launch.append("\tif '%s' != '%s' and 'solexa' in [ '%s', '%s' ]:" % (input_type, output_type, input_type, output_type)) | |
| 65 cmd2Launch.append("\t\tprint 'Converted between Solexa and PHRED scores.'") | |
| 66 if summarize_input: | |
| 67 cmd2Launch.append("\tprint 'Based upon quality and sequence, the input data is valid for: %s' % ( ', '.join( aggregator.get_valid_formats() ) or 'None' )") | |
| 68 cmd2Launch.append("\tascii_range = aggregator.get_ascii_range()") | |
| 69 cmd2Launch.append("\tdecimal_range = aggregator.get_decimal_range()") | |
| 70 cmd2Launch.append("\tprint 'Input ASCII range: %s(%i) - %s(%i)' % ( repr( ascii_range[0] ), ord( ascii_range[0] ), repr( ascii_range[1] ), ord( ascii_range[1] ) )") | |
| 71 cmd2Launch.append("\tprint 'Input decimal range: %i - %i' % ( decimal_range[0], decimal_range[1] ) ") | |
| 72 cmd2Launch.append("else:") | |
| 73 cmd2Launch.append("\tprint 'No valid FASTQ reads were provided.'") | |
| 74 cmd2Launch.append("\tlog = 255") | |
| 75 return cmd2Launch | |
| 76 | |
| 77 def stop_err(msg): | |
| 78 sys.stderr.write("%s\n" % msg) | |
| 79 sys.exit() | |
| 80 | |
| 81 def main(): | |
| 82 | |
| 83 input_filename = sys.argv[1] #a txt file | |
| 84 input_type = sys.argv[2] | |
| 85 output_filename = sys.argv[3] #a txt file | |
| 86 output_type = sys.argv[4] | |
| 87 force_quality_encoding = sys.argv[5] | |
| 88 summarize_input = sys.argv[6] == 'summarize_input' | |
| 89 pairedEnd_input = sys.argv[7] | |
| 90 if pairedEnd_input == 'None': | |
| 91 pairedEnd_input = None | |
| 92 else: | |
| 93 output_pairedEndFileName = sys.argv[8] | |
| 94 | |
| 95 if force_quality_encoding == 'None': | |
| 96 force_quality_encoding = None | |
| 97 | |
| 98 #Parse the input txt file and read a list of fastq files | |
| 99 file = open(input_filename, "r") | |
| 100 lines = file.readlines() | |
| 101 inputFileNames = [] | |
| 102 outGroomerNames = [] | |
| 103 resDirName = os.path.dirname(output_filename) + "/" | |
| 104 #Write output txt file and define all output groomer file names | |
| 105 outFile = open(output_filename, "w") | |
| 106 for line in lines: | |
| 107 tab = line.split() | |
| 108 inputFileNames.append(tab[1]) | |
| 109 outGroomerName = resDirName + tab[0] + '_outGroomer_%s.fastq' % random.randrange(0, 10000) | |
| 110 outGroomerNames.append(outGroomerName) | |
| 111 outFile.write(tab[0] + '\t' + outGroomerName + '\n') | |
| 112 outFile.close() | |
| 113 file.close() | |
| 114 | |
| 115 if pairedEnd_input != None: | |
| 116 inPairedFile = open(pairedEnd_input, "r") | |
| 117 lines = inPairedFile.readlines() | |
| 118 inputPairedEndFileNames = [] | |
| 119 outGroomerPairedEndNames = [] | |
| 120 outPairedEndFile = open(output_pairedEndFileName, "w") | |
| 121 for line in lines: | |
| 122 tab = line.split() | |
| 123 inputPairedEndFileNames.append(tab[1]) | |
| 124 outGroomerPairedEndName = resDirName + tab[0] + '_outGroomer_pairedEnd_%s.fastq' % random.randrange(0, 10000) | |
| 125 outGroomerPairedEndNames.append(outGroomerPairedEndName) | |
| 126 outPairedEndFile.write(tab[0] + '\t' + outGroomerPairedEndName + '\n') | |
| 127 outPairedEndFile.close() | |
| 128 inPairedFile.close() | |
| 129 | |
| 130 acronym = "fastqGroomer" | |
| 131 jobdb = TableJobAdaptatorFactory.createJobInstance() | |
| 132 iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True) | |
| 133 lCmdsTuples = [] | |
| 134 dCutOut2Out = {} | |
| 135 lAllFile2remove = [] | |
| 136 # Write output file | |
| 137 for i in range(len(outGroomerNames)): | |
| 138 lCutInputFile = splitFastQ(inputFileNames[i], 20000) | |
| 139 lAllFile2remove.extend(lCutInputFile) | |
| 140 lCutOutput = [] | |
| 141 for cutInput in lCutInputFile: | |
| 142 cutOutput = "%s_out" % cutInput | |
| 143 lCutOutput.append(cutOutput) | |
| 144 lAllFile2remove.extend(lCutOutput) | |
| 145 cmd2Launch = _createFastqGroomerCode(cutOutput, cutInput, input_type, output_type, force_quality_encoding, summarize_input) | |
| 146 cmdStart = [] | |
| 147 cmdFinish = [] | |
| 148 lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) | |
| 149 dCutOut2Out[outGroomerNames[i]] = lCutOutput | |
| 150 if pairedEnd_input != None: | |
| 151 lCutInputFile = splitFastQ(inputPairedEndFileNames[i], 20000) | |
| 152 lAllFile2remove.extend(lCutInputFile) | |
| 153 lCutOutput = [] | |
| 154 for cutInput in lCutInputFile: | |
| 155 cutOutput = "%s_out" % cutInput | |
| 156 lCutOutput.append(cutOutput) | |
| 157 lAllFile2remove.extend(lCutOutput) | |
| 158 cmd2Launch = _createFastqGroomerCode(cutOutput, cutInput, input_type, output_type, force_quality_encoding, summarize_input) | |
| 159 cmdStart = [] | |
| 160 cmdFinish = [] | |
| 161 lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) | |
| 162 dCutOut2Out[outGroomerPairedEndNames[i]] = lCutOutput | |
| 163 iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, False) | |
| 164 | |
| 165 joinFastQ(dCutOut2Out) | |
| 166 FileUtils.removeFilesFromListIfExist(lAllFile2remove) | |
| 167 | |
| 168 if __name__ == "__main__": main() |
