Mercurial > repos > daumsoft > star_icgc_alignment2
changeset 0:8922a3be8490 draft
Uploaded
author | daumsoft |
---|---|
date | Mon, 04 Jun 2018 02:38:49 -0400 |
parents | |
children | b566013f81e8 |
files | star_alignment/.shed.yml star_alignment/ICGC_pipeline.sh star_alignment/ICGC_pipeline_RUN.sh star_alignment/STAR star_alignment/icgc_star2_wrapper.xml star_alignment/star_align.py |
diffstat | 6 files changed, 633 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/star_alignment/.shed.yml Mon Jun 04 02:38:49 2018 -0400 @@ -0,0 +1,5 @@ +categories: [Transcriptomics] +description: daumsoft +name: tar +owner: daumsoft +remote_repository_url: https://toolshed.g2.bx.psu.edu/view/daumsoft
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/star_alignment/ICGC_pipeline.sh Mon Jun 04 02:38:49 2018 -0400 @@ -0,0 +1,51 @@ +#!/bin/bash + +if [ "$#" -ne 1 ]; then + echo "[usage:] ICGC_pipeline.sh fastq_in_tar.gz" + exit 1; +fi + +#FASTQ_1=$1 +#FASTQ_2=$2 +#SAMPLE_ID=$3 +#TAR_FILE=$SAMPLE_ID".tar" +TAR_FILE=$1 + +export PATH=/storage/data/program/GDC_TGCA-Harmonized/RNA-Seq/bin/ICGC-STAR_ALIGNMENT_PIPELINE:$PATH + +#tar cvfh $TAR_FILE $FASTQ_2 $FASTQ_1 + +STAD_ALIGN=~/package/DAUMSOFT/RNA-seq/ICGC_STAR_ALIGNMENT_PIPELINE/star_align.py +STAR_INDEX_PATH=~/refs/hg38/gdc/Index_Files/GDC.h38.d1.vd1_STAR2_Index_Files/star_genome_d1_vd1_gtfv22 +WORK_DIR=./wrk +OUTPUT_BAM=./out +REFERENCE=~/refs/hg38/gdc/GRCh38.d1.vd1_Reference_Sequence/GRCh38.d1.vd1.fa +RUN_THREAD_NUM=8 +rm -rf $WORK_DIR +rm -rf $OUTPUT_BAM +mkdir $WORK_DIR +mkdir $OUTPUT_BAM + +python $STAD_ALIGN \ +--genomeDir $STAR_INDEX_PATH \ +--tarFileIn $TAR_FILE \ +--workDir $WORK_DIR \ +--out $OUTPUT_BAM \ +--genomeFastaFiles $REFERENCE \ +--runThreadN $RUN_THREAD_NUM \ +--outFilterMultimapScoreRange 1 \ +--outFilterMultimapNmax 20 \ +--outFilterMismatchNmax 10 \ +--alignIntronMax 500000 \ +--alignMatesGapMax 1000000 \ +--sjdbScore 2 \ +--limitBAMsortRAM 0 \ +--alignSJDBoverhangMin 1 \ +--genomeLoad NoSharedMemory \ +--outFilterMatchNminOverLread 0.33 \ +--outFilterScoreMinOverLread 0.33 \ +--twopass1readsN -1 \ +--sjdbOverhang 100 \ +--outSAMstrandField intronMotif \ +--outSAMunmapped Within +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/star_alignment/ICGC_pipeline_RUN.sh Mon Jun 04 02:38:49 2018 -0400 @@ -0,0 +1,14 @@ +#!/bin/bash + +if [ "$#" -ne 1 ]; then + echo "[usage:] ICGC_pipeline_RUN.sh tar.gz(fastq in file)" + exit 1; +fi + +INPUT_TAR_GZ=$1 + +ln -s $INPUT_TAR_GZ ${INPUT_TAR_GZ}.tar.gz +INPUT_TAR_GZ=${INPUT_TAR_GZ}.tar.gz + +$GALAXY_HOME/package/DAUMSOFT/RNA-seq/ICGC_STAR_ALIGNMENT_PIPELINE/ICGC_pipeline.sh $INPUT_TAR_GZ > log.out 2>&1 +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/star_alignment/icgc_star2_wrapper.xml Mon Jun 04 02:38:49 2018 -0400 @@ -0,0 +1,36 @@ +<tool id="daumsoft_wts_star" name="ICGC_STAR_PIPELINE" version="2.4.2A"> + <description>STAR Alignment/ICGC Method</description> + <stdio> + <regex match="Exception|Error" source="both" level="fatal" description="Tool execution failed"/> + </stdio> + <version_command></version_command> + <command> + + \$GALAXY_HOME/package/DAUMSOFT/RNA-seq/ICGC_STAR_ALIGNMENT_PIPELINE/ICGC_pipeline_RUN.sh + ${tar_gz} + + </command> + <inputs> + <param name="tar_gz" format="gz" type="data" label="RNA-Seq FASTQ file In TAR.GZ(GRCh38)/hg38" help="" /> + </inputs> + <outputs> + <data format="bam" name="alignment" label="${tool.name} on ${on_string}: Alignments BAM" from_work_dir="out/Aligned.sortedByCoord.out.bam"/> + </outputs> + <tests><test><output name="alignment"/></test></tests> + <help> + +For more detailed manual, please visit The GDC mRNA quantification analysis pipeline website: + +https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline + +The mRNA Analysis pipeline begins with the Alignment Workflow, which is performed using a two-pass method with STAR. +STAR aligns each read group separately and then merges the resulting alignments into one. +Following the methods used by the International Cancer Genome Consortium ICGC, +the two-pass method includes a splice junction detection step, which is used to generate the final alignment. + +This workflow outputs a BAM file, which contains both aligned and unaligned reads. +Quality assessment is performed pre-alignment with FASTQC and post-alignment with RNA-SeQC and Picard Tools. + + </help> + <citations></citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/star_alignment/star_align.py Mon Jun 04 02:38:49 2018 -0400 @@ -0,0 +1,527 @@ +#!/usr/bin/env python + +import os +import sys +import re +import string +import tempfile +import subprocess +import argparse +import shutil +import lxml.etree as etree +import fnmatch + +def walk_dir(base, pattern): + files = [] + for root, dirnames, filenames in os.walk(base): + for fname in fnmatch.filter(filenames, pattern): + files.append(os.path.join(root, fname)) + return files + +def scan_workdir(base): + + ### scan for paired-end files + ############################# + + ### unzipped fastq input + fastq_files = walk_dir(base, "*_read[12]_*fastq") + if len(fastq_files): + o = {} + for i in sorted(fastq_files): + basename = re.sub(r'_read[12]', '', i) + try: + o[basename].append(i) + except KeyError: + o[basename] = [i] + if not all( (len(i) == 2 for i in o.values())): + raise Exception("Missing Pair") + return ( 'cat', list( (os.path.basename(i), o[i][0], o[i][1]) for i in o.keys()), 'PE') + + ### unzipped fastq input + fastq_files = walk_dir(base, "*_R[12]_001.fastq") + if len(fastq_files): + o = {} + for i in fastq_files: + basename = re.sub(r'_R[12]_001.fastq$', '', i) + o[basename] = o.get(basename, 0) + 1 + if not all( (i == 2 for i in o.values())): + raise Exception("Missing Pair") + return ( 'cat', list( (os.path.basename(i), "%s_R1_001.fastq" % i,"%s_R2_001.fastq" % i) for i in o.keys()), 'PE') + + ### unzipped fastq input + fastq_files = walk_dir(base, "*_[12].fastq") + if len(fastq_files): + o = {} + for i in fastq_files: + basename = re.sub(r'_[12].fastq$', '', i) + o[basename] = o.get(basename, 0) + 1 + if not all( (i == 2 for i in o.values())): + raise Exception("Missing Pair") + return ( 'cat', list( (os.path.basename(i), "%s_1.fastq" % i,"%s_2.fastq" % i) for i in o.keys()), 'PE') + + ### unzipped fastq input + fastq_files = walk_dir(base, "*[.][12].fastq") + if len(fastq_files): + o = {} + for i in fastq_files: + basename = re.sub(r'[.][12].fastq$', '', i) + o[basename] = o.get(basename, 0) + 1 + if not all( (i == 2 for i in o.values())): + raise Exception("Missing Pair") + return ( 'cat', list( (os.path.basename(i), "%s.1.fastq" % i,"%s.2.fastq" % i) for i in o.keys()), 'PE') + + ### unzipped fastq input + fastq_files = walk_dir(base, "*.fastq[12]") + if len(fastq_files): + o = {} + for i in fastq_files: + basename = re.sub(r'.fastq[12]$', '', i) + o[basename] = o.get(basename, 0) + 1 + if not all( (i == 2 for i in o.values())): + raise Exception("Missing Pair") + return ( 'cat', list( (os.path.basename(i), "%s.fastq1" % i,"%s.fastq2" % i) for i in o.keys()), 'PE') + + ### unzipped txt input + fastq_files = walk_dir(base, "*_[12]_sequence.txt") + if len(fastq_files): + o = {} + for i in fastq_files: + basename = re.sub(r'_[12]_sequence.txt$', '', i) + o[basename] = o.get(basename, 0) + 1 + if not all( (i == 2 for i in o.values())): + raise Exception("Missing Pair") + return ( 'cat', list( (os.path.basename(i), "%s_1_sequence.txt" % i,"%s_2_sequence.txt" % i) for i in o.keys()), 'PE') + + ### gzipped input + fastq_gz_files = walk_dir(base, "*_[12].fastq.gz") + if len(fastq_gz_files): + o = {} + for i in fastq_gz_files: + basename = re.sub(r'_[12].fastq.gz$', '', i) + o[basename] = o.get(basename, 0) + 1 + if not all( (i == 2 for i in o.values())): + raise Exception("Missing Pair") + return ( 'zcat', list( (os.path.basename(i), "%s_1.fastq.gz" % i,"%s_2.fastq.gz" % i) for i in o.keys()), 'PE') + + ### bzipped input + fastq_bz_files = walk_dir(base, "*_[12].fastq.bz") + if len(fastq_gz_files): + o = {} + for i in fastq_gz_files: + basename = re.sub(r'_[12].fastq.bz$', '', i) + o[basename] = o.get(basename, 0) + 1 + if not all( (i == 2 for i in o.values())): + raise Exception("Missing Pair") + return ( 'bzcat', list( (os.path.basename(i), "%s_1.fastq.bz" % i,"%s_2.fastq.bz" % i) for i in o.keys()), 'PE') + + ### scan for single-end files + ############################# + + ### unzipped input + fastq_files = walk_dir(base, "*.fastq") + if len(fastq_files): + return ( 'cat', list( (os.path.basename(re.sub(r'.fastq$', '', i)), i) for i in fastq_files), 'SE') + + ### unzipped input + fastq_files = walk_dir(base, "*.fq") + if len(fastq_files): + return ( 'cat', list( (os.path.basename(re.sub(r'.fq$', '', i)), i) for i in fastq_files), 'SE') + + ### gzipped input + fastq_files = walk_dir(base, "*.fastq.gz") + if len(fastq_files): + return ( 'zcat', list( (os.path.basename(re.sub(r'.fastq.gz$', '', i)), i) for i in fastq_files), 'SE') + + ### bzipped input + fastq_files = walk_dir(base, "*.fastq.bz") + if len(fastq_files): + return ( 'bzcat', list( (os.path.basename(re.sub(r'.fastq.bz$', '', i)), i) for i in fastq_files), 'SE') + + raise Exception("Unable to determine input type") + + +def spreadsheet2dict(spreadFile): + """ + Takes the filename of the spreadsheet, loads the data and organizes + it into a dictionary""" + + spreadDict = {} + key2field = {} + for l, line in enumerate(open(spreadFile)): + sl = line.strip().split('\t') + if l == 0: + for k, key in enumerate(sl): + key2field[key] = k + else: + spreadDict[sl[key2field['analysis_id']]] = sl + + return (spreadDict, key2field) + + +def spreadsheet2RGdict(spreadFile, analysisID): + """Compiles a read group dictionary from the information + in the spreadFile for the given analysisID.""" + + sD, k2f = spreadsheet2dict(spreadFile) + + try: + rec = sD[analysisID] + except KeyError: + raise Exception('Information for analysis ID %s could not be found in %s' % (analysisID, spreadFile)) + + ### build dictionary + RG_dict = { 'ID' : '%s:%s' % (rec[k2f['center_name']], analysisID), + 'CN' : rec[k2f['center_name']], + 'LB' : 'RNA-Seq:%s:%s' % (rec[k2f['center_name']], rec[k2f['lib_id']]), + 'PL' : rec[k2f['platform']], + 'PM' : rec[k2f['platform_model']], + 'SM' : rec[k2f['specimen_id']], + 'SI' : rec[k2f['submitted_sample_id']]} + + files = [] + if 'fastq_files' in k2f: + if not rec[k2f['fastq_files']].strip(' ') in ['N/A', 'NA', 'no', '']: + files = rec[k2f['fastq_files']].strip(' ').split(' ') + + return (RG_dict, files) + + +def xml2RGdict(xmlfile): + + ### read xml in + root = etree.parse(xmlfile) + rtree = root.find('Result') + + ### analysis_id + analysis_id = rtree.find('analysis_id').text + center = rtree.find('center_name').text + try: + date_string = rtree.find('analysis_xml/ANALYSIS_SET/ANALYSIS').attrib['analysis_date'] + except KeyError: + date_string = rtree.find('run_xml/RUN_SET/RUN').attrib['run_date'] + sample_id = rtree.find('sample_id').text + submitter_id = rtree.find('legacy_sample_id').text + library_id = rtree.find('experiment_xml/EXPERIMENT_SET/EXPERIMENT').attrib['alias'] + platform = rtree.find('experiment_xml/EXPERIMENT_SET/EXPERIMENT/PLATFORM').getchildren()[0].tag + instrument = rtree.find('experiment_xml/EXPERIMENT_SET/EXPERIMENT/PLATFORM/*/INSTRUMENT_MODEL').text + + ### build dictionary + RG_dict = { 'ID' : '%s:%s' % (center, analysis_id), + 'CN' : center, + 'DT' : date_string, + 'LB' : 'RNA-Seq:%s:%s' % (center, library_id), + 'PL' : platform, + 'PM' : instrument, + 'SM' : sample_id, + 'SI' : submitter_id} + + return RG_dict + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="ICGC RNA-Seq alignment wrapper for STAR alignments.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, usage='%(prog)s [options]', add_help=False) + required = parser.add_argument_group("Required input parameters") + required.add_argument("--genomeDir", default=None, help="Directory containing the reference genome index", required=True) + required.add_argument("--tarFileIn", default=None, help="Input file containing the sequence information", required=True) + optional = parser.add_argument_group("optional input parameters") + optional.add_argument("--out", default="out.bam", help="Name of the output BAM file") + optional.add_argument("--workDir", default="./", help="Work directory") + optional.add_argument("--metaDataTab", default=None, help="File containing metadata for the alignment header") + optional.add_argument("--analysisID", default=None, help="Analysis ID to be considered in the metadata file") + optional.add_argument("--keepJunctions", default=False, action='store_true', help="keeps the junction file as {--out}.junctions") + optional.add_argument("--useTMP", default=None, help="environment variable that is used as prefix for temprary data") + optional.add_argument("-h", "--help", action='store_true', help="show this help message and exit") + star = parser.add_argument_group("STAR input parameters") + star.add_argument("--runThreadN", type=int, default=4, help="Number of threads") + star.add_argument("--outFilterMultimapScoreRange", type=int, default=1, help="outFilterMultimapScoreRange") + star.add_argument("--outFilterMultimapNmax", type=int, default=20, help="outFilterMultimapNmax") + star.add_argument("--outFilterMismatchNmax", type=int, default=10, help="outFilterMismatchNmax") + star.add_argument("--alignIntronMax", type=int, default=500000, help="alignIntronMax") + star.add_argument("--alignMatesGapMax", type=int, default=1000000, help="alignMatesGapMax") + star.add_argument("--sjdbScore", type=int, default=2, help="sjdbScore") + star.add_argument("--limitBAMsortRAM", type=int, default=0, help="limitBAMsortRAM") + star.add_argument("--alignSJDBoverhangMin", type=int, default=1, help="alignSJDBoverhangMin") + star.add_argument("--genomeLoad", default="NoSharedMemory", help="genomeLoad") + star.add_argument("--genomeFastaFiles", default=None, help="genome sequence in fasta format to rebuild index") + star.add_argument("--outFilterMatchNminOverLread", type=float, default=0.33, help="outFilterMatchNminOverLread") + star.add_argument("--outFilterScoreMinOverLread", type=float, default=0.33, help="outFilterScoreMinOverLread") + star.add_argument("--twopass1readsN", type=int, default=-1, help="twopass1readsN (-1 means all reads are used for remapping)") + star.add_argument("--sjdbOverhang", type=int, default=100, help="sjdbOverhang (only necessary for two-pass mode)") + star.add_argument("--outSAMstrandField", default="intronMotif", help="outSAMstrandField") + star.add_argument("--outSAMattributes", default=["NH", "HI", "NM", "MD", "AS", "XS"], help="outSAMattributes") + star.add_argument("--outSAMunmapped", default="Within", help="outSAMunmapped") + star.add_argument("--outSAMtype", default=["BAM", "SortedByCoordinate"], help="outSAMtype") + star.add_argument("--outSAMheaderHD", default=["@HD", "VN:1.4"], help="outSAMheaderHD") + star.add_argument("--outSAMattrRGline", default=None, help="RG attribute line submitted to outSAMattrRGline") + star.add_argument("--outSAMattrRGfile", default=None, help="File containing the RG attribute line submitted to outSAMattrRGline") + star.add_argument("--outSAMattrRGxml", default=None, help="XML-File in TCGA format to compile RG attribute line") + + ### check completeness of command line inputs + if len(sys.argv) < 2: + parser.print_help() + sys.exit(0) + + args = parser.parse_args() + + ### some sanity checks on command line parameters + if args.metaDataTab is not None: + if not os.path.exists(args.metaDataTab): + raise Exception("File provided via --metaDataTab does not exist\nFile: %s" % args.metaDataTab) + if args.analysisID is None: + raise Exception("When providing information in a metadata file, a value for --analysisID is required") + if args.outSAMattrRGxml is not None and not os.path.exists(args.outSAMattrRGxml): + raise Exception("File provided via --outSAMattrRGxml does not exist\nFile: %s" % args.outSAMattrRGxml) + if args.outSAMattrRGfile is not None and not os.path.exists(args.outSAMattrRGfile): + raise Exception("File provided via --outSAMattrRGfile does not exist\nFile: %s" % args.outSAMattrRGfile) + + ### handling of input file (unpacking, etc. ) + if args.useTMP is not None: + workdir = tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_inputdir_") + else: + workdir = tempfile.mkdtemp(dir=args.workDir, prefix="star_inputdir_") + if args.tarFileIn.endswith(".gz"): + tarcmd = "tar xvzf %s -C %s" % (args.tarFileIn, workdir) + elif args.tarFileIn.endswith(".bz"): + tarcmd = "tar xvjf %s -C %s" % (args.tarFileIn, workdir) + elif args.tarFileIn.endswith(".tar"): + tarcmd = "tar xvf %s -C %s" % (args.tarFileIn, workdir) + elif args.tarFileIn.endswith(".sra"): + tarcmd = "fastq-dump --gzip --split-3 --outdir %s %s" % (workdir, args.tarFileIn) + else: + raise Exception("Unknown input file extension for file %s" % (args.tarFileIn)) + subprocess.check_call(tarcmd, shell=True) + + ### collect fastq information from extraction dir + align_sets = scan_workdir(os.path.abspath(workdir)) + + ### process read group information + files = [] + if args.metaDataTab is not None: + (RG_dict, files_tmp) = spreadsheet2RGdict(args.metaDataTab, args.analysisID) + files.extend(files_tmp) + elif args.outSAMattrRGxml is not None: + RG_dict = xml2RGdict(args.outSAMattrRGxml) + elif args.outSAMattrRGline is not None: + RG_dict = dict([(x.split(':', 1)[0], x.split(':', 1)[1]) for x in args.outSAMattrRGline.split()]) + elif args.outSAMattrRGfile is not None: + _fh = open(args.outSAMattrRGfile, 'r') + RG_dict = dict([(x.split(':', 1)[0], x.split(':', 1)[1]) for x in _fh.next().strip().split()]) + _fh.close() + else: + RG_dict = {'ID' : '', 'SM' : ''} + + ### post-process RG-dict to comply with STAR conventions + for key in RG_dict: + sl = RG_dict[key].split(' ') + if len(sl) > 1: + RG_dict[key] = '"%s"' % RG_dict[key] + + ### use list of fastq input files for whitelisting + if len(files) > 0: + align_sets = (align_sets[0], [x for x in align_sets[1] if (re.sub('(_[12]){,1}.fastq(.(gz|bz2|bz))*', '', os.path.basename(x[1])) in files)], align_sets[2]) + if len(align_sets[1]) == 0: + print >> sys.stderr, 'All input files have been filtered out - no input remaining. Terminating.' + sys.exit() + + ### use filename stub as read group label + RG_dict['RG'] = [] + for fn in [x[1] for x in align_sets[1]]: + fn_stub = re.sub('(_[12]){,1}.fastq(.(gz|bz2|bz))*', '', os.path.basename(fn)) + fn_stub = re.sub('(_[12]){,1}_sequence.txt(.(gz|bz2|bz))*', '', fn_stub) + fn_stub = re.sub('_read[12]', '', fn_stub) + fn_stub = re.sub('_R[12]_001$', '', fn_stub) + RG_dict['RG'].append(fn_stub) + + ### prepare template string + if align_sets[2] == 'PE': + read_str = '${fastq_left} ${fastq_right}' + else: + read_str = '${fastq_left}' + + ### simulate two pass alignment until STAR fully implements this + if args.twopass1readsN != 0: + + ### run first round of alignments and only record junctions + align_template_str_1st = """STAR \ +--genomeDir ${genomeDir} --readFilesIn %s \ +--runThreadN ${runThreadN} \ +--outFilterMultimapScoreRange ${outFilterMultimapScoreRange} \ +--outFilterMultimapNmax ${outFilterMultimapNmax} \ +--outFilterMismatchNmax ${outFilterMismatchNmax} \ +--alignIntronMax ${alignIntronMax} \ +--alignMatesGapMax ${alignMatesGapMax} \ +--sjdbScore ${sjdbScore} \ +--alignSJDBoverhangMin ${alignSJDBoverhangMin} \ +--genomeLoad ${genomeLoad} \ +--readFilesCommand ${readFilesCommand} \ +--outFilterMatchNminOverLread ${outFilterMatchNminOverLread} \ +--outFilterScoreMinOverLread ${outFilterScoreMinOverLread} \ +--sjdbOverhang ${sjdbOverhang} \ +--outSAMstrandField ${outSAMstrandField} \ +--outSAMtype None \ +--outSAMmode None""" % read_str + + if args.twopass1readsN > 0: + align_template_str_1st += " --readMapNumber %i" % args.twopass1readsN + + cmd = string.Template(align_template_str_1st).safe_substitute({ + 'genomeDir' : os.path.abspath(args.genomeDir), + 'runThreadN' : args.runThreadN, + 'outFilterMultimapScoreRange' : args.outFilterMultimapScoreRange, + 'outFilterMultimapNmax' : args.outFilterMultimapNmax, + 'outFilterMismatchNmax' : args.outFilterMismatchNmax, + 'fastq_left' : ','.join([os.path.join(x[0], x[1]) for x in align_sets[1]]), + 'alignIntronMax' : args.alignIntronMax, + 'alignMatesGapMax': args.alignMatesGapMax, + 'sjdbScore': args.sjdbScore, + 'alignSJDBoverhangMin' : args.alignSJDBoverhangMin, + 'genomeLoad' : args.genomeLoad, + 'readFilesCommand' : align_sets[0], + 'outFilterMatchNminOverLread' : args.outFilterMatchNminOverLread, + 'outFilterScoreMinOverLread' : args.outFilterScoreMinOverLread, + 'sjdbOverhang' : args.sjdbOverhang, + 'outSAMstrandField' : args.outSAMstrandField + }) + if align_sets[2] == 'PE': + cmd = string.Template(cmd).substitute({ + 'fastq_right' : ','.join([os.path.join(x[0], x[2]) for x in align_sets[1]]) + }) + + ### take temp directory from environment variable + if args.useTMP is not None: + align_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_aligndir_1st_") ) + genome_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_genomedir_1st_") ) + else: + align_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=args.workDir, prefix="star_aligndir_1st_") ) + genome_dir_1st = os.path.abspath( tempfile.mkdtemp(dir=args.workDir, prefix="star_genomedir_1st_") ) + print "Running", cmd + subprocess.check_call(cmd, shell=True, cwd=align_dir_1st) + + ### build index using provided genome fasta as well as junctions from first run + cmd = """STAR --runMode genomeGenerate --genomeDir %s \ +--genomeFastaFiles %s \ +--sjdbOverhang %i \ +--runThreadN %i \ +--sjdbFileChrStartEnd %s""" % (genome_dir_1st, args.genomeFastaFiles, args.sjdbOverhang, args.runThreadN, os.path.join(align_dir_1st, 'SJ.out.tab')) + print "Running", cmd + subprocess.check_call(cmd, shell=True, cwd=align_dir_1st) + + ### replace index for the second run with the one currently built + genome_dir = genome_dir_1st + else: + genome_dir = os.path.abspath(args.genomeDir) + + + align_template_str = """STAR \ +--genomeDir ${genomeDir} --readFilesIn %s \ +--runThreadN ${runThreadN} \ +--outFilterMultimapScoreRange ${outFilterMultimapScoreRange} \ +--outFilterMultimapNmax ${outFilterMultimapNmax} \ +--outFilterMismatchNmax ${outFilterMismatchNmax} \ +--alignIntronMax ${alignIntronMax} \ +--alignMatesGapMax ${alignMatesGapMax} \ +--sjdbScore ${sjdbScore} \ +--alignSJDBoverhangMin ${alignSJDBoverhangMin} \ +--genomeLoad ${genomeLoad} \ +--limitBAMsortRAM ${limitBAMsortRAM} \ +--readFilesCommand ${readFilesCommand} \ +--outFilterMatchNminOverLread ${outFilterMatchNminOverLread} \ +--outFilterScoreMinOverLread ${outFilterScoreMinOverLread} \ +--sjdbOverhang ${sjdbOverhang} \ +--outSAMstrandField ${outSAMstrandField} \ +--outSAMattributes ${outSAMattributes} \ +--outSAMunmapped ${outSAMunmapped} \ +--outSAMtype ${outSAMtype} \ +--outSAMheaderHD ${outSAMheaderHD}""" % read_str + +#--twopass1readsN ${twopass1readsN} \ + + cmd = string.Template(align_template_str).safe_substitute({ + 'genomeDir' : genome_dir, + 'runThreadN' : args.runThreadN, + 'fastq_left' : ','.join([os.path.join(x[0], x[1]) for x in align_sets[1]]), #os.path.abspath(pair[1]), + 'outFilterMultimapScoreRange' : args.outFilterMultimapScoreRange, + 'outFilterMultimapNmax' : args.outFilterMultimapNmax, + 'outFilterMismatchNmax' : args.outFilterMismatchNmax, + 'alignIntronMax' : args.alignIntronMax, + 'alignMatesGapMax': args.alignMatesGapMax, + 'sjdbScore': args.sjdbScore, + 'alignSJDBoverhangMin' : args.alignSJDBoverhangMin, + 'genomeLoad' : args.genomeLoad, + 'limitBAMsortRAM' : args.limitBAMsortRAM, + 'readFilesCommand' : align_sets[0], + 'outFilterMatchNminOverLread' : args.outFilterMatchNminOverLread, + 'outFilterScoreMinOverLread' : args.outFilterScoreMinOverLread, + 'sjdbOverhang' : args.sjdbOverhang, + 'outSAMstrandField' : args.outSAMstrandField, + 'outSAMattributes' : " ".join(args.outSAMattributes), + 'outSAMunmapped' : args.outSAMunmapped, + 'outSAMtype' : " ".join(args.outSAMtype), + 'outSAMheaderHD' : " ".join(args.outSAMheaderHD) + }) +# 'twopass1readsN' : args.twopass1readsN, + if align_sets[2] == 'PE': + cmd = string.Template(cmd).substitute({ + 'fastq_right' : ','.join([os.path.join(x[0], x[2]) for x in align_sets[1]]) # os.path.abspath(pair[2]), + }) + + ### convert RG_dict into formatted RG line + RG_line = [] + for r, readgroup in enumerate(align_sets[1]): + if 'RG' in RG_dict: + tmp = 'ID:%s:%s' % (RG_dict['ID'], RG_dict['RG'][r]) + else: + tmp = 'ID:%s:%s' % (RG_dict['ID'], readgroup[0]) + if len(RG_dict) > 1: + tmp += '\t' + tmp += '\t'.join(['%s:%s' % (key, RG_dict[key]) for key in RG_dict if key not in ['ID', 'RG', 'SI']]) + ### add read group label + if 'RG' in RG_dict and 'CN' in RG_dict: + tmp += '\tPU:%s:%s' % (RG_dict['CN'], RG_dict['RG'][r]) + RG_line.append('%s' % tmp) + cmd += ' --outSAMattrRGline %s' % ' , '.join(RG_line) + + ### handle comment lines + comment_file = None + if 'SI' in RG_dict: + if args.useTMP is not None: + comment_file = os.path.abspath( tempfile.mkstemp(dir=os.environ[args.useTMP], prefix="star_comments_")[1] ) + else: + comment_file = os.path.abspath( tempfile.mkstemp(dir=args.workDir, prefix="star_comments_")[1] ) + + fd_com = open(comment_file, 'w') + fd_com.write('@CO\tsubmitter_sample_id:%s\n' % RG_dict['SI']) + + fd_com.flush() + fd_com.close() + + cmd += ' --outSAMheaderCommentFile %s' % comment_file + + + ### take temp directory from environment variable + if args.useTMP is not None: + align_dir = os.path.abspath( tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_aligndir_") ) + else: + align_dir = os.path.abspath( tempfile.mkdtemp(dir=args.workDir, prefix="star_aligndir_") ) + print "Running", cmd + subprocess.check_call(cmd, shell=True, cwd=align_dir) + + ### move output file + if 'BAM' in args.outSAMtype and 'SortedByCoordinate' in args.outSAMtype: + shutil.move(os.path.join(align_dir, 'Aligned.sortedByCoord.out.bam'), args.out) + elif 'BAM' in args.outSAMtype and 'Unsorted' in args.outSAMtype: + shutil.move(os.path.join(align_dir, 'Aligned.out.bam'), args.out) + else: + raise Exception('STAR output file could not be determined') + + ### move junctions if to be kept + if args.keepJunctions: + shutil.move(os.path.join(align_dir, 'SJ.out.tab'), args.out + '.junctions') + + ### clean up working directory + shutil.rmtree(workdir) + shutil.rmtree(align_dir) + if args.twopass1readsN != 0: + shutil.rmtree(align_dir_1st) + shutil.rmtree(genome_dir_1st)