Commit message:
Uploaded

added:
star_alignment/.shed.yml
star_alignment/ICGC_pipeline.sh
star_alignment/ICGC_pipeline_RUN.sh
star_alignment/STAR
star_alignment/icgc_star2_wrapper.xml
star_alignment/star_align.py
diff -r 000000000000 -r 8922a3be8490 star_alignment/.shed.yml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/star_alignment/.shed.yml	Mon Jun 04 02:38:49 2018 -0400
@@ -0,0 +1,5 @@
+categories: [Transcriptomics]
+description: daumsoft
+name: tar
+owner: daumsoft
+remote_repository_url: https://toolshed.g2.bx.psu.edu/view/daumsoft
diff -r 000000000000 -r 8922a3be8490 star_alignment/ICGC_pipeline.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/star_alignment/ICGC_pipeline.sh	Mon Jun 04 02:38:49 2018 -0400
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+if [ "$#" -ne 1 ]; then
+    echo "[usage:] ICGC_pipeline.sh fastq_in_tar.gz"
+    exit 1;
+fi
+
+#FASTQ_1=$1
+#FASTQ_2=$2
+#SAMPLE_ID=$3
+#TAR_FILE=$SAMPLE_ID".tar"
+TAR_FILE=$1
+
+export PATH=/storage/data/program/GDC_TGCA-Harmonized/RNA-Seq/bin/ICGC-STAR_ALIGNMENT_PIPELINE:$PATH
+
+#tar cvfh $TAR_FILE $FASTQ_2 $FASTQ_1
+
+STAD_ALIGN=~/package/DAUMSOFT/RNA-seq/ICGC_STAR_ALIGNMENT_PIPELINE/star_align.py
+STAR_INDEX_PATH=~/refs/hg38/gdc/Index_Files/GDC.h38.d1.vd1_STAR2_Index_Files/star_genome_d1_vd1_gtfv22
+WORK_DIR=./wrk
+OUTPUT_BAM=./out
+REFERENCE=~/refs/hg38/gdc/GRCh38.d1.vd1_Reference_Sequence/GRCh38.d1.vd1.fa
+RUN_THREAD_NUM=8
+rm -rf $WORK_DIR
+rm -rf $OUTPUT_BAM
+mkdir $WORK_DIR
+mkdir $OUTPUT_BAM
+
+python $STAD_ALIGN \
+--genomeDir $STAR_INDEX_PATH \
+--tarFileIn $TAR_FILE \
+--workDir $WORK_DIR \
+--out $OUTPUT_BAM \
+--genomeFastaFiles $REFERENCE \
+--runThreadN $RUN_THREAD_NUM \
+--outFilterMultimapScoreRange 1 \
+--outFilterMultimapNmax 20 \
+--outFilterMismatchNmax 10 \
+--alignIntronMax 500000 \
+--alignMatesGapMax 1000000 \
+--sjdbScore 2 \
+--limitBAMsortRAM 0 \
+--alignSJDBoverhangMin 1 \
+--genomeLoad NoSharedMemory \
+--outFilterMatchNminOverLread 0.33 \
+--outFilterScoreMinOverLread 0.33 \
+--twopass1readsN -1 \
+--sjdbOverhang 100 \
+--outSAMstrandField intronMotif \
+--outSAMunmapped Within
+
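For reference, ICGC_pipeline.sh takes a single argument: a tar(.gz) archive bundling the paired-end FASTQ files. A minimal manual run might look like the sketch below, assuming the hard-coded STAR index, reference and star_align.py paths exist on the host; the sample file names are hypothetical:

    # pack a hypothetical read pair and run the pipeline from a clean job directory
    tar czf sample01.tar.gz sample01_1.fastq sample01_2.fastq
    bash ICGC_pipeline.sh sample01.tar.gz
    # sorted alignments end up in ./out/Aligned.sortedByCoord.out.bam

Note that ./wrk and ./out are wiped and recreated on every invocation, so the script should be run from a dedicated job directory.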
diff -r 000000000000 -r 8922a3be8490 star_alignment/ICGC_pipeline_RUN.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/star_alignment/ICGC_pipeline_RUN.sh	Mon Jun 04 02:38:49 2018 -0400
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+if [ "$#" -ne 1 ]; then
+    echo "[usage:] ICGC_pipeline_RUN.sh tar.gz(fastq in file)"
+    exit 1;
+fi
+
+INPUT_TAR_GZ=$1
+
+ln -s $INPUT_TAR_GZ ${INPUT_TAR_GZ}.tar.gz
+INPUT_TAR_GZ=${INPUT_TAR_GZ}.tar.gz
+
+$GALAXY_HOME/package/DAUMSOFT/RNA-seq/ICGC_STAR_ALIGNMENT_PIPELINE/ICGC_pipeline.sh $INPUT_TAR_GZ > log.out 2>&1
+
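ICGC_pipeline_RUN.sh is the entry point the Galaxy wrapper calls. Because a Galaxy dataset is typically a plain file without a .tar.gz extension, the script first symlinks the input under a *.tar.gz name and then delegates to ICGC_pipeline.sh. A sketch of the call as it would appear inside a Galaxy job directory (the dataset path is hypothetical):

    # Galaxy hands over something like /galaxy/database/files/000/dataset_42.dat
    bash ICGC_pipeline_RUN.sh /galaxy/database/files/000/dataset_42.dat
    # -> symlinks dataset_42.dat.tar.gz, runs ICGC_pipeline.sh on it; stdout/stderr go to ./log.out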
diff -r 000000000000 -r 8922a3be8490 star_alignment/STAR |
Binary file star_alignment/STAR has changed |
diff -r 000000000000 -r 8922a3be8490 star_alignment/icgc_star2_wrapper.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/star_alignment/icgc_star2_wrapper.xml	Mon Jun 04 02:38:49 2018 -0400
@@ -0,0 +1,36 @@
+<tool id="daumsoft_wts_star" name="ICGC_STAR_PIPELINE" version="2.4.2A">
+    <description>STAR Alignment/ICGC Method</description>
+    <stdio>
+        <regex match="Exception|Error" source="both" level="fatal" description="Tool execution failed"/>
+    </stdio>
+    <version_command></version_command>
+    <command>
+
+        \$GALAXY_HOME/package/DAUMSOFT/RNA-seq/ICGC_STAR_ALIGNMENT_PIPELINE/ICGC_pipeline_RUN.sh
+        ${tar_gz}
+
+    </command>
+    <inputs>
+        <param name="tar_gz" format="gz" type="data" label="RNA-Seq FASTQ file In TAR.GZ(GRCh38)/hg38" help="" />
+    </inputs>
+    <outputs>
+        <data format="bam" name="alignment" label="${tool.name} on ${on_string}: Alignments BAM" from_work_dir="out/Aligned.sortedByCoord.out.bam"/>
+    </outputs>
+    <tests><test><output name="alignment"/></test></tests>
+    <help>
+
+For more detailed manual, please visit The GDC mRNA quantification analysis pipeline website:
+
+https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline
+
+The mRNA Analysis pipeline begins with the Alignment Workflow, which is performed using a two-pass method with STAR.
+STAR aligns each read group separately and then merges the resulting alignments into one.
+Following the methods used by the International Cancer Genome Consortium ICGC,
+the two-pass method includes a splice junction detection step, which is used to generate the final alignment.
+
+This workflow outputs a BAM file, which contains both aligned and unaligned reads.
+Quality assessment is performed pre-alignment with FASTQC and post-alignment with RNA-SeQC and Picard Tools.
+
+    </help>
+    <citations></citations>
+</tool>
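The help text above summarizes the GDC/ICGC two-pass alignment that star_align.py drives. Purely as an illustration of what the two passes amount to (not the exact commands the script generates, and with hypothetical file names), the workflow is roughly:

    # pass 1: align against the stock GDC index to discover splice junctions
    STAR --genomeDir $STAR_INDEX --readFilesIn sample_1.fastq sample_2.fastq --runThreadN 8 --sjdbOverhang 100
    # build an intermediate index that includes the junctions found in pass 1
    STAR --runMode genomeGenerate --genomeDir ./index_pass2 --genomeFastaFiles GRCh38.d1.vd1.fa \
         --sjdbFileChrStartEnd SJ.out.tab --sjdbOverhang 100 --runThreadN 8
    # pass 2: realign all reads against the junction-aware index and emit a sorted BAM
    STAR --genomeDir ./index_pass2 --readFilesIn sample_1.fastq sample_2.fastq --runThreadN 8 \
         --outSAMtype BAM SortedByCoordinate --outSAMunmapped Within --outSAMstrandField intronMotif

In ICGC_pipeline.sh this behaviour is requested through star_align.py's --twopass1readsN -1, which in STAR means all reads are used for the first pass.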
diff -r 000000000000 -r 8922a3be8490 star_alignment/star_align.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/star_alignment/star_align.py	Mon Jun 04 02:38:49 2018 -0400
@@ -0,0 +1,527 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import re
+import string
+import tempfile
+import subprocess
+import argparse
+import shutil
+import lxml.etree as etree
+import fnmatch
+
+def walk_dir(base, pattern):
+    files = []
+    for root, dirnames, filenames in os.walk(base):
+        for fname in fnmatch.filter(filenames, pattern):
+            files.append(os.path.join(root, fname))
+    return files
+
+def scan_workdir(base):
+
+    ### scan for paired-end files
+    #############################
+
+    ### unzipped fastq input
+    fastq_files = walk_dir(base, "*_read[12]_*fastq")
+    if len(fastq_files):
+        o = {}
+        for i in sorted(fastq_files):
+            basename = re.sub(r'_read[12]', '', i)
+            try:
+                o[basename].append(i)
+            except KeyError:
+                o[basename] = [i]
+        if not all( (len(i) == 2 for i in o.values())):
+            raise Exception("Missing Pair")
+        return ( 'cat', list( (os.path.basename(i), o[i][0], o[i][1]) for i in o.keys()), 'PE')
+
+    ### unzipped fastq input
+    fastq_files = walk_dir(base, "*_R[12]_001.fastq")
+    if len(fastq_files):
+        o = {}
+        for i in fastq_files:
+            basename = re.sub(r'_R[12]_001.fastq$', '', i)
+            o[basename] = o.get(basename, 0) + 1
+        if not all( (i == 2 for i in o.values())):
+            raise Exception("Missing Pair")
+        return ( 'cat', list( (os.path.basename(i), "%s_R1_001.fastq" % i,"%s_R2_001.fastq" % i) for i in o.keys()), 'PE')
+
+    ### unzipped fastq input
+    fastq_files = walk_dir(base, "*_[12].fastq")
+    if len(fastq_files):
+        o = {}
+        for i in fastq_files:
+            basename = re.sub(r'_[12].fastq$', '', i)
+            o[basename] = o.get(basename, 0) + 1
+        if not all( (i == 2 for i in o.values())):
+            raise Exception("Missing Pair")
+        return ( 'cat', list( (os.path.basename(i), "%s_1.fastq" % i,"%s_2.fastq" % i) for i in o.keys()), 'PE')
+
+    ### unzipped fastq input
+    fastq_files = walk_dir(base, "*[.][12].fastq")
+    if len(fastq_files):
+        o = {}
+        for i in fastq_files:
+            basename = re.sub(r'[.][12].fastq$', '', i)
+            o[basename] = o.get(basename, 0) + 1
+        if not all( (i == 2 for i in o.values())):
+            raise Exception("Missing Pair")
+        return ( 'cat', list( (os.path.basename(i), "%s.1.fastq" % i,"%s.2.fastq" % i) for i in o.keys()), 'PE')
+
+    ### unzipped fastq input
+    fastq_files = walk_dir(base, "*.fastq[12]")
+    if len(fastq_files):
+        o = {}
+        for i in fastq_files:
+            basename = re.sub(r'.fastq[12]$', '', i)
+            o[basename] = o.get(basename, 0) + 1
+        if not all( (i == 2 for i in o.values())):
+            raise Exception("Missing Pair")
+        return ( 'cat', list( (os.path.basename(i), "%s.fastq1" % i,"%s.fastq2" % i) for i in o.keys()), 'PE')
+
+    ### unzipped txt input
+    fastq_files = walk_dir(base, "*_[12]_sequence.txt")
+    if len(fastq_files):
+        o = {}
+        for i in fastq_files:
+            basename = re.sub(r'_[12]_sequence.txt$', '', i)
+            o[basename] = o.get(basename, 0) + 1
+        if not all( (i == 2 for i in o.values())):
+            raise Exception("Missing Pair")
+        return ( 'cat', list( (os.path.basename(i), "%s_1_sequence.txt" % i,"%s_2_sequence.txt" % i) for i in o.keys()), 'PE')
+
+    ### gzipped input
+    fastq_gz_files = walk_dir(base, "*_[12].fastq.gz")
+    if len(fastq_gz_files):
+        o = {}
+        for i in fastq_gz_files:
+            basename = re.sub(r'_[12].fastq.gz$', '', i)
+            o[basename] = o.get(basename, 0) + 1
+        if not all( (i == 2 for i in o.values())):
+            raise Exception("Missing Pair")
+        return ( 'zcat', list( (os.path.basename(i), "%s_1.fastq.gz" % i,"%s_2.fastq.gz" % i) for i in o.keys()), 'PE')
+
+    ### bzipped input
[... middle of the hunk truncated in the changeset view; content resumes mid-line below ...]
...lign_template_str).safe_substitute({
+        'genomeDir' : genome_dir,
+        'runThreadN' : args.runThreadN,
+        'fastq_left' : ','.join([os.path.join(x[0], x[1]) for x in align_sets[1]]), #os.path.abspath(pair[1]),
+        'outFilterMultimapScoreRange' : args.outFilterMultimapScoreRange,
+        'outFilterMultimapNmax' : args.outFilterMultimapNmax,
+        'outFilterMismatchNmax' : args.outFilterMismatchNmax,
+        'alignIntronMax' : args.alignIntronMax,
+        'alignMatesGapMax': args.alignMatesGapMax,
+        'sjdbScore': args.sjdbScore,
+        'alignSJDBoverhangMin' : args.alignSJDBoverhangMin,
+        'genomeLoad' : args.genomeLoad,
+        'limitBAMsortRAM' : args.limitBAMsortRAM,
+        'readFilesCommand' : align_sets[0],
+        'outFilterMatchNminOverLread' : args.outFilterMatchNminOverLread,
+        'outFilterScoreMinOverLread' : args.outFilterScoreMinOverLread,
+        'sjdbOverhang' : args.sjdbOverhang,
+        'outSAMstrandField' : args.outSAMstrandField,
+        'outSAMattributes' : " ".join(args.outSAMattributes),
+        'outSAMunmapped' : args.outSAMunmapped,
+        'outSAMtype' : " ".join(args.outSAMtype),
+        'outSAMheaderHD' : " ".join(args.outSAMheaderHD)
+    })
+#        'twopass1readsN' : args.twopass1readsN,
+    if align_sets[2] == 'PE':
+        cmd = string.Template(cmd).substitute({
+            'fastq_right' : ','.join([os.path.join(x[0], x[2]) for x in align_sets[1]]) # os.path.abspath(pair[2]),
+        })
+
+    ### convert RG_dict into formatted RG line
+    RG_line = []
+    for r, readgroup in enumerate(align_sets[1]):
+        if 'RG' in RG_dict:
+            tmp = 'ID:%s:%s' % (RG_dict['ID'], RG_dict['RG'][r])
+        else:
+            tmp = 'ID:%s:%s' % (RG_dict['ID'], readgroup[0])
+        if len(RG_dict) > 1:
+            tmp += '\t'
+            tmp += '\t'.join(['%s:%s' % (key, RG_dict[key]) for key in RG_dict if key not in ['ID', 'RG', 'SI']])
+        ### add read group label
+        if 'RG' in RG_dict and 'CN' in RG_dict:
+            tmp += '\tPU:%s:%s' % (RG_dict['CN'], RG_dict['RG'][r])
+        RG_line.append('%s' % tmp)
+    cmd += ' --outSAMattrRGline %s' % ' , '.join(RG_line)
+
+    ### handle comment lines
+    comment_file = None
+    if 'SI' in RG_dict:
+        if args.useTMP is not None:
+            comment_file = os.path.abspath( tempfile.mkstemp(dir=os.environ[args.useTMP], prefix="star_comments_")[1] )
+        else:
+            comment_file = os.path.abspath( tempfile.mkstemp(dir=args.workDir, prefix="star_comments_")[1] )
+
+        fd_com = open(comment_file, 'w')
+        fd_com.write('@CO\tsubmitter_sample_id:%s\n' % RG_dict['SI'])
+
+        fd_com.flush()
+        fd_com.close()
+
+        cmd += ' --outSAMheaderCommentFile %s' % comment_file
+
+
+    ### take temp directory from environment variable
+    if args.useTMP is not None:
+        align_dir = os.path.abspath( tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_aligndir_") )
+    else:
+        align_dir = os.path.abspath( tempfile.mkdtemp(dir=args.workDir, prefix="star_aligndir_") )
+    print "Running", cmd
+    subprocess.check_call(cmd, shell=True, cwd=align_dir)
+
+    ### move output file
+    if 'BAM' in args.outSAMtype and 'SortedByCoordinate' in args.outSAMtype:
+        shutil.move(os.path.join(align_dir, 'Aligned.sortedByCoord.out.bam'), args.out)
+    elif 'BAM' in args.outSAMtype and 'Unsorted' in args.outSAMtype:
+        shutil.move(os.path.join(align_dir, 'Aligned.out.bam'), args.out)
+    else:
+        raise Exception('STAR output file could not be determined')
+
+    ### move junctions if to be kept
+    if args.keepJunctions:
+        shutil.move(os.path.join(align_dir, 'SJ.out.tab'), args.out + '.junctions')
+
+    ### clean up working directory
+    shutil.rmtree(workdir)
+    shutil.rmtree(align_dir)
+    if args.twopass1readsN != 0:
+        shutil.rmtree(align_dir_1st)
+        shutil.rmtree(genome_dir_1st)
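star_align.py unpacks the --tarFileIn archive into --workDir and auto-detects read pairs purely by filename (scan_workdir above), so the naming inside the tarball matters. A sketch of layouts the visible patterns would accept, using a hypothetical sample name; gzipped variants (*_[12].fastq.gz) and the truncated bzip2 branch follow the same scheme:

    tar tzf sample01.tar.gz
    # any one of these paired layouts is detected, among others:
    #   sample01_1.fastq         sample01_2.fastq          (*_[12].fastq)
    #   sample01_R1_001.fastq    sample01_R2_001.fastq     (*_R[12]_001.fastq)
    #   sample01.fastq1          sample01.fastq2           (*.fastq[12])
    #   sample01_1_sequence.txt  sample01_2_sequence.txt   (*_[12]_sequence.txt)
    # an unmatched mate raises Exception("Missing Pair") and aborts the run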