Repository 'star_icgc_alignment2'
hg clone https://toolshed.g2.bx.psu.edu/repos/daumsoft/star_icgc_alignment2

Changeset 1:b566013f81e8 (2018-07-24)
Previous changeset 0:8922a3be8490 (2018-06-04)
Commit message:
Uploaded
added:
export_info.xml
manifest.xml
star_icgc_alignment2-8922a3be8490.tar.gz
removed:
star_alignment/.shed.yml
star_alignment/ICGC_pipeline.sh
star_alignment/ICGC_pipeline_RUN.sh
star_alignment/STAR
star_alignment/icgc_star2_wrapper.xml
star_alignment/star_align.py
b
diff -r 8922a3be8490 -r b566013f81e8 export_info.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/export_info.xml Tue Jul 24 03:38:01 2018 -0400
b
@@ -0,0 +1,11 @@
+<?xml version="1.0"?>
+<export_info>
+  <export_time>Tue, 24 Jul 2018 07:36:35 +0000</export_time>
+    <tool_shed>https://toolshed.g2.bx.psu.edu</tool_shed>
+    <repository_name>star_icgc_alignment2</repository_name>
+    <repository_owner>daumsoft</repository_owner>
+    <changeset_revision>8922a3be8490</changeset_revision>
+    <export_repository_dependencies>False</export_repository_dependencies>
+    <exported_via_api>False</exported_via_api>
+</export_info>
+
b
diff -r 8922a3be8490 -r b566013f81e8 manifest.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/manifest.xml Tue Jul 24 03:38:01 2018 -0400
b
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<repositories>
+  <repository name="star_icgc_alignment2" type="unrestricted" username="daumsoft">
+      <description>star_icgc_alignment of daumsoft</description>
+        <long_description>star_icgc_alignment of daumsoft</long_description>
+        <archive>star_icgc_alignment2-8922a3be8490.tar.gz</archive>
+        <categories>
+          <category>Transcriptomics</category>
+        </categories>
+    </repository>
+</repositories>
+
b
diff -r 8922a3be8490 -r b566013f81e8 star_alignment/.shed.yml
--- a/star_alignment/.shed.yml Mon Jun 04 02:38:49 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,5 +0,0 @@
-categories: [Transcriptomics]
-description: daumsoft
-name: tar
-owner: daumsoft
-remote_repository_url: https://toolshed.g2.bx.psu.edu/view/daumsoft
b
diff -r 8922a3be8490 -r b566013f81e8 star_alignment/ICGC_pipeline.sh
--- a/star_alignment/ICGC_pipeline.sh Mon Jun 04 02:38:49 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,51 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -ne 1 ]; then
- echo "[usage:] ICGC_pipeline.sh  fastq_in_tar.gz"
- exit 1;
-fi
-
-#FASTQ_1=$1
-#FASTQ_2=$2
-#SAMPLE_ID=$3
-#TAR_FILE=$SAMPLE_ID".tar"
-TAR_FILE=$1
-
-export PATH=/storage/data/program/GDC_TGCA-Harmonized/RNA-Seq/bin/ICGC-STAR_ALIGNMENT_PIPELINE:$PATH
-
-#tar cvfh $TAR_FILE  $FASTQ_2  $FASTQ_1 
-
-STAD_ALIGN=~/package/DAUMSOFT/RNA-seq/ICGC_STAR_ALIGNMENT_PIPELINE/star_align.py
-STAR_INDEX_PATH=~/refs/hg38/gdc/Index_Files/GDC.h38.d1.vd1_STAR2_Index_Files/star_genome_d1_vd1_gtfv22
-WORK_DIR=./wrk
-OUTPUT_BAM=./out
-REFERENCE=~/refs/hg38/gdc/GRCh38.d1.vd1_Reference_Sequence/GRCh38.d1.vd1.fa
-RUN_THREAD_NUM=8
-rm -rf $WORK_DIR
-rm -rf $OUTPUT_BAM
-mkdir $WORK_DIR
-mkdir $OUTPUT_BAM
-
-python $STAD_ALIGN              \
---genomeDir $STAR_INDEX_PATH    \
---tarFileIn $TAR_FILE  \
---workDir $WORK_DIR             \
---out $OUTPUT_BAM \
---genomeFastaFiles $REFERENCE \
---runThreadN $RUN_THREAD_NUM \
---outFilterMultimapScoreRange 1 \
---outFilterMultimapNmax 20 \
---outFilterMismatchNmax 10 \
---alignIntronMax 500000 \
---alignMatesGapMax 1000000 \
---sjdbScore 2 \
---limitBAMsortRAM 0 \
---alignSJDBoverhangMin 1 \
---genomeLoad NoSharedMemory \
---outFilterMatchNminOverLread 0.33 \
---outFilterScoreMinOverLread 0.33 \
---twopass1readsN -1 \
---sjdbOverhang 100 \
---outSAMstrandField intronMotif \
---outSAMunmapped Within
-
b
diff -r 8922a3be8490 -r b566013f81e8 star_alignment/ICGC_pipeline_RUN.sh
--- a/star_alignment/ICGC_pipeline_RUN.sh Mon Jun 04 02:38:49 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-if [ "$#" -ne 1 ]; then
- echo "[usage:] ICGC_pipeline_RUN.sh  tar.gz(fastq in file)"
- exit 1;
-fi
-
-INPUT_TAR_GZ=$1
-
-ln -s $INPUT_TAR_GZ  ${INPUT_TAR_GZ}.tar.gz
-INPUT_TAR_GZ=${INPUT_TAR_GZ}.tar.gz
-
-$GALAXY_HOME/package/DAUMSOFT/RNA-seq/ICGC_STAR_ALIGNMENT_PIPELINE/ICGC_pipeline.sh $INPUT_TAR_GZ > log.out 2>&1   
-
b
diff -r 8922a3be8490 -r b566013f81e8 star_alignment/STAR
b
Binary file star_alignment/STAR has changed
b
diff -r 8922a3be8490 -r b566013f81e8 star_alignment/icgc_star2_wrapper.xml
--- a/star_alignment/icgc_star2_wrapper.xml Mon Jun 04 02:38:49 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,36 +0,0 @@
-<tool id="daumsoft_wts_star" name="ICGC_STAR_PIPELINE" version="2.4.2A">
- <description>STAR Alignment/ICGC Method</description>
-        <stdio>
-                <regex match="Exception|Error" source="both" level="fatal" description="Tool execution failed"/>
-        </stdio>
- <version_command></version_command>
- <command>
-         
-          \$GALAXY_HOME/package/DAUMSOFT/RNA-seq/ICGC_STAR_ALIGNMENT_PIPELINE/ICGC_pipeline_RUN.sh 
-           ${tar_gz}
-
- </command>
- <inputs>
- <param name="tar_gz" format="gz" type="data" label="RNA-Seq FASTQ file In TAR.GZ(GRCh38)/hg38" help="" />
- </inputs>
- <outputs>
- <data format="bam" name="alignment" label="${tool.name} on ${on_string}: Alignments BAM" from_work_dir="out/Aligned.sortedByCoord.out.bam"/>
- </outputs>
-        <tests><test><output name="alignment"/></test></tests>
-        <help>
-          
-For more detailed manual, please visit The GDC mRNA quantification analysis pipeline website:
-
-https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/Expression_mRNA_Pipeline
-
-The mRNA Analysis pipeline begins with the Alignment Workflow, which is performed using a two-pass method with STAR. 
-STAR aligns each read group separately and then merges the resulting alignments into one.
-Following the methods used by the International Cancer Genome Consortium ICGC, 
-the two-pass method includes a splice junction detection step, which is used to generate the final alignment. 
-
-This workflow outputs a BAM file, which contains both aligned and unaligned reads. 
-Quality assessment is performed pre-alignment with  FASTQC and post-alignment with RNA-SeQC and   Picard Tools.
-
- </help>
-        <citations></citations>
-</tool>
b
diff -r 8922a3be8490 -r b566013f81e8 star_alignment/star_align.py
--- a/star_alignment/star_align.py Mon Jun 04 02:38:49 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,527 +0,0 @@\n-#!/usr/bin/env python\n-\n-import os\n-import sys\n-import re\n-import string\n-import tempfile\n-import subprocess\n-import argparse\n-import shutil\n-import lxml.etree as etree\n-import fnmatch\n-\n-def walk_dir(base, pattern):\n-    files = []\n-    for root, dirnames, filenames in os.walk(base):\n-        for fname in fnmatch.filter(filenames, pattern):\n-            files.append(os.path.join(root, fname))\n-    return files\n-\n-def scan_workdir(base): \n-\n-    ### scan for paired-end files\n-    #############################\n-\n-    ### unzipped fastq input\n-    fastq_files = walk_dir(base, "*_read[12]_*fastq")\n-    if len(fastq_files):\n-        o = {}\n-        for i in sorted(fastq_files):\n-            basename = re.sub(r\'_read[12]\', \'\', i)\n-            try:\n-                o[basename].append(i)\n-            except KeyError:\n-                o[basename] = [i]\n-        if not all( (len(i) == 2 for i in o.values())):\n-            raise Exception("Missing Pair")\n-        return ( \'cat\', list( (os.path.basename(i), o[i][0], o[i][1]) for i in o.keys()), \'PE\') \n-\n-    ### unzipped fastq input\n-    fastq_files = walk_dir(base, "*_R[12]_001.fastq")\n-    if len(fastq_files):\n-        o = {}\n-        for i in fastq_files:\n-            basename = re.sub(r\'_R[12]_001.fastq$\', \'\', i)\n-            o[basename] = o.get(basename, 0) + 1\n-        if not all( (i == 2 for i in o.values())):\n-            raise Exception("Missing Pair")\n-        return ( \'cat\', list( (os.path.basename(i), "%s_R1_001.fastq" % i,"%s_R2_001.fastq" % i) for i in o.keys()), \'PE\') \n-\n-    ### unzipped fastq input\n-    fastq_files = walk_dir(base, "*_[12].fastq")\n-    if len(fastq_files):\n-        o = {}\n-        for i in fastq_files:\n-            basename = re.sub(r\'_[12].fastq$\', \'\', i)\n-            o[basename] = o.get(basename, 0) + 1\n-        if not all( (i == 2 for i in o.values())):\n-            raise Exception("Missing Pair")\n-        return ( \'cat\', list( (os.path.basename(i), "%s_1.fastq" % i,"%s_2.fastq" % i) for i in o.keys()), \'PE\') \n-\n-    ### unzipped fastq input\n-    fastq_files = walk_dir(base, "*[.][12].fastq")\n-    if len(fastq_files):\n-        o = {}\n-        for i in fastq_files:\n-            basename = re.sub(r\'[.][12].fastq$\', \'\', i)\n-            o[basename] = o.get(basename, 0) + 1\n-        if not all( (i == 2 for i in o.values())):\n-            raise Exception("Missing Pair")\n-        return ( \'cat\', list( (os.path.basename(i), "%s.1.fastq" % i,"%s.2.fastq" % i) for i in o.keys()), \'PE\') \n-\n-    ### unzipped fastq input\n-    fastq_files = walk_dir(base, "*.fastq[12]")\n-    if len(fastq_files):\n-        o = {}\n-        for i in fastq_files:\n-            basename = re.sub(r\'.fastq[12]$\', \'\', i)\n-            o[basename] = o.get(basename, 0) + 1\n-        if not all( (i == 2 for i in o.values())):\n-            raise Exception("Missing Pair")\n-        return ( \'cat\', list( (os.path.basename(i), "%s.fastq1" % i,"%s.fastq2" % i) for i in o.keys()), \'PE\') \n-\n-    ### unzipped txt input\n-    fastq_files = walk_dir(base, "*_[12]_sequence.txt")\n-    if len(fastq_files):\n-        o = {}\n-        for i in fastq_files:\n-            basename = re.sub(r\'_[12]_sequence.txt$\', \'\', i)\n-            o[basename] = o.get(basename, 0) + 1\n-        if not all( (i == 2 for i in o.values())):\n-            raise Exception("Missing Pair")\n-        return ( \'cat\', list( (os.path.basename(i), "%s_1_sequence.txt" % i,"%s_2_sequence.txt" % i) for i in o.keys()), \'PE\') \n-    \n-    ### gzipped input\n-    fastq_gz_files = walk_dir(base, "*_[12].fastq.gz")\n-    if len(fastq_gz_files):\n-        o = {}\n-        for i in fastq_gz_files:\n-            basename = re.sub(r\'_[12].fastq.gz$\', \'\', i)\n-            o[basename] = o.get(basename, 0) + 1\n-        if not all( (i == 2 for i in o.values())):\n-            raise Exception("Missing Pair")\n-        return ( \'zcat\', list( (os.path.basename(i), "%s_1.fastq.gz" % i,"%s_2.fastq.gz" % i) for i in o.keys()), \'PE\') \n-\n-    ### bzipped input\n-'..b'lign_template_str).safe_substitute({\n-        \'genomeDir\' : genome_dir,\n-        \'runThreadN\' : args.runThreadN,\n-        \'fastq_left\' : \',\'.join([os.path.join(x[0], x[1]) for x in align_sets[1]]), #os.path.abspath(pair[1]),\n-        \'outFilterMultimapScoreRange\' : args.outFilterMultimapScoreRange,\n-        \'outFilterMultimapNmax\' : args.outFilterMultimapNmax,\n-        \'outFilterMismatchNmax\' : args.outFilterMismatchNmax,\n-        \'alignIntronMax\' : args.alignIntronMax,\n-        \'alignMatesGapMax\': args.alignMatesGapMax,\n-        \'sjdbScore\': args.sjdbScore,\n-        \'alignSJDBoverhangMin\' : args.alignSJDBoverhangMin,\n-        \'genomeLoad\' : args.genomeLoad,\n-        \'limitBAMsortRAM\' : args.limitBAMsortRAM,\n-        \'readFilesCommand\' : align_sets[0],\n-        \'outFilterMatchNminOverLread\' : args.outFilterMatchNminOverLread,\n-        \'outFilterScoreMinOverLread\' : args.outFilterScoreMinOverLread,\n-        \'sjdbOverhang\' : args.sjdbOverhang,\n-        \'outSAMstrandField\' : args.outSAMstrandField,\n-        \'outSAMattributes\' : " ".join(args.outSAMattributes),\n-        \'outSAMunmapped\' : args.outSAMunmapped, \n-        \'outSAMtype\' : " ".join(args.outSAMtype),\n-        \'outSAMheaderHD\' : " ".join(args.outSAMheaderHD)\n-    })\n-#        \'twopass1readsN\' : args.twopass1readsN,\n-    if align_sets[2] == \'PE\':\n-        cmd = string.Template(cmd).substitute({\n-            \'fastq_right\' : \',\'.join([os.path.join(x[0], x[2]) for x in align_sets[1]]) # os.path.abspath(pair[2]),\n-        })\n-\n-    ### convert RG_dict into formatted RG line\n-    RG_line = []\n-    for r, readgroup in enumerate(align_sets[1]):\n-        if \'RG\' in RG_dict:\n-            tmp = \'ID:%s:%s\' % (RG_dict[\'ID\'], RG_dict[\'RG\'][r])\n-        else:\n-            tmp = \'ID:%s:%s\' % (RG_dict[\'ID\'], readgroup[0])\n-        if len(RG_dict) > 1:\n-            tmp += \'\\t\'\n-            tmp += \'\\t\'.join([\'%s:%s\' % (key, RG_dict[key]) for key in RG_dict if key not in [\'ID\', \'RG\', \'SI\']])\n-        ### add read group label\n-        if \'RG\' in RG_dict and \'CN\' in RG_dict:\n-            tmp += \'\\tPU:%s:%s\' % (RG_dict[\'CN\'], RG_dict[\'RG\'][r])\n-        RG_line.append(\'%s\' % tmp)\n-    cmd += \' --outSAMattrRGline %s\' % \' , \'.join(RG_line)\n-\n-    ### handle comment lines\n-    comment_file = None\n-    if \'SI\' in RG_dict:\n-        if args.useTMP is not None:\n-            comment_file = os.path.abspath( tempfile.mkstemp(dir=os.environ[args.useTMP], prefix="star_comments_")[1] )\n-        else:\n-            comment_file = os.path.abspath( tempfile.mkstemp(dir=args.workDir, prefix="star_comments_")[1] )\n-        \n-        fd_com = open(comment_file, \'w\')\n-        fd_com.write(\'@CO\\tsubmitter_sample_id:%s\\n\' % RG_dict[\'SI\'])\n-\n-        fd_com.flush()\n-        fd_com.close()\n-        \n-        cmd += \' --outSAMheaderCommentFile %s\' % comment_file\n-\n-\n-    ### take temp directory from environment variable\n-    if args.useTMP is not None:\n-        align_dir = os.path.abspath( tempfile.mkdtemp(dir=os.environ[args.useTMP], prefix="star_aligndir_") )\n-    else:\n-        align_dir = os.path.abspath( tempfile.mkdtemp(dir=args.workDir, prefix="star_aligndir_") )\n-    print "Running", cmd\n-    subprocess.check_call(cmd, shell=True, cwd=align_dir)\n-\n-    ### move output file\n-    if \'BAM\' in args.outSAMtype and \'SortedByCoordinate\' in args.outSAMtype:\n-        shutil.move(os.path.join(align_dir, \'Aligned.sortedByCoord.out.bam\'), args.out)\n-    elif \'BAM\' in args.outSAMtype and \'Unsorted\' in args.outSAMtype:\n-        shutil.move(os.path.join(align_dir, \'Aligned.out.bam\'), args.out)\n-    else:\n-        raise Exception(\'STAR output file could not be determined\') \n-\n-    ### move junctions if to be kept\n-    if args.keepJunctions:\n-        shutil.move(os.path.join(align_dir, \'SJ.out.tab\'), args.out + \'.junctions\')\n-\n-    ### clean up working directory\n-    shutil.rmtree(workdir)\n-    shutil.rmtree(align_dir)\n-    if args.twopass1readsN != 0:\n-        shutil.rmtree(align_dir_1st)\n-        shutil.rmtree(genome_dir_1st)\n'
b
diff -r 8922a3be8490 -r b566013f81e8 star_icgc_alignment2-8922a3be8490.tar.gz
b
Binary file star_icgc_alignment2-8922a3be8490.tar.gz has changed