Mercurial > repos > galaxyp > proteomics_rnaseq_reduced_db_workflow

--- a/README.rst	Mon Mar 17 16:03:12 2014 -0500
+++ b/README.rst	Thu Mar 20 21:50:05 2014 -0500
@@ -47,6 +47,7 @@
 Version Changes
 ------- ----------------------------------------------------------------------
 v0.0.1  - Initial release to Tool Shed (March, 2014)
+v0.0.2  - Use regex_replace tool for tabular file manipulation
 ======= ======================================================================
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/proteomics_rnaseq_reduced_db_workflow_v2.ga	Thu Mar 20 21:50:05 2014 -0500
@@ -0,0 +1,469 @@
+{
+    "a_galaxy_workflow": "true",
+    "annotation": "Filter out proteins that have a transcript expression level, as quantified by RNA-Seq data, below a certain threshold.",
+    "format-version": "0.1",
+    "name": "Proteomics Reduced DB v2",
+    "steps": {
+        "0": {
+            "annotation": "ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/pep/Homo_sapiens.GRCh37.73.pep.all.fa.gz",
+            "id": 0,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/pep/Homo_sapiens.GRCh37.73.pep.all.fa.gz",
+                    "name": "Ensembl Protein FASTA (reference proteome)"
+                }
+            ],
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "left": 208,
+                "top": 200
+            },
+            "tool_errors": null,
+            "tool_id": null,
+            "tool_state": "{\"name\": \"Ensembl Protein FASTA (reference proteome)\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "user_outputs": []
+        },
+        "1": {
+            "annotation": "Ensembl reference fasta with only chromosome assigned sequences.   For example: ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.73.dna.toplevel.fa.gz",
+            "id": 1,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Ensembl reference fasta with only chromosome assigned sequences.   For example: ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.73.dna.toplevel.fa.gz",
+                    "name": "Ensembl Genome Reference Fasta"
+                }
+            ],
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "left": 209,
+                "top": 292
+            },
+            "tool_errors": null,
+            "tool_id": null,
+            "tool_state": "{\"name\": \"Ensembl Genome Reference Fasta\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "user_outputs": []
+        },
+        "2": {
+            "annotation": "For example: \nftp://ftp.ensembl.org/pub/release-73/gtf/homo_sapiens/Homo_sapiens.GRCh37.73.gtf.gz",
+            "id": 2,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "For example: \nftp://ftp.ensembl.org/pub/release-73/gtf/homo_sapiens/Homo_sapiens.GRCh37.73.gtf.gz",
+                    "name": "Ensembl GTF File (gene models)"
+                }
+            ],
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "left": 213,
+                "top": 456
+            },
+            "tool_errors": null,
+            "tool_id": null,
+            "tool_state": "{\"name\": \"Ensembl GTF File (gene models)\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "user_outputs": []
+        },
+        "3": {
+            "annotation": "RNA-Seq left mate pair fastq (These should be in fastqsanger format. If not, convert with \"Fastq Groomer\" tool.)",
+            "id": 3,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "RNA-Seq left mate pair fastq (These should be in fastqsanger format. If not, convert with \"Fastq Groomer\" tool.)",
+                    "name": "RNA-Seq left paired-end fastq"
+                }
+            ],
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "left": 220,
+                "top": 563
+            },
+            "tool_errors": null,
+            "tool_id": null,
+            "tool_state": "{\"name\": \"RNA-Seq left paired-end fastq\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "user_outputs": []
+        },
+        "4": {
+            "annotation": "RNA-Seq right mate pair fastq (These should be in fastqsanger format. If not, convert with \"Fastq Groomer\" tool.)",
+            "id": 4,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "RNA-Seq right mate pair fastq (These should be in fastqsanger format. If not, convert with \"Fastq Groomer\" tool.)",
+                    "name": "RNA-Seq right paired-end fastq"
+                }
+            ],
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "left": 221,
+                "top": 673
+            },
+            "tool_errors": null,
+            "tool_id": null,
+            "tool_state": "{\"name\": \"RNA-Seq right paired-end fastq\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "user_outputs": []
+        },
+        "5": {
+            "annotation": "Convert peptide fasta to a 2-column tabular file.  Keep all the header info.",
+            "id": 5,
+            "input_connections": {
+                "input": {
+                    "id": 0,
+                    "output_name": "output"
+                }
+            },
+            "inputs": [],
+            "name": "FASTA-to-Tabular",
+            "outputs": [
+                {
+                    "name": "output",
+                    "type": "tabular"
+                }
+            ],
+            "position": {
+                "left": 538,
+                "top": 267
+            },
+            "post_job_actions": {},
+            "tool_errors": null,
+            "tool_id": "fasta2tab",
+            "tool_state": "{\"__page__\": 0, \"keep_first\": \"\\\"0\\\"\", \"descr_columns\": \"\\\"1\\\"\", \"input\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"__rerun_remap_job_id__\": null}",
+            "tool_version": "1.1.0",
+            "type": "tool",
+            "user_outputs": []
+        },
+        "6": {
+            "annotation": "Given a GTF file and the reference genome, this tool constructs a synthetic transcriptome that will be used for isoform quantification during \"-calculate expression\".",
+            "id": 6,
+            "input_connections": {
+                "reference|gtf": {
+                    "id": 2,
+                    "output_name": "output"
+                },
+                "reference|reference_fasta_file": {
+                    "id": 1,
+                    "output_name": "output"
+                }
+            },
+            "inputs": [],
+            "name": "RSEM prepare reference",
+            "outputs": [
+                {
+                    "name": "reference_file",
+                    "type": "rsem_ref"
+                }
+            ],
+            "position": {
+                "left": 419,
+                "top": 388
+            },
+            "post_job_actions": {},
+            "tool_errors": null,
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/jjohnson/rsem/rsem_prepare_reference/1.1.17",
+            "tool_state": "{\"__page__\": 0, \"reference\": \"{\\\"ref_type\\\": \\\"genomic\\\", \\\"gtf\\\": null, \\\"reference_fasta_file\\\": null, \\\"__current_case__\\\": 1}\", \"reference_name\": \"\\\"primaryEnsemblGtfRef\\\"\", \"__rerun_remap_job_id__\": null, \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"polya\": \"{\\\"polya_use\\\": \\\"add\\\", \\\"polya_length\\\": \\\"125\\\", \\\"__current_case__\\\": 0}\", \"transcript_to_gene_map\": \"null\", \"ntog\": \"\\\"False\\\"\"}",
+            "tool_version": "1.1.17",
+            "type": "tool",
+            "user_outputs": []
+        },
+        "7": {
+            "annotation": "",
+            "id": 7,
+            "input_connections": {
+                "infile": {
+                    "id": 5,
+                    "output_name": "output"
+                }
+            },
+            "inputs": [],
+            "name": "Regex Replace",
+            "outputs": [
+                {
+                    "name": "outfile",
+                    "type": "txt"
+                }
+            ],
+            "position": {
+                "left": 802,
+                "top": 281
+            },
+            "post_job_actions": {
+                "ChangeDatatypeActionoutfile": {
+                    "action_arguments": {
+                        "newtype": "tabular"
+                    },
+                    "action_type": "ChangeDatatypeAction",
+                    "output_name": "outfile"
+                }
+            },
+            "tool_errors": null,
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/kellrott/regex_replace/regex_replace/1.0.0",
+            "tool_state": "{\"__page__\": 0, \"ignore_case\": \"\\\"False\\\"\", \"search_str\": \"\\\"^(.* transcript:)(ENST\\\\\\\\d+)(.*)$\\\"\", \"__rerun_remap_job_id__\": null, \"replace_str\": \"\\\"\\\\\\\\1\\\\\\\\2\\\\\\\\3\\\\\\\\t\\\\\\\\2\\\"\", \"replace_count\": \"\\\"0\\\"\", \"multiline\": \"\\\"False\\\"\", \"infile\": \"null\", \"dot_all\": \"\\\"False\\\"\"}",
+            "tool_version": "1.0.0",
+            "type": "tool",
+            "user_outputs": []
+        },
+        "8": {
+            "annotation": "Given the RNA-Seq reads (fastq) and synthetic transcriptome (from \"-prepare reference\"), this tool quantifies the abundance of each mRNA transcript within the GTF file.",
+            "id": 8,
+            "input_connections": {
+                "input|fastq|fastq1": {
+                    "id": 3,
+                    "output_name": "output"
+                },
+                "input|fastq|fastq2": {
+                    "id": 4,
+                    "output_name": "output"
+                },
+                "reference|rsem_ref": {
+                    "id": 6,
+                    "output_name": "reference_file"
+                }
+            },
+            "inputs": [],
+            "name": "RSEM calculate expression",
+            "outputs": [
+                {
+                    "name": "gene_abundances",
+                    "type": "tabular"
+                },
+                {
+                    "name": "isoform_abundances",
+                    "type": "tabular"
+                },
+                {
+                    "name": "transcript_bam",
+                    "type": "bam"
+                },
+                {
+                    "name": "transcript_sorted_bam",
+                    "type": "bam"
+                },
+                {
+                    "name": "genome_bam",
+                    "type": "bam"
+                },
+                {
+                    "name": "genome_sorted_bam",
+                    "type": "bam"
+                },
+                {
+                    "name": "log",
+                    "type": "txt"
+                }
+            ],
+            "position": {
+                "left": 719,
+                "top": 523
+            },
+            "post_job_actions": {},
+            "tool_errors": null,
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/jjohnson/rsem/rsem_calculate_expression/1.1.17",
+            "tool_state": "{\"__page__\": 0, \"reference\": \"{\\\"rsem_ref\\\": null, \\\"refSrc\\\": \\\"history\\\", \\\"__current_case__\\\": 1}\", \"rsem_options\": \"{\\\"fullparams\\\": \\\"default\\\", \\\"__current_case__\\\": 0}\", \"rsem_outputs\": \"{\\\"result_bams\\\": \\\"none\\\", \\\"__current_case__\\\": 0}\", \"__rerun_remap_job_id__\": null, \"seedlength\": \"\\\"25\\\"\", \"sample\": \"\\\"rsem_sample\\\"\", \"forward_prob\": \"\\\"0.5\\\"\", \"input\": \"{\\\"fastq\\\": {\\\"fastq2\\\": null, \\\"fastq1\\\": null, \\\"matepair\\\": \\\"paired\\\", \\\"__current_case__\\\": 1}, \\\"bowtie_options\\\": {\\\"fullparams\\\": \\\"default\\\", \\\"__current_case__\\\": 0}, \\\"fastq_select\\\": \\\"--phred33-quals\\\", \\\"__current_case__\\\": 0, \\\"format\\\": \\\"fastq\\\"}\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}",
+            "tool_version": "1.1.17",
+            "type": "tool",
+            "user_outputs": []
+        },
+        "9": {
+            "annotation": "Selection of lower threshold of transcriptional abundance in TPM required for inclusion of the corresponding protein in the reduced database.",
+            "id": 9,
+            "input_connections": {
+                "input": {
+                    "id": 8,
+                    "output_name": "isoform_abundances"
+                }
+            },
+            "inputs": [],
+            "name": "Filter",
+            "outputs": [
+                {
+                    "name": "out_file1",
+                    "type": "input"
+                }
+            ],
+            "position": {
+                "left": 991,
+                "top": 591
+            },
+            "post_job_actions": {},
+            "tool_errors": null,
+            "tool_id": "Filter1",
+            "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"cond\": \"\\\"c3>0.000001\\\"\", \"input\": \"null\", \"header_lines\": \"\\\"0\\\"\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}",
+            "tool_version": "1.1.0",
+            "type": "tool",
+            "user_outputs": []
+        },
+        "10": {
+            "annotation": "Add a column with the RSEM TPM times a million.",
+            "id": 10,
+            "input_connections": {
+                "input": {
+                    "id": 9,
+                    "output_name": "out_file1"
+                }
+            },
+            "inputs": [],
+            "name": "Compute",
+            "outputs": [
+                {
+                    "name": "out_file1",
+                    "type": "input"
+                }
+            ],
+            "position": {
+                "left": 1199,
+                "top": 574
+            },
+            "post_job_actions": {},
+            "tool_errors": null,
+            "tool_id": "Add_a_column1",
+            "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"cond\": \"\\\"c3*1000000\\\"\", \"input\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"round\": \"\\\"no\\\"\"}",
+            "tool_version": "1.1.0",
+            "type": "tool",
+            "user_outputs": []
+        },
+        "11": {
+            "annotation": "",
+            "id": 11,
+            "input_connections": {
+                "input1": {
+                    "id": 7,
+                    "output_name": "outfile"
+                },
+                "input2": {
+                    "id": 10,
+                    "output_name": "out_file1"
+                }
+            },
+            "inputs": [],
+            "name": "Join two Datasets",
+            "outputs": [
+                {
+                    "name": "out_file1",
+                    "type": "input"
+                }
+            ],
+            "position": {
+                "left": 1350,
+                "top": 419
+            },
+            "post_job_actions": {},
+            "tool_errors": null,
+            "tool_id": "join1",
+            "tool_state": "{\"input2\": \"null\", \"__page__\": 0, \"field1\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"3\\\"}\", \"partial\": \"\\\"\\\"\", \"field2\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"1\\\"}\", \"__rerun_remap_job_id__\": null, \"fill_empty_columns\": \"{\\\"fill_empty_columns_switch\\\": \\\"no_fill\\\", \\\"__current_case__\\\": 0}\", \"unmatched\": \"\\\"\\\"\", \"input1\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}",
+            "tool_version": "2.0.2",
+            "type": "tool",
+            "user_outputs": []
+        },
+        "12": {
+            "annotation": "",
+            "id": 12,
+            "input_connections": {
+                "infile": {
+                    "id": 11,
+                    "output_name": "out_file1"
+                }
+            },
+            "inputs": [],
+            "name": "Regex Replace",
+            "outputs": [
+                {
+                    "name": "outfile",
+                    "type": "txt"
+                }
+            ],
+            "position": {
+                "left": 1545,
+                "top": 546
+            },
+            "post_job_actions": {
+                "ChangeDatatypeActionoutfile": {
+                    "action_arguments": {
+                        "newtype": "tabular"
+                    },
+                    "action_type": "ChangeDatatypeAction",
+                    "output_name": "outfile"
+                }
+            },
+            "tool_errors": null,
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/kellrott/regex_replace/regex_replace/1.0.0",
+            "tool_state": "{\"__page__\": 0, \"ignore_case\": \"\\\"False\\\"\", \"search_str\": \"\\\"^(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)$\\\"\", \"__rerun_remap_job_id__\": null, \"replace_str\": \"\\\"\\\\\\\\1 tmp:\\\\\\\\8\\\\\\\\t\\\\\\\\2\\\"\", \"replace_count\": \"\\\"0\\\"\", \"multiline\": \"\\\"False\\\"\", \"infile\": \"null\", \"dot_all\": \"\\\"False\\\"\"}",
+            "tool_version": "1.0.0",
+            "type": "tool",
+            "user_outputs": []
+        },
+        "13": {
+            "annotation": "Final reduced database after application of a TPM cut-off.",
+            "id": 13,
+            "input_connections": {
+                "input": {
+                    "id": 12,
+                    "output_name": "outfile"
+                }
+            },
+            "inputs": [],
+            "name": "Tabular-to-FASTA",
+            "outputs": [
+                {
+                    "name": "output",
+                    "type": "fasta"
+                }
+            ],
+            "position": {
+                "left": 1743,
+                "top": 484
+            },
+            "post_job_actions": {},
+            "tool_errors": null,
+            "tool_id": "tab2fasta",
+            "tool_state": "{\"title_col\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": [\\\"1\\\"]}\", \"__page__\": 0, \"seq_col\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"2\\\"}\", \"__rerun_remap_job_id__\": null, \"input\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}",
+            "tool_version": "1.1.0",
+            "type": "tool",
+            "user_outputs": []
+        },
+        "14": {
+            "annotation": "Format FASTA to desired width.",
+            "id": 14,
+            "input_connections": {
+                "input": {
+                    "id": 13,
+                    "output_name": "output"
+                }
+            },
+            "inputs": [],
+            "name": "FASTA Width",
+            "outputs": [
+                {
+                    "name": "output",
+                    "type": "input"
+                }
+            ],
+            "position": {
+                "left": 1939,
+                "top": 569
+            },
+            "post_job_actions": {},
+            "tool_errors": null,
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/fasta_formatter/cshl_fasta_formatter/1.0.0",
+            "tool_state": "{\"__page__\": 0, \"input\": \"null\", \"__rerun_remap_job_id__\": null, \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"width\": \"\\\"80\\\"\"}",
+            "tool_version": "1.0.0",
+            "type": "tool",
+            "user_outputs": []
+        }
+    }
+}
\ No newline at end of file
--- a/repository_dependencies.xml	Mon Mar 17 16:03:12 2014 -0500
+++ b/repository_dependencies.xml	Thu Mar 20 21:50:05 2014 -0500
@@ -1,5 +1,6 @@
 <?xml version="1.0"?>
 <repositories description="Required tools for proteomics_rnaseq_splice_db_workflow">
     <repository name="fasta_formatter" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" changeset_revision="8f0ae92440b8" />
+    <repository name="regex_replace" owner="kellrott" toolshed="http://toolshed.g2.bx.psu.edu" changeset_revision="9a77d5fca67c" />
     <repository name="rsem" owner="jjohnson" toolshed="http://toolshed.g2.bx.psu.edu" changeset_revision="59459de65740" />
 </repositories>