changeset 1:20d9fb1ba210 default tip

Replace several tabular manipulations with regex_replace tool
author Jim Johnson <jj@umn.edu>
date Thu, 20 Mar 2014 21:50:05 -0500
parents 9d5e59373c84
children
files README.rst proteomics_rnaseq_reduced_db_workflow_v2.ga repository_dependencies.xml
diffstat 3 files changed, 471 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/README.rst	Mon Mar 17 16:03:12 2014 -0500
+++ b/README.rst	Thu Mar 20 21:50:05 2014 -0500
@@ -47,6 +47,7 @@
 Version Changes
 ------- ----------------------------------------------------------------------
 v0.0.1  - Initial release to Tool Shed (March, 2014)
+v0.0.2  - Use regex_replace tool for tabular file manipulation
 ======= ======================================================================
 
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/proteomics_rnaseq_reduced_db_workflow_v2.ga	Thu Mar 20 21:50:05 2014 -0500
@@ -0,0 +1,469 @@
+{
+    "a_galaxy_workflow": "true", 
+    "annotation": "Filter out proteins that have a transcript expression level, as quantified by RNA-Seq data, below a certain threshold.", 
+    "format-version": "0.1", 
+    "name": "Proteomics Reduced DB v2", 
+    "steps": {
+        "0": {
+            "annotation": "ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/pep/Homo_sapiens.GRCh37.73.pep.all.fa.gz", 
+            "id": 0, 
+            "input_connections": {}, 
+            "inputs": [
+                {
+                    "description": "ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/pep/Homo_sapiens.GRCh37.73.pep.all.fa.gz", 
+                    "name": "Ensembl Protein FASTA (reference proteome)"
+                }
+            ], 
+            "name": "Input dataset", 
+            "outputs": [], 
+            "position": {
+                "left": 208, 
+                "top": 200
+            }, 
+            "tool_errors": null, 
+            "tool_id": null, 
+            "tool_state": "{\"name\": \"Ensembl Protein FASTA (reference proteome)\"}", 
+            "tool_version": null, 
+            "type": "data_input", 
+            "user_outputs": []
+        }, 
+        "1": {
+            "annotation": "Ensembl reference fasta with only chromosome assigned sequences.   For example: ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.73.dna.toplevel.fa.gz", 
+            "id": 1, 
+            "input_connections": {}, 
+            "inputs": [
+                {
+                    "description": "Ensembl reference fasta with only chromosome assigned sequences.   For example: ftp://ftp.ensembl.org/pub/release-73/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.73.dna.toplevel.fa.gz", 
+                    "name": "Ensembl Genome Reference Fasta"
+                }
+            ], 
+            "name": "Input dataset", 
+            "outputs": [], 
+            "position": {
+                "left": 209, 
+                "top": 292
+            }, 
+            "tool_errors": null, 
+            "tool_id": null, 
+            "tool_state": "{\"name\": \"Ensembl Genome Reference Fasta\"}", 
+            "tool_version": null, 
+            "type": "data_input", 
+            "user_outputs": []
+        }, 
+        "2": {
+            "annotation": "For example: \nftp://ftp.ensembl.org/pub/release-73/gtf/homo_sapiens/Homo_sapiens.GRCh37.73.gtf.gz", 
+            "id": 2, 
+            "input_connections": {}, 
+            "inputs": [
+                {
+                    "description": "For example: \nftp://ftp.ensembl.org/pub/release-73/gtf/homo_sapiens/Homo_sapiens.GRCh37.73.gtf.gz", 
+                    "name": "Ensembl GTF File (gene models)"
+                }
+            ], 
+            "name": "Input dataset", 
+            "outputs": [], 
+            "position": {
+                "left": 213, 
+                "top": 456
+            }, 
+            "tool_errors": null, 
+            "tool_id": null, 
+            "tool_state": "{\"name\": \"Ensembl GTF File (gene models)\"}", 
+            "tool_version": null, 
+            "type": "data_input", 
+            "user_outputs": []
+        }, 
+        "3": {
+            "annotation": "RNA-Seq left mate pair fastq (These should be in fastqsanger format. If not, convert with \"Fastq Groomer\" tool.)", 
+            "id": 3, 
+            "input_connections": {}, 
+            "inputs": [
+                {
+                    "description": "RNA-Seq left mate pair fastq (These should be in fastqsanger format. If not, convert with \"Fastq Groomer\" tool.)", 
+                    "name": "RNA-Seq left paired-end fastq"
+                }
+            ], 
+            "name": "Input dataset", 
+            "outputs": [], 
+            "position": {
+                "left": 220, 
+                "top": 563
+            }, 
+            "tool_errors": null, 
+            "tool_id": null, 
+            "tool_state": "{\"name\": \"RNA-Seq left paired-end fastq\"}", 
+            "tool_version": null, 
+            "type": "data_input", 
+            "user_outputs": []
+        }, 
+        "4": {
+            "annotation": "RNA-Seq right mate pair fastq (These should be in fastqsanger format. If not, convert with \"Fastq Groomer\" tool.)", 
+            "id": 4, 
+            "input_connections": {}, 
+            "inputs": [
+                {
+                    "description": "RNA-Seq right mate pair fastq (These should be in fastqsanger format. If not, convert with \"Fastq Groomer\" tool.)", 
+                    "name": "RNA-Seq right paired-end fastq"
+                }
+            ], 
+            "name": "Input dataset", 
+            "outputs": [], 
+            "position": {
+                "left": 221, 
+                "top": 673
+            }, 
+            "tool_errors": null, 
+            "tool_id": null, 
+            "tool_state": "{\"name\": \"RNA-Seq right paired-end fastq\"}", 
+            "tool_version": null, 
+            "type": "data_input", 
+            "user_outputs": []
+        }, 
+        "5": {
+            "annotation": "Convert peptide FASTA to a 2-column tabular file.  Keep all the header info.", 
+            "id": 5, 
+            "input_connections": {
+                "input": {
+                    "id": 0, 
+                    "output_name": "output"
+                }
+            }, 
+            "inputs": [], 
+            "name": "FASTA-to-Tabular", 
+            "outputs": [
+                {
+                    "name": "output", 
+                    "type": "tabular"
+                }
+            ], 
+            "position": {
+                "left": 538, 
+                "top": 267
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "fasta2tab", 
+            "tool_state": "{\"__page__\": 0, \"keep_first\": \"\\\"0\\\"\", \"descr_columns\": \"\\\"1\\\"\", \"input\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"__rerun_remap_job_id__\": null}", 
+            "tool_version": "1.1.0", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "6": {
+            "annotation": "Given a GTF file and the reference genome, this tool constructs a synthetic transcriptome that will be used for isoform quantification during \"-calculate expression\".", 
+            "id": 6, 
+            "input_connections": {
+                "reference|gtf": {
+                    "id": 2, 
+                    "output_name": "output"
+                }, 
+                "reference|reference_fasta_file": {
+                    "id": 1, 
+                    "output_name": "output"
+                }
+            }, 
+            "inputs": [], 
+            "name": "RSEM prepare reference", 
+            "outputs": [
+                {
+                    "name": "reference_file", 
+                    "type": "rsem_ref"
+                }
+            ], 
+            "position": {
+                "left": 419, 
+                "top": 388
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/jjohnson/rsem/rsem_prepare_reference/1.1.17", 
+            "tool_state": "{\"__page__\": 0, \"reference\": \"{\\\"ref_type\\\": \\\"genomic\\\", \\\"gtf\\\": null, \\\"reference_fasta_file\\\": null, \\\"__current_case__\\\": 1}\", \"reference_name\": \"\\\"primaryEnsemblGtfRef\\\"\", \"__rerun_remap_job_id__\": null, \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"polya\": \"{\\\"polya_use\\\": \\\"add\\\", \\\"polya_length\\\": \\\"125\\\", \\\"__current_case__\\\": 0}\", \"transcript_to_gene_map\": \"null\", \"ntog\": \"\\\"False\\\"\"}", 
+            "tool_version": "1.1.17", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "7": {
+            "annotation": "", 
+            "id": 7, 
+            "input_connections": {
+                "infile": {
+                    "id": 5, 
+                    "output_name": "output"
+                }
+            }, 
+            "inputs": [], 
+            "name": "Regex Replace", 
+            "outputs": [
+                {
+                    "name": "outfile", 
+                    "type": "txt"
+                }
+            ], 
+            "position": {
+                "left": 802, 
+                "top": 281
+            }, 
+            "post_job_actions": {
+                "ChangeDatatypeActionoutfile": {
+                    "action_arguments": {
+                        "newtype": "tabular"
+                    }, 
+                    "action_type": "ChangeDatatypeAction", 
+                    "output_name": "outfile"
+                }
+            }, 
+            "tool_errors": null, 
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/kellrott/regex_replace/regex_replace/1.0.0", 
+            "tool_state": "{\"__page__\": 0, \"ignore_case\": \"\\\"False\\\"\", \"search_str\": \"\\\"^(.* transcript:)(ENST\\\\\\\\d+)(.*)$\\\"\", \"__rerun_remap_job_id__\": null, \"replace_str\": \"\\\"\\\\\\\\1\\\\\\\\2\\\\\\\\3\\\\\\\\t\\\\\\\\2\\\"\", \"replace_count\": \"\\\"0\\\"\", \"multiline\": \"\\\"False\\\"\", \"infile\": \"null\", \"dot_all\": \"\\\"False\\\"\"}", 
+            "tool_version": "1.0.0", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "8": {
+            "annotation": "Given the RNA-Seq reads (fastq) and synthetic transcriptome (from \"-prepare reference\"), this tool quantifies the abundances of each mRNA transcript within the GTF file.", 
+            "id": 8, 
+            "input_connections": {
+                "input|fastq|fastq1": {
+                    "id": 3, 
+                    "output_name": "output"
+                }, 
+                "input|fastq|fastq2": {
+                    "id": 4, 
+                    "output_name": "output"
+                }, 
+                "reference|rsem_ref": {
+                    "id": 6, 
+                    "output_name": "reference_file"
+                }
+            }, 
+            "inputs": [], 
+            "name": "RSEM calculate expression", 
+            "outputs": [
+                {
+                    "name": "gene_abundances", 
+                    "type": "tabular"
+                }, 
+                {
+                    "name": "isoform_abundances", 
+                    "type": "tabular"
+                }, 
+                {
+                    "name": "transcript_bam", 
+                    "type": "bam"
+                }, 
+                {
+                    "name": "transcript_sorted_bam", 
+                    "type": "bam"
+                }, 
+                {
+                    "name": "genome_bam", 
+                    "type": "bam"
+                }, 
+                {
+                    "name": "genome_sorted_bam", 
+                    "type": "bam"
+                }, 
+                {
+                    "name": "log", 
+                    "type": "txt"
+                }
+            ], 
+            "position": {
+                "left": 719, 
+                "top": 523
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/jjohnson/rsem/rsem_calculate_expression/1.1.17", 
+            "tool_state": "{\"__page__\": 0, \"reference\": \"{\\\"rsem_ref\\\": null, \\\"refSrc\\\": \\\"history\\\", \\\"__current_case__\\\": 1}\", \"rsem_options\": \"{\\\"fullparams\\\": \\\"default\\\", \\\"__current_case__\\\": 0}\", \"rsem_outputs\": \"{\\\"result_bams\\\": \\\"none\\\", \\\"__current_case__\\\": 0}\", \"__rerun_remap_job_id__\": null, \"seedlength\": \"\\\"25\\\"\", \"sample\": \"\\\"rsem_sample\\\"\", \"forward_prob\": \"\\\"0.5\\\"\", \"input\": \"{\\\"fastq\\\": {\\\"fastq2\\\": null, \\\"fastq1\\\": null, \\\"matepair\\\": \\\"paired\\\", \\\"__current_case__\\\": 1}, \\\"bowtie_options\\\": {\\\"fullparams\\\": \\\"default\\\", \\\"__current_case__\\\": 0}, \\\"fastq_select\\\": \\\"--phred33-quals\\\", \\\"__current_case__\\\": 0, \\\"format\\\": \\\"fastq\\\"}\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}", 
+            "tool_version": "1.1.17", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "9": {
+            "annotation": "Selection of lower threshold of transcriptional abundance in TPM required for inclusion of the corresponding protein in the reduced database.", 
+            "id": 9, 
+            "input_connections": {
+                "input": {
+                    "id": 8, 
+                    "output_name": "isoform_abundances"
+                }
+            }, 
+            "inputs": [], 
+            "name": "Filter", 
+            "outputs": [
+                {
+                    "name": "out_file1", 
+                    "type": "input"
+                }
+            ], 
+            "position": {
+                "left": 991, 
+                "top": 591
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "Filter1", 
+            "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"cond\": \"\\\"c3>0.000001\\\"\", \"input\": \"null\", \"header_lines\": \"\\\"0\\\"\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}", 
+            "tool_version": "1.1.0", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "10": {
+            "annotation": "Add a column with the RSEM TPM times a million.", 
+            "id": 10, 
+            "input_connections": {
+                "input": {
+                    "id": 9, 
+                    "output_name": "out_file1"
+                }
+            }, 
+            "inputs": [], 
+            "name": "Compute", 
+            "outputs": [
+                {
+                    "name": "out_file1", 
+                    "type": "input"
+                }
+            ], 
+            "position": {
+                "left": 1199, 
+                "top": 574
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "Add_a_column1", 
+            "tool_state": "{\"__page__\": 0, \"__rerun_remap_job_id__\": null, \"cond\": \"\\\"c3*1000000\\\"\", \"input\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"round\": \"\\\"no\\\"\"}", 
+            "tool_version": "1.1.0", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "11": {
+            "annotation": "", 
+            "id": 11, 
+            "input_connections": {
+                "input1": {
+                    "id": 7, 
+                    "output_name": "outfile"
+                }, 
+                "input2": {
+                    "id": 10, 
+                    "output_name": "out_file1"
+                }
+            }, 
+            "inputs": [], 
+            "name": "Join two Datasets", 
+            "outputs": [
+                {
+                    "name": "out_file1", 
+                    "type": "input"
+                }
+            ], 
+            "position": {
+                "left": 1350, 
+                "top": 419
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "join1", 
+            "tool_state": "{\"input2\": \"null\", \"__page__\": 0, \"field1\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"3\\\"}\", \"partial\": \"\\\"\\\"\", \"field2\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"1\\\"}\", \"__rerun_remap_job_id__\": null, \"fill_empty_columns\": \"{\\\"fill_empty_columns_switch\\\": \\\"no_fill\\\", \\\"__current_case__\\\": 0}\", \"unmatched\": \"\\\"\\\"\", \"input1\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}", 
+            "tool_version": "2.0.2", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "12": {
+            "annotation": "", 
+            "id": 12, 
+            "input_connections": {
+                "infile": {
+                    "id": 11, 
+                    "output_name": "out_file1"
+                }
+            }, 
+            "inputs": [], 
+            "name": "Regex Replace", 
+            "outputs": [
+                {
+                    "name": "outfile", 
+                    "type": "txt"
+                }
+            ], 
+            "position": {
+                "left": 1545, 
+                "top": 546
+            }, 
+            "post_job_actions": {
+                "ChangeDatatypeActionoutfile": {
+                    "action_arguments": {
+                        "newtype": "tabular"
+                    }, 
+                    "action_type": "ChangeDatatypeAction", 
+                    "output_name": "outfile"
+                }
+            }, 
+            "tool_errors": null, 
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/kellrott/regex_replace/regex_replace/1.0.0", 
+            "tool_state": "{\"__page__\": 0, \"ignore_case\": \"\\\"False\\\"\", \"search_str\": \"\\\"^(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)\\\\\\\\t(.*)$\\\"\", \"__rerun_remap_job_id__\": null, \"replace_str\": \"\\\"\\\\\\\\1 tmp:\\\\\\\\8\\\\\\\\t\\\\\\\\2\\\"\", \"replace_count\": \"\\\"0\\\"\", \"multiline\": \"\\\"False\\\"\", \"infile\": \"null\", \"dot_all\": \"\\\"False\\\"\"}", 
+            "tool_version": "1.0.0", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "13": {
+            "annotation": "Final reduced database after application of a TPM cut-off.", 
+            "id": 13, 
+            "input_connections": {
+                "input": {
+                    "id": 12, 
+                    "output_name": "outfile"
+                }
+            }, 
+            "inputs": [], 
+            "name": "Tabular-to-FASTA", 
+            "outputs": [
+                {
+                    "name": "output", 
+                    "type": "fasta"
+                }
+            ], 
+            "position": {
+                "left": 1743, 
+                "top": 484
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "tab2fasta", 
+            "tool_state": "{\"title_col\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": [\\\"1\\\"]}\", \"__page__\": 0, \"seq_col\": \"{\\\"__class__\\\": \\\"UnvalidatedValue\\\", \\\"value\\\": \\\"2\\\"}\", \"__rerun_remap_job_id__\": null, \"input\": \"null\", \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\"}", 
+            "tool_version": "1.1.0", 
+            "type": "tool", 
+            "user_outputs": []
+        }, 
+        "14": {
+            "annotation": "Format FASTA to desired width.", 
+            "id": 14, 
+            "input_connections": {
+                "input": {
+                    "id": 13, 
+                    "output_name": "output"
+                }
+            }, 
+            "inputs": [], 
+            "name": "FASTA Width", 
+            "outputs": [
+                {
+                    "name": "output", 
+                    "type": "input"
+                }
+            ], 
+            "position": {
+                "left": 1939, 
+                "top": 569
+            }, 
+            "post_job_actions": {}, 
+            "tool_errors": null, 
+            "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/fasta_formatter/cshl_fasta_formatter/1.0.0", 
+            "tool_state": "{\"__page__\": 0, \"input\": \"null\", \"__rerun_remap_job_id__\": null, \"chromInfo\": \"\\\"/website/galaxy.msi.umn.edu/PRODUCTION/tool-data/shared/ucsc/chrom/GRCm38_canon.len\\\"\", \"width\": \"\\\"80\\\"\"}", 
+            "tool_version": "1.0.0", 
+            "type": "tool", 
+            "user_outputs": []
+        }
+    }
+}
\ No newline at end of file
--- a/repository_dependencies.xml	Mon Mar 17 16:03:12 2014 -0500
+++ b/repository_dependencies.xml	Thu Mar 20 21:50:05 2014 -0500
@@ -1,5 +1,6 @@
 <?xml version="1.0"?>
 <repositories description="Required tools for proteomics_rnaseq_splice_db_workflow">
     <repository name="fasta_formatter" owner="devteam" toolshed="http://toolshed.g2.bx.psu.edu" changeset_revision="8f0ae92440b8" />
+    <repository name="regex_replace" owner="kellrott" toolshed="http://toolshed.g2.bx.psu.edu" changeset_revision="9a77d5fca67c" />
     <repository name="rsem" owner="jjohnson" toolshed="http://toolshed.g2.bx.psu.edu" changeset_revision="59459de65740" />
 </repositories>