Mercurial > repos > ebi-gxa > gtf2gene_list

--- a/gtf2featureAnnotation.R	Wed Mar 04 06:44:32 2020 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,196 +0,0 @@
-#!/usr/bin/env Rscript
-
-# This script parses the GTF file to create a feature-wise annotation file with
-# mitochondrial features flagged, to assist in annotation and QC of single-cell
-# expression data analysis.
-
-suppressPackageStartupMessages(require(rtracklayer))
-suppressPackageStartupMessages(require(optparse))
-
-# Upper-case the first character of each string and lower-case the rest,
-# e.g. "TRUE" becomes "True".
-ucfirst <- function (str) {
-  leading <- toupper(substring(str, 1, 1))
-  trailing <- tolower(substring(str, 2))
-  paste0(leading, trailing)
-}
-
-# Write `message` to standard error, then end the R session with exit
-# status 1 so that callers of the script see a failure.
-die <- function(message){
-  write(message, file = stderr())
-  quit(status = 1)
-}
-
-# Break a comma-separated string into its elements and lower-case them, so
-# they can be compared case-insensitively against lower-cased values.
-cleanlist <- function(str){
-  pieces <- strsplit(str, ',')
-  tolower(unlist(pieces))
-}
-
-# Arguments supplied after the script name; handed to parse_args() below.
-cl <- commandArgs(trailingOnly = TRUE)
-
-# Command-line interface definition. Options with default NA (--gtf-file,
-# --output-file) are treated as mandatory by the checks that follow parsing.
-option_list <- list(
-  make_option(
-    c("-g", "--gtf-file"),
-    action = "store",
-    default = NA,
-    type = 'character',
-    help = "Path to a valid GTF file"
-  ),
-  make_option(
-    c("-t", "--feature-type"),
-    action = "store",
-    default = 'gene',
-    type = 'character',
-    help = 'Feature type to use (default: gene)'
-  ),
-  make_option(
-    c("-f", "--first-field"),
-    action = "store",
-    default = 'gene_id',
-    type = 'character',
-    help = 'Field to place first in output table (default: gene_id)'
-  ),
-  # store_false on a TRUE default: opt$no_header stays TRUE while a header is
-  # wanted and becomes FALSE when --no-header is passed. The value is used
-  # directly as the col.names argument of write.table().
-  make_option(
-    c("-r", "--no-header"),
-    action = "store_false",
-    default = TRUE,
-    type = 'logical',
-    help = 'Suppress header on output'
-  ),
-  make_option(
-    c("-l", "--fields"),
-    action = "store",
-    default = NULL,
-    type = 'character',
-    help = 'Comma-separated list of output fields to retain (default: all)'
-  ),
-  # Boolean flag, so it is declared 'logical' like --version-transcripts.
-  make_option(
-    c("-m", "--mito"),
-    action = "store_true",
-    default = FALSE,
-    type = 'logical',
-    help = 'Mark mitochondrial elements with reference to chromsomes and biotypes'
-  ),
-  make_option(
-    c("-n", "--mito-chr"),
-    action = "store",
-    default = 'mt,mitochondrion_genome,mito,m,chrM,chrMt',
-    type = 'character',
-    help = 'If specified, marks in a column called "mito" features on the specified chromosomes (case insensitive)'
-  ),
-  make_option(
-    c("-p", "--mito-biotypes"),
-    action = "store",
-    default = 'mt_trna,mt_rrna,mt_trna_pseudogene',
-    type = 'character',
-    help = 'If specified,  marks in a column called "mito" features with the specified biotypes (case insensitve)'
-  ),
-  make_option(
-    c("-c", "--filter-cdnas"),
-    action = "store",
-    default = NULL,
-    type = 'character',
-    help = 'If specified, sequences in the provided FASTA-format cDNAs file will be filtered to remove entries not present in the annotation'
-  ),
-  make_option(
-    c("-d", "--filter-cdnas-field"),
-    action = "store",
-    default = 'transcript_id',
-    type = 'character',
-    help = 'Where --filter-cdnas is specified, what field should be used to compare to identfiers from the FASTA?'
-  ),
-  make_option(
-    c("-e", "--filter-cdnas-output"),
-    action = "store",
-    default = 'filtered.fa.gz',
-    type = 'character',
-    help = 'Where --filter-cdnas is specified, what file should the filtered sequences be output to?'
-  ),
-  make_option(
-    c("-u", "--version-transcripts"),
-    action = "store_true",
-    default = FALSE,
-    type = 'logical',
-    help = 'Where the GTF contains transcript versions, should these be appended to transcript identifiers? Useful when generating transcript/gene mappings for use with transcriptomes.'
-  ),
-  make_option(
-    c("-o", "--output-file"),
-    action = "store",
-    default = NA,
-    type = 'character',
-    help = 'Output file path'
-  )
-)
-
-# Hyphens in long option names become underscores, so e.g. --gtf-file is
-# read back as opt$gtf_file.
-opt <- parse_args(OptionParser(option_list = option_list), args = cl, convert_hyphens_to_underscores = TRUE)
-
-# Both the input GTF and the output path are mandatory.
-
-if (is.na(opt$gtf_file)){
-  die('ERROR: No input GTF file specified')
-}
-
-if (is.na(opt$output_file)){
-  die('ERROR: No output file specified')
-}
-
-# Import the GTF
-
-print(paste('Reading', opt$gtf_file, 'elements of type', opt$feature_type))
-gtf <- import(opt$gtf_file, feature.type = opt$feature_type )
-
-# Combine basic info (chromosomes, coordinates) with annotation found in GTF attributes
-
-anno <- cbind(chromosome = seqnames(gtf), as.data.frame(ranges(gtf)), elementMetadata(gtf))
-print(paste('Found', nrow(anno), 'features'))
-
-# Mark mitochondrial features: a feature is flagged when its biotype or its
-# chromosome matches one of the configured values (compared in lower case).
-# The flag is stored as the strings "True"/"False".
-
-if (opt$mito){
-  anno$mito <- ucfirst(as.character(tolower(anno$gene_biotype) %in% cleanlist(opt$mito_biotypes) | tolower(anno$chromosome) %in% cleanlist(opt$mito_chr)))
-}
-
-# If specified, put the desired field first
-
-if (! is.na(opt$first_field)){
-  if (! opt$first_field %in% colnames(anno)){
-    # Report the requested field via opt$first_field; a bare `first_field`
-    # is not defined anywhere in this script.
-    die(paste(opt$first_field, 'is not a valid field'))
-  }
-  anno <- anno[,c(opt$first_field, colnames(anno)[colnames(anno) != opt$first_field])]
-}
-
-# Version transcripts: append ".<transcript_version>" to transcript_id, but
-# only for rows that actually carry a version.
-
-if ( opt$feature_type == 'transcript' && opt$version_transcripts && all(c('transcript_id', 'transcript_version') %in% colnames(anno) )){
-  has_transcript_version <- ! is.na(anno$transcript_version)
-  anno$transcript_id[has_transcript_version] <- paste(anno$transcript_id[has_transcript_version], anno$transcript_version[has_transcript_version], sep='.')
-}
-
-# If specified, filter down a provided cDNA FASTA file
-
-if (! is.null(opt$filter_cdnas)){
-
-  print(paste("Filtering", opt$filter_cdnas, "to match the GTF"))
-
-  suppressPackageStartupMessages(require(Biostrings))
-
-  cdna <- readDNAStringSet(opt$filter_cdnas)
-
-  # The identifier is the part of each FASTA header before the first space
-  cdna_transcript_names <- unlist(lapply(names(cdna), function(x) unlist(strsplit(x, ' '))[1]  ))
-
-  # Filter out cDNAs without matching transcript entries in the GTF
-
-  cdna_in_annotation <- cdna_transcript_names %in% anno[[opt$filter_cdnas_field]]
-
-  if (! any(cdna_in_annotation)){
-    die(paste("ERROR: None of the input sequences have matching", opt$filter_cdnas_field, 'values in the GTF file'))
-  }
-
-  cdna <- cdna[which(cdna_in_annotation)]
-
-  print(paste('Storing filtered seqeunces to', opt$filter_cdnas_output))
-  writeXStringSet(x = cdna, filepath = opt$filter_cdnas_output, compress = 'gzip')
-}
-
-# If specified, subset to desired fields
-
-if (! is.null(opt$fields) && opt$fields != ''){
-  fields <- unlist(strsplit(opt$fields, ','))
-
-  # Name only the offending fields, in a single message, rather than pasting
-  # the whole requested vector (which yields one line per requested field).
-  invalid_fields <- setdiff(fields, colnames(anno))
-  if (length(invalid_fields) > 0){
-    die(paste('ERROR:', opt$fields, 'contains invalid field(s):', paste(invalid_fields, collapse = ', ')))
-  }
-  anno <- anno[,fields, drop = FALSE]
-
-  # Keep only rows with a value in every retained field
-  anno <- anno[apply(anno, 1, function(x) all(! is.na(x))), ]
-}
-
-# opt$no_header is TRUE unless --no-header was passed, so it doubles as the
-# "write column names" switch.
-
-print(paste('Storing output to', opt$output_file))
-write.table(anno, file = opt$output_file, sep = "\t", quote=FALSE, row.names = FALSE, col.names = opt$no_header)
--- a/gtf2featureAnnotation.xml	Wed Mar 04 06:44:32 2020 -0500
+++ b/gtf2featureAnnotation.xml	Mon Jul 19 08:00:24 2021 +0000
@@ -1,13 +1,11 @@
-<tool id="_ensembl_gtf2gene_list" name="GTF2GeneList" version="1.42.1+galaxy6" profile="18.01">
+<tool id="_ensembl_gtf2gene_list" name="GTF2GeneList" version="1.52.0+galaxy0" profile="18.01">
     <description>extracts a complete annotation table or subsets thereof from an Ensembl GTF using rtracklayer</description>
     <requirements>
-      <requirement type="package" version="1.42.1">bioconductor-rtracklayer</requirement>
-      <requirement type="package" version="2.50.2">bioconductor-biostrings</requirement>
-      <requirement type="package" version="1.6.4">r-optparse</requirement>
+      <requirement type="package" version="1.0.1">atlas-gene-annotation-manipulation</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
        ln -s '$gtf_input' input.gtf;
-       $__tool_directory__/gtf2featureAnnotation.R --gtf-file input.gtf
+       gtf2featureAnnotation.R --gtf-file input.gtf
 #if $noheader
 --no-header
 #end if
@@ -17,8 +15,17 @@
 #if $mito.mark_mito
 --mito --mito-chr "${mito.mito_chr}" --mito-biotypes "${mito.mito_biotypes}"
 #end if
+#if $cdnas.parse_cdnas
+--parse-cdnas "${cdnas.fasta_input}" --parse-cdna-field "${cdnas.cdnas_field}"
 #if $cdnas.filter_cdnas
---filter-cdnas "${cdnas.fasta_input}" --filter-cdnas-field "${cdnas.cdnas_field}" --filter-cdnas-output "${fasta_output}"
+--filter-cdnas-output "${fasta_output}"
+#end if
+#if $cdnas.parse_cdna_names
+--parse-cdna-names
+#end if
+#if $fill_empty
+--fill-empty "${fill_empty}"
+#end if
 #end if
 --feature-type "${feature_type}" --first-field "${first_field}" --output-file annotation.txt --fields "${fields}"
 	    ]]></command>
@@ -30,6 +37,7 @@
         <param name="noheader" type="boolean" checked="false" label="Suppress header line in output?"/>
         <param name="fields" type="text" optional='true' value="" label="Comma-separated list of field names to extract from the GTF (default: use all fields)"/>
         <param name="version_transcripts" type="boolean" checked="false" label="Append version to transcript identifiers?" help="For transcript feature type only: where the GTF contains transcript versions, should these be appended to transcript identifiers? Useful when generating transcript/gene mappings for use with transcriptomes"/>
+        <param name="fill_empty" type="text" optional='true' value="" label="Column to be used to fill empty values in other fields" help="Only when output fields are defined, useful when you need to guarantee a value, for example a gene ID for a transcript/gene mapping."/>
         <conditional name="mito">
           <param name="mark_mito" type="boolean" checked="true" label="Flag mitochondrial features?"/>
           <when value="true">
@@ -39,10 +47,12 @@
           <when value="false" />
         </conditional>
         <conditional name="cdnas">
-          <param name="filter_cdnas" type="boolean" checked="false" label="Filter a FASTA-format cDNA file to match annotations?" help="For some applications, e.g. transcriptome mappers, its useful to match a cDNAs file to an annotation list (e.g. transcript-to-gene mapping)"/>
+          <param name="parse_cdnas" type="boolean" checked="false" label="Provide a cDNA file for extracting annotations and/ or possible filtering?" help="For some applications, e.g. transcriptome mappers, it's useful to match a cDNAs file to an annotation list (e.g. transcript-to-gene mapping)"/>
           <when value="true">
             <param name="fasta_input" type="data" format="fasta,fasta.gz" label="FASTA-format cDNA/ transcript file" />
-            <param name="cdnas_field" type="text" optional='true' value="transcript_id" label="Annotation field to match with sequences."/>
+            <param name="cdnas_field" type="text" optional='true' value="transcript_id" label="Annotation field in GTF to match with sequences."/>
+            <param name="parse_cdna_names" type="boolean" checked="false" label="Parse the FASTA headers for annotation info?" help="e.g. to find gene IDs for transcripts not present in the GTF. May only work for Ensembl GTFs."/>
+            <param name="filter_cdnas" type="boolean" checked="false" label="Filter the cDNA file to match the annotations?" />
           </when>
           <when value="false" />
         </conditional>
@@ -67,9 +77,9 @@

 **What it does**

-Given an Ensembl GTF file, it will extract all information on chromosomes, coordinates, and attributes provided at the specified feature level. Mitochondrial features can also be flagged.
+Given an Ensembl GTF file, it will extract all information on chromosomes, coordinates, and attributes provided at the specified feature level. Mitochondrial features can also be flagged. See https://github.com/ebi-gene-expression-group/atlas-gene-annotation-manipulation.

-You can also supply a fasta-format file of sequences, which will be filtered by identifier to match annotation. This can be useful for tools such as Alevin which need a transcript-to-gene mapping and a transcriptome file without any missing entries (with respect to annotation).
+You can also supply a fasta-format file of sequences, which can be filtered by identifier to match annotation and/or used as a source of information for transcripts un-annotated in the GTF. This can be useful for tools such as Alevin which need a transcript-to-gene mapping and a transcriptome file without any missing entries (with respect to annotation).


 **Inputs**
@@ -92,5 +102,6 @@
 journal = {GitHub repository},
 url = {https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary},
   }</citation>
+  <citation type="doi">10.1101/2020.04.08.032698</citation>
 </citations>
 </tool>