Mercurial > repos > ebi-gxa > gtf2gene_list
diff gtf2featureAnnotation.R @ 1:247b439a78f7 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/util/.shed.yml commit 194d2e0af16624c9a3d1af92f7b3686d2e0ee552
author | ebi-gxa |
---|---|
date | Fri, 18 Oct 2019 10:10:54 -0400 |
parents | |
children | 00ee933b08fd |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gtf2featureAnnotation.R Fri Oct 18 10:10:54 2019 -0400 @@ -0,0 +1,195 @@ +#!/usr/bin/env Rscript + +# This script parses the GTF file to create a feature-wise annotation file with +# mitochondrial features flagged, to assist in annotation and QC of single-cell +# expression data analysis. + +suppressPackageStartupMessages(require(rtracklayer)) +suppressPackageStartupMessages(require(optparse)) + +ucfirst <- function (str) { + paste(toupper(substring(str, 1, 1)), tolower(substring(str, 2)), sep = "") +} + +die <- function(message){ + write(message, stderr()) + q(status = 1) +} + +cleanlist <- function(str){ + tolower(unlist(strsplit(str, ','))) +} + +cl <- commandArgs(trailingOnly = TRUE) + +option_list = list( + make_option( + c("-g", "--gtf-file"), + action = "store", + default = NA, + type = 'character', + help = "Path to a valid GTF file" + ), + make_option( + c("-t", "--feature-type"), + action = "store", + default = 'gene', + type = 'character', + help = 'Feature type to use (default: gene)' + ), + make_option( + c("-f", "--first-field"), + action = "store", + default = 'gene_id', + type = 'character', + help = 'Field to place first in output table (default: gene_id)' + ), + make_option( + c("-r", "--no-header"), + action = "store_false", + default = TRUE, + type = 'logical', + help = 'Suppress header on output' + ), + make_option( + c("-l", "--fields"), + action = "store", + default = NULL, + type = 'character', + help = 'Comma-separated list of output fields to retain (default: all)' + ), + make_option( + c("-m", "--mito"), + action = "store_true", + default = FALSE, + type = 'character', + help = 'Mark mitochondrial elements with reference to chromsomes and biotypes' + ), + make_option( + c("-n", "--mito-chr"), + action = "store", + default = 'mt,mitochondrion_genome,mito,m,chrM,chrMt', + type = 'character', + help = 'If specified, marks in a column called "mito" features on the specified chromosomes (case insensitive)' + ), + make_option( + c("-p", "--mito-biotypes"), + action = "store", + default = 'mt_trna,mt_rrna,mt_trna_pseudogene', + type = 'character', + help = 'If specified, marks in a column called "mito" features with the specified biotypes (case insensitve)' + ), + make_option( + c("-c", "--filter-cdnas"), + action = "store", + default = NULL, + type = 'character', + help = 'If specified, sequences in the provided FASTA-format cDNAs file will be filtered to remove entries not present in the annotation' + ), + make_option( + c("-d", "--filter-cdnas-field"), + action = "store", + default = 'transcript_id', + type = 'character', + help = 'Where --filter-cdnas is specified, what field should be used to compare to identfiers from the FASTA?' + ), + make_option( + c("-e", "--filter-cdnas-output"), + action = "store", + default = 'filtered.fa.gz', + type = 'character', + help = 'Where --filter-cdnas is specified, what file should the filtered sequences be output to?' + ), + make_option( + c("-u", "--version-transcripts"), + action = "store_true", + default = FALSE, + type = 'logical', + help = 'Where the GTF contains transcript versions, should these be appended to transcript identifiers? Useful when generating transcript/gene mappings for use with transcriptomes.' + ), + make_option( + c("-o", "--output-file"), + action = "store", + default = NA, + type = 'character', + help = 'Output file path' + ) +) + +opt <- parse_args(OptionParser(option_list = option_list), convert_hyphens_to_underscores = TRUE) + +if (is.na(opt$gtf_file)){ + die('ERROR: No input GTF file specified') +} + +if (is.na(opt$output_file)){ + die('ERROR: No output file specified') +} + +# Import the GTF + +print(paste('Reading', opt$gtf_file, 'elements of type', opt$feature_type)) +gtf <- import(opt$gtf_file, feature.type = opt$feature_type ) + +# Combine basic info (chromosomes, coordinates) with annotation found in GTF attributes + +anno <- cbind(chromosome = seqnames(gtf), as.data.frame(ranges(gtf)), elementMetadata(gtf)) +print(paste('Found', nrow(anno), 'features')) + +# Mark mitochondrial features + +if (opt$mito){ + anno$mito <- ucfirst(as.character(tolower(anno$gene_biotype) %in% cleanlist(opt$mito_biotypes) | tolower(anno$chromosome) %in% cleanlist(opt$mito_chr))) +} + +# If specified, put the desired field first + +if (! is.na(opt$first_field)){ + if (! opt$first_field %in% colnames(anno)){ + die(paste(first_field, 'is not a valid field')) + } + anno <- anno[,c(opt$first_field, colnames(anno)[colnames(anno) != opt$first_field])] +} + +# Version transcripts + +if ( opt$feature_type == 'transcript' && opt$version_transcripts && all(c('transcript_id', 'transcript_version') %in% colnames(anno) )){ + anno$transcript_id <- paste(anno$transcript_id, anno$transcript_version, sep='.') +} + +# If specified, filter down a provided cDNA FASTA file + +if (! is.null(opt$filter_cdnas)){ + + print(paste("Filtering", opt$filter_cdnas, "to match the GTF")) + + suppressPackageStartupMessages(require(Biostrings)) + + cdna <- readDNAStringSet(opt$filter_cdnas) + cdna_transcript_names <- unlist(lapply(names(cdna), function(x) unlist(strsplit(x, ' '))[1] )) + + # Filter out cDNAs without matching transcript entries in the GTF + + if (! any(cdna_transcript_names %in% anno[[opt$filter_cdnas_field]])){ + die(paste("ERROR: None of the input sequences have matching", opt$filter_cdnas_field, 'values in the GTF file')) + } + + cdna <- cdna[which(cdna_transcript_names %in% anno[[opt$filter_cdnas_field]])] + + print(paste('Storing filtered seqeunces to', opt$filter_cdnas_output)) + writeXStringSet(x = cdna, filepath = opt$filter_cdnas_output, compress = 'gzip') +} + +# If specified, subset to desired fields + +if (! is.null(opt$fields) && opt$fields != ''){ + fields <- unlist(strsplit(opt$fields, ',')) + if (any(! fields %in% colnames(anno))){ + die(paste('ERROR:', fields, 'contains invalid field(s)')) + } + anno <- anno[,fields, drop = FALSE] + anno <- anno[apply(anno, 1, function(x) all(! is.na(x))), ] +} + +print(paste('Storing output to', opt$output_file)) +write.table(anno, file = opt$output_file, sep = "\t", quote=FALSE, row.names = FALSE, col.names = opt$no_header)