Mercurial > repos > ebi-gxa > gtf2gene_list
view gtf2featureAnnotation.R @ 1:247b439a78f7 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/util/.shed.yml commit 194d2e0af16624c9a3d1af92f7b3686d2e0ee552
author | ebi-gxa |
---|---|
date | Fri, 18 Oct 2019 10:10:54 -0400 |
parents | |
children | 00ee933b08fd |
line wrap: on
line source
#!/usr/bin/env Rscript # This script parses the GTF file to create a feature-wise annotation file with # mitochondrial features flagged, to assist in annotation and QC of single-cell # expression data analysis. suppressPackageStartupMessages(require(rtracklayer)) suppressPackageStartupMessages(require(optparse)) ucfirst <- function (str) { paste(toupper(substring(str, 1, 1)), tolower(substring(str, 2)), sep = "") } die <- function(message){ write(message, stderr()) q(status = 1) } cleanlist <- function(str){ tolower(unlist(strsplit(str, ','))) } cl <- commandArgs(trailingOnly = TRUE) option_list = list( make_option( c("-g", "--gtf-file"), action = "store", default = NA, type = 'character', help = "Path to a valid GTF file" ), make_option( c("-t", "--feature-type"), action = "store", default = 'gene', type = 'character', help = 'Feature type to use (default: gene)' ), make_option( c("-f", "--first-field"), action = "store", default = 'gene_id', type = 'character', help = 'Field to place first in output table (default: gene_id)' ), make_option( c("-r", "--no-header"), action = "store_false", default = TRUE, type = 'logical', help = 'Suppress header on output' ), make_option( c("-l", "--fields"), action = "store", default = NULL, type = 'character', help = 'Comma-separated list of output fields to retain (default: all)' ), make_option( c("-m", "--mito"), action = "store_true", default = FALSE, type = 'character', help = 'Mark mitochondrial elements with reference to chromsomes and biotypes' ), make_option( c("-n", "--mito-chr"), action = "store", default = 'mt,mitochondrion_genome,mito,m,chrM,chrMt', type = 'character', help = 'If specified, marks in a column called "mito" features on the specified chromosomes (case insensitive)' ), make_option( c("-p", "--mito-biotypes"), action = "store", default = 'mt_trna,mt_rrna,mt_trna_pseudogene', type = 'character', help = 'If specified, marks in a column called "mito" features with the specified biotypes (case insensitve)' ), make_option( c("-c", "--filter-cdnas"), action = "store", default = NULL, type = 'character', help = 'If specified, sequences in the provided FASTA-format cDNAs file will be filtered to remove entries not present in the annotation' ), make_option( c("-d", "--filter-cdnas-field"), action = "store", default = 'transcript_id', type = 'character', help = 'Where --filter-cdnas is specified, what field should be used to compare to identfiers from the FASTA?' ), make_option( c("-e", "--filter-cdnas-output"), action = "store", default = 'filtered.fa.gz', type = 'character', help = 'Where --filter-cdnas is specified, what file should the filtered sequences be output to?' ), make_option( c("-u", "--version-transcripts"), action = "store_true", default = FALSE, type = 'logical', help = 'Where the GTF contains transcript versions, should these be appended to transcript identifiers? Useful when generating transcript/gene mappings for use with transcriptomes.' ), make_option( c("-o", "--output-file"), action = "store", default = NA, type = 'character', help = 'Output file path' ) ) opt <- parse_args(OptionParser(option_list = option_list), convert_hyphens_to_underscores = TRUE) if (is.na(opt$gtf_file)){ die('ERROR: No input GTF file specified') } if (is.na(opt$output_file)){ die('ERROR: No output file specified') } # Import the GTF print(paste('Reading', opt$gtf_file, 'elements of type', opt$feature_type)) gtf <- import(opt$gtf_file, feature.type = opt$feature_type ) # Combine basic info (chromosomes, coordinates) with annotation found in GTF attributes anno <- cbind(chromosome = seqnames(gtf), as.data.frame(ranges(gtf)), elementMetadata(gtf)) print(paste('Found', nrow(anno), 'features')) # Mark mitochondrial features if (opt$mito){ anno$mito <- ucfirst(as.character(tolower(anno$gene_biotype) %in% cleanlist(opt$mito_biotypes) | tolower(anno$chromosome) %in% cleanlist(opt$mito_chr))) } # If specified, put the desired field first if (! is.na(opt$first_field)){ if (! opt$first_field %in% colnames(anno)){ die(paste(first_field, 'is not a valid field')) } anno <- anno[,c(opt$first_field, colnames(anno)[colnames(anno) != opt$first_field])] } # Version transcripts if ( opt$feature_type == 'transcript' && opt$version_transcripts && all(c('transcript_id', 'transcript_version') %in% colnames(anno) )){ anno$transcript_id <- paste(anno$transcript_id, anno$transcript_version, sep='.') } # If specified, filter down a provided cDNA FASTA file if (! is.null(opt$filter_cdnas)){ print(paste("Filtering", opt$filter_cdnas, "to match the GTF")) suppressPackageStartupMessages(require(Biostrings)) cdna <- readDNAStringSet(opt$filter_cdnas) cdna_transcript_names <- unlist(lapply(names(cdna), function(x) unlist(strsplit(x, ' '))[1] )) # Filter out cDNAs without matching transcript entries in the GTF if (! any(cdna_transcript_names %in% anno[[opt$filter_cdnas_field]])){ die(paste("ERROR: None of the input sequences have matching", opt$filter_cdnas_field, 'values in the GTF file')) } cdna <- cdna[which(cdna_transcript_names %in% anno[[opt$filter_cdnas_field]])] print(paste('Storing filtered seqeunces to', opt$filter_cdnas_output)) writeXStringSet(x = cdna, filepath = opt$filter_cdnas_output, compress = 'gzip') } # If specified, subset to desired fields if (! is.null(opt$fields) && opt$fields != ''){ fields <- unlist(strsplit(opt$fields, ',')) if (any(! fields %in% colnames(anno))){ die(paste('ERROR:', fields, 'contains invalid field(s)')) } anno <- anno[,fields, drop = FALSE] anno <- anno[apply(anno, 1, function(x) all(! is.na(x))), ] } print(paste('Storing output to', opt$output_file)) write.table(anno, file = opt$output_file, sep = "\t", quote=FALSE, row.names = FALSE, col.names = opt$no_header)