diff tools/maf/interval_maf_to_merged_fasta.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/maf/interval_maf_to_merged_fasta.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,196 @@
+#!/usr/bin/env python
+
+"""
+Reads an interval or gene BED and a MAF Source.
+Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates
+
+Alignment blocks are layered ontop of each other based upon score.
+
+usage: %prog maf_file [options]
+   -d, --dbkey=d: Database key, ie hg17
+   -c, --chromCol=c: Column of Chr
+   -s, --startCol=s: Column of Start
+   -e, --endCol=e: Column of End
+   -S, --strandCol=S: Column of Strand
+   -G, --geneBED: Input is a Gene BED file, process and join exons as one region
+   -t, --mafSourceType=t: Type of MAF source to use
+   -m, --mafSource=m: Path of source MAF file, if not using cached version
+   -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
+   -i, --interval_file=i:       Input interval file
+   -o, --output_file=o:      Output MAF file
+   -p, --species=p: Species to include in output
+   -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species.
+   -z, --mafIndexFileDir=z: Directory of local maf_index.loc file
+
+usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR
+"""
+
+#Dan Blankenberg
+from galaxy import eggs
+from galaxy.tools.util import maf_utilities
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+import bx.intervals.io
+import sys
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def __main__():
+    
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    mincols = 0
+    strand_col = -1
+    
+    if options.dbkey:
+        primary_species = options.dbkey
+    else:
+        primary_species = None
+    if primary_species in [None, "?", "None"]:
+        stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )
+    
+    include_primary = True
+    secondary_species = maf_utilities.parse_species_option( options.species )
+    if secondary_species:
+        species = list( secondary_species ) # make copy of species list
+        if primary_species in secondary_species:
+            secondary_species.remove( primary_species )
+        else:
+            include_primary = False
+    else:
+        species = None
+    
+    if options.interval_file:
+        interval_file = options.interval_file
+    else: 
+        stop_err( "Input interval file has not been specified." )
+    
+    if options.output_file:
+        output_file = options.output_file
+    else: 
+        stop_err( "Output file has not been specified." )
+    
+    if not options.geneBED:
+        if options.chromCol:
+            chr_col = int( options.chromCol ) - 1
+        else: 
+            stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." )
+        
+        if options.startCol:
+            start_col = int( options.startCol ) - 1
+        else: 
+            stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." )
+        
+        if options.endCol:
+            end_col = int( options.endCol ) - 1
+        else: 
+            stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." )
+        
+        if options.strandCol:
+            strand_col = int( options.strandCol ) - 1
+    
+    mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir
+    
+    overwrite_with_gaps = True
+    if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false':
+        overwrite_with_gaps = False
+    
+    #Finish parsing command line
+        
+    #get index for mafs based on type 
+    index = index_filename = None
+    #using specified uid for locally cached
+    if options.mafSourceType.lower() in ["cached"]:
+        index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile )
+        if index is None:
+            stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) )
+    elif options.mafSourceType.lower() in ["user"]:
+        #index maf for use here, need to remove index_file when finished
+        index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] )
+        if index is None:
+            stop_err( "Your MAF file appears to be malformed." )
+    else:
+        stop_err( "Invalid MAF source type specified." )
+    
+    #open output file
+    output = open( output_file, "w" )
+    
+    if options.geneBED:
+        region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() )
+    else:
+        region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) )
+    
+    #Step through intervals
+    regions_extracted = 0
+    line_count = 0
+    for line_count, line in region_enumerator:
+        try:
+            if options.geneBED: #Process as Gene BED
+                try:
+                    starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line )
+                    #create spliced alignment object
+                    alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
+                    primary_name = secondary_name = fields[3]
+                    alignment_strand = fields[5]
+                except Exception, e:
+                    print "Error loading exon positions from input line %i: %s" % ( line_count, e )
+                    continue
+            else: #Process as standard intervals
+                try:
+                    #create spliced alignment object
+                    alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
+                    primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end )
+                    secondary_name = ""
+                    alignment_strand = line.strand
+                except Exception, e:
+                    print "Error loading region positions from input line %i: %s" % ( line_count, e )
+                    continue
+            
+            #Write alignment to output file
+            #Output primary species first, if requested
+            if include_primary:
+                output.write( ">%s.%s\n" %( primary_species, primary_name ) )
+                if alignment_strand == "-":
+                    output.write( alignment.get_sequence_reverse_complement( primary_species ) )
+                else:
+                    output.write( alignment.get_sequence( primary_species ) )
+                output.write( "\n" )
+            #Output all remainging species
+            for spec in secondary_species or alignment.get_species_names( skip = primary_species ):
+                if secondary_name:
+                    output.write( ">%s.%s\n" % ( spec, secondary_name ) )
+                else:
+                    output.write( ">%s\n" % ( spec ) )
+                if alignment_strand == "-":
+                    output.write( alignment.get_sequence_reverse_complement( spec ) )
+                else:
+                    output.write( alignment.get_sequence( spec ) )
+                output.write( "\n" )
+            
+            output.write( "\n" )
+            
+            regions_extracted += 1
+        
+        except Exception, e:
+            print "Unexpected error from input line %i: %s" % ( line_count, e )
+            continue
+    
+    #close output file
+    output.close()
+    
+    #remove index file if created during run
+    maf_utilities.remove_temp_index_file( index_filename )
+    
+    #Print message about success for user
+    if regions_extracted > 0:
+        print "%i regions were processed successfully." % ( regions_extracted )
+    else:
+        print "No regions were processed successfully."
+        if line_count > 0 and options.geneBED:
+            print "This tool requires your input file to conform to the 12 column BED standard."
+
+if __name__ == "__main__": __main__()