GMAPDB %s

# HG changeset patch # User jjohnson # Date 1318956162 14400 # Node ID d58d272914e7e7da99c097334899ffd6b60b4ab6 Uploaded diff -r 000000000000 -r d58d272914e7 gmap/README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gmap/README Tue Oct 18 12:42:42 2011 -0400 @@ -0,0 +1,50 @@ + +GMAP and GSNAP use added datatypes: + + add datatype definition file: lib/galaxy/datatypes/gmap.py + + add the following import line to: lib/galaxy/datatypes/registry.py + import gmap # added for gmap tools + + add to datatypes_conf.xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (split_output == False) + + + + + + + + + + (split_output == True) + + + + + + + + + (split_output == True) + + + + + + + + + (split_output == True) + + + + + + + + + (split_output == True) + + + + + + + + + + + + + +**What it does** + +GMAP_ (Genomic Mapping and Alignment Program) The functionality provided by gmap allows a user to: (1) map and align a single cDNA interactively against a large genome in about a second, without the startup time of several minutes typically needed by existing mapping programs; (2) switch arbitrarily among different genomes, without the need for a preloaded server dedicated to each genome; (3) run the program on computers with as little as 128 MB of RAM (random access memory); (4) perform high-throughput batch processing of cDNAs by using memory mapping and multithreading when appropriate memory and hardware are available; (5) generate accurate gene models, even in the presence of substantial polymorphisms and sequence errors; (6) locate splice sites accurately without the use of probabilistic splice site models, allowing generalized use of the program across species; (7) detect statistically significant microexons and 
incorporate them into the alignment; and (8) handle mapping and alignment tasks on genomes having alternate assemblies, linkage groups or strains. It is developed by Thomas D. Wu of Genentech, Inc. + +Publication_ citation: Thomas D. Wu, Colin K. Watanabe Bioinformatics 2005 21(9):1859-1875; doi:10.1093/bioinformatics/bti310 + +.. _GMAP: http://research-pub.gene.com/gmap/ +.. _Publication: http://bioinformatics.oxfordjournals.org/cgi/content/full/21/9/1859 + +------ + +**Know what you are doing** + +.. class:: warningmark + +You will want to read the README_ + +.. _README: http://research-pub.gene.com/gmap/src/README + + + + diff -r 000000000000 -r d58d272914e7 gmap/gmap_build.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gmap/gmap_build.xml Tue Oct 18 12:42:42 2011 -0400 @@ -0,0 +1,163 @@ + + a GMAP DB Index + + gmap_build + + gmap --version + /bin/bash $shscript 2>1 1> $output + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +#!/bin/bash +#set $ds = chr(36) +#set $gt = chr(62) +#set $lt = chr(60) +#set $ad = chr(38) +#import os.path +#set $gmapdb = $output.extra_files_path +#set $mapsdir = $os.path.join($os.path.join($gmapdb,str($refname)), str($refname) + '.maps') +mkdir -p $gmapdb +## export GMAPDB required for cmetindex and atoiindex +export GMAPDB=$gmapdb +#for $k in $kmer.__str__.split(','): +gmap_build -D $gmapdb -d $refname -s numeric-alpha -k $k $input +#end for +get-genome -D $gmapdb -d '?' 
| sed 's/^Available .*/gmap db: /' +echo "kmers: " $kmer +#if $splicesite.splice_source == 'refGeneTable': +#if $splicesite.refGenes.__str__ != 'None': +cat $splicesite.refGenes | psl_splicesites -s $splicesite.col_skip | iit_store -o $os.path.join($mapsdir,'splicesites') +cat $splicesite.refGenes | psl_introns -s $splicesite.col_skip | iit_store -o $os.path.join($mapsdir,'introns') +#end if +#elif $splicesite.splice_source == 'gtf': +#if $splicesite.gtfGenes.__str__ != 'None': +cat $splicesite.gtfGenes | gtf_splicesites | iit_store -o $os.path.join($mapsdir,'splicesites') +cat $splicesite.gtfGenes | gtf_introns | iit_store -o $os.path.join($mapsdir,'introns') +#end if +#elif $splicesite.splice_source == 'gff3': +#if $splicesite.gff3Genes.__str__ != 'None': +cat $splicesite.gff3Genes | gff3_splicesites | iit_store -o $os.path.join($mapsdir,'splicesites') +cat $splicesite.gff3Genes | gff3_introns | iit_store -o $os.path.join($mapsdir,'introns') +#end if +#end if +#if $dbsnp.snp_source == 'snpTable': +#if $dbsnp.snps.__str__ != 'None': +#if $dbsnp.snpsex.__str__ != 'None': +cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight -e $dbsnp.snpsex | iit_store -o $os.path.join($mapsdir,'snps') +#else: +cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight | iit_store -o $os.path.join($mapsdir,'snps') +#end if +snpindex -d $refname -v snps +#end if +#end if +#if $cmetindex.__str__ == 'yes': +cmetindex -d $refname +echo "cmetindex" +#end if +#if $atoiindex.__str__ == 'yes': +atoiindex -d $refname +echo "atoiindex" +#end if +get-genome -D $gmapdb -d $refname -m '?' | sed 's/^Available maps .*/maps: /' + + + + + + + + + +**GMAP Build** + +GMAP Build creates an index of a genomic sequence for alignments using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). + +You will want to read the README_ + +Publication_ citation: Thomas D. Wu, Colin K. 
Watanabe Bioinformatics 2005 21(9):1859-1875; doi:10.1093/bioinformatics/bti310 + +.. _GMAP: http://research-pub.gene.com/gmap/ +.. _GSNAP: http://research-pub.gene.com/gmap/ +.. _README: http://research-pub.gene.com/gmap/src/README +.. _Publication: http://bioinformatics.oxfordjournals.org/cgi/content/full/21/9/1859 + + + + + diff -r 000000000000 -r d58d272914e7 gmap/gsnap.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gmap/gsnap.xml Tue Oct 18 12:42:42 2011 -0400 @@ -0,0 +1,585 @@ + + Genomic Short-read Nucleotide Alignment Program + + gsnap + + gsnap --version + + #import os.path, re + gsnap + --nthreads="4" --ordered + #if $refGenomeSource.genomeSource == "history": + --gseg=$refGenomeSource.ownFile + #elif $refGenomeSource.genomeSource == "gmapdb": + #set $gmapdb = $os.listdir($refGenomeSource.gmapdb.extra_files_path)[0] + --dir=$refGenomeSource.gmapdb.extra_files_path --db=$gmapdb + #if $refGenomeSource.kmer != None and len($refGenomeSource.kmer.__str__) == 2: + --kmer=$refGenomeSource.kmer + #end if + #if $refGenomeSource.splicemap != None and len($refGenomeSource.splicemap.__str__) == 2: + --use-splices=$refGenomeSource.splicemap + #end if + #if $refGenomeSource.snpindex != None and len($refGenomeSource.snpindex.__str__) == 2: + --use-snps=$refGenomeSource.snpindex + #end if + #else: + --dir=$os.path.dirname($refGenomeSource.gmapindex.value) --db=$os.path.basename($refGenomeSource.gmapindex.value) + #if $refGenomeSource.kmer != None and len($refGenomeSource.kmer.__str__) == 2: + --kmer=$refGenomeSource.kmer + #end if + #end if + #if $mode.__str__ != '': + --mode=$mode + #end if + #if $computation.options == "advanced": + #if $computation.max_mismatches.__str__ != '': + --max-mismatches=$computation.max_mismatches + #end if + $computation.query_unk_mismatch + $computation.genome_unk_mismatch + #if $computation.terminal_threshold.__str__ != '': + --terminal-threshold=$computation.terminal_threshold + #end if + #if $computation.indel_penalty.__str__ != 
'': + --indel-penalty=$computation.indel_penalty + #end if + #if $computation.indel_endlength.__str__ != '': + --indel-endlength=$computation.indel_endlength + #end if + #if $computation.max_middle_insertions.__str__ != '': + --max-middle-insertions=$computation.max_middle_insertions + #end if + #if $computation.max_middle_deletions.__str__ != '': + --max-middle-deletions=$computation.max_middle_deletions + #end if + #if $computation.max_end_insertions.__str__ != '': + --max-end-insertions=$computation.max_end_insertions + #end if + #if $computation.max_end_deletions.__str__ != '': + --max-end-deletions=$computation.max_end_deletions + #end if + #if $computation.suboptimal_levels.__str__ != '': + --suboptimal-levels=$computation.suboptimal_levels + #end if + #if $computation.adapter_strip.__str__ != '': + --adapter-strip=$computation.adapter_strip + #end if + ## gmap options + #if $computation.gmap_mode.__str__ != '' and $computation.gmap_mode.__str__ != 'None': + --gmap-mode='$computation.gmap_mode' + #end if + #if $computation.trigger_score_for_gmap.__str__ != '': + --trigger-score-for-gmap=$computation.trigger_score_for_gmap + #end if + #if $computation.max_gmap_pairsearch.__str__ != '' and $re.search("pairsearch",$computation.gmap_mode): + --max-gmap-pairsearch=$computation.max_gmap_pairsearch + #end if + #if $computation.max_gmap_terminal.__str__ != '' and $re.search("terminal",$computation.gmap_mode): + --max-gmap-terminal=$computation.max_gmap_terminal + #end if + #if $computation.max_gmap_improvement.__str__ != '' and $re.search("improv",$computation.gmap_mode): + --max-gmap-improvement=$computation.max_gmap_improvement + #end if + #if $computation.microexon_spliceprob.__str__ != '': + --microexon-spliceprob=$computation.microexon_spliceprob + #end if + #end if + #if $splicing.options == "advanced": + $splicing.novelsplicing + #if $splicing.localsplicedist.__str__ != '': + --localsplicedist=$splicing.localsplicedist + #end if + #if 
$splicing.local_splice_penalty.__str__ != '': + --local-splice-penalty=$splicing.local_splice_penalty + #end if + #if $splicing.distant_splice_penalty.__str__ != '': + --distant-splice-penalty=$splicing.distant_splice_penalty + #end if + #if $splicing.local_splice_endlength.__str__ != '': + --local-splice-endlength=$splicing.local_splice_endlength + #end if + #if $splicing.distant_splice_endlength.__str__ != '': + --distant-splice-endlength=$splicing.distant_splice_endlength + #end if + #if $splicing.distant_splice_identity.__str__ != '': + --distant-splice-identity=$splicing.distant_splice_identity + #end if + #end if + #if $output.options == "advanced": + #if $output.npath.__str__ != '': + --npath=$output.npath + #end if + $output.quiet_if_excessive + $output.show_refdiff + $output.clip_overlap + #end if + #if $result.format == "sam": + --format=sam + $result.no_sam_headers + #if $result.read_group_id.__str__.strip != '': + --read-group-id='$result.read_group_id' + #end if + #if $result.read_group_name.__str__ != '': + --read-group-name='$result.read_group_name' + #end if + #if $result.read_group_library.__str__ != '': + --read-group-library='$result.read_group_library' + #end if + #if $result.read_group_platform.__str__ != '': + --read-group-platform='$result.read_group_platform' + #end if + #if $result.quality_shift.__str__ != '': + --quality-shift=$result.quality_shift + #end if + #elif $result.format == "goby": + #if $result.goby_output.__str__ != '': + --goby-output='$result.goby_output' + #end if + #if $result.creads_window_start.__str__ != '': + --creads-window-start=$result.creads_window_start + #end if + #if $result.creads_window_end.__str__ != '': + --creads-window-end=$result.creads_window_end + #end if + $result.creads_complement + #end if + ## TODO - do we need these options (Is it tally XOR runlength?): + ## --tallydir= --use-tally=tally + ## --runlengthdir --use-runlength=runlength + #if $seq.format == "gsnap_fasta": + $seq.circularinput $seq.gsnap 
+ #else if $seq.format == "fastq": + #if $seq.barcode_length.__str__ != '': + --barcode-length=$seq.barcode_length + #end if + #if $seq.fastq_id_start.__str__ != '': + --fastq-id-start=$seq.fastq_id_start + #end if + #if $seq.fastq_id_end.__str__ != '': + --fastq-id-end=$seq.fastq_id_end + #end if + #if $seq.filter_chastity.__str__ != 'off': + --filter-chastity=$seq.filter_chastity + #end if + #if $seq.paired.ispaired.__str__ == "yes": + #if $seq.paired.pairmax_dna.__str__ != '': + --pairmax-dna=$seq.paired.pairmax_dna + #end if + #if $seq.paired.pairmax_rna.__str__ != '': + --pairmax-rna=$seq.paired.pairmax_rna + #end if + $seq.fastq $seq.paired.fastq + #else + $seq.fastq + #end if + #end if + #if $split_output == True + 2> $gsnap_stderr + #else + 2> $gsnap_stderr > $results + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (split_output == False) + + + + + + + (split_output == True) + + + + + + (split_output == True) + + + + + + (split_output == True) + + + + + + (split_output == True) + + + + + + (split_output == True) + + + + + + (split_output == True) + + + + + + (split_output == True) + + + + + + (split_output == True) + + + + + + (split_output == True) + + + + + + + + + + + +**What it does** + +GSNAP_ (Genomic Short-read Nucleotide Alignment Program) is a short read aligner which can align both single- and paired-end reads as short as 14nt and of arbitrarily long length. It can detect short- and long-distance splicing, including interchromosomal splicing, in individual reads, using probabilistic models or a database of known splice sites. 
Our program also permits SNP-tolerant alignment to a reference space of all possible combinations of major and minor alleles, and can align reads from bisulfite-treated DNA for the study of methylation state. It is developed by Thomas D. Wu of Genentech, Inc. +Publication_ citation: Thomas D. Wu, Serban Nacu "Fast and SNP-tolerant detection of complex variants and splicing in short reads." Bioinformatics. 2010 Apr 1;26(7):873-81. Epub 2010 Feb 10. + +.. _GSNAP: http://research-pub.gene.com/gmap/ +.. _Publication: http://bioinformatics.oupjournals.org/cgi/content/full/26/7/873 +http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2844994/?tool=pubmed + +------ + +**Know what you are doing** + +.. class:: warningmark + +You will want to read the README_ + +.. _README: http://research-pub.gene.com/gmap/src/README + +------ + +**Input formats** + +Input to GSNAP should be either in FASTQ or FASTA format. + +The FASTQ input may include quality scores, which will then be included in SAM +output, if that output format is selected. + +For FASTA format, you should include one line per read (or end of a +paired-end read). The same FASTA file can have a mixture of +single-end and paired-end reads of varying lengths, if desired. + +Single-end reads: + +Each FASTA entry should contain one short read per line, like this + +>Header information +AAAACATTCTCCTCCGCATAAGCCTGCGTCAGATTA + +Each short read can have a different length. However, the entire read +needs to be on a single line, and may not wrap around multiple lines. +If it extends to a second line, GSNAP will think that the read is +paired-end. + + +Paired-end reads: + +Each FASTA entry should contain two short reads, one per line, like +this + +>Header information +AAAACATTCTCCTCCGCATAAGCCTAGTAGATTA +GGCGTAGGTAGAAGTAGAGGTTAAGGCGCGTCAG + +By default, the program assumes that the second end is in the reverse +complement direction compared with the first end.
If they are in the +same direction, you may need to use the --circular-input (or -c) flag. + +( The Galaxy tool: "FASTA Width formatter" can be used to reformat fasta files to have single line sequences. ) + +------ + +**Output formats in GSNAP** + +SAM output format + +Default GSNAP format + See the README_ + + + + + + + diff -r 000000000000 -r d58d272914e7 gmap/lib/galaxy/datatypes/gmap.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gmap/lib/galaxy/datatypes/gmap.py Tue Oct 18 12:42:42 2011 -0400 @@ -0,0 +1,169 @@ +""" +GMAP indexes +""" +import logging +import os,os.path,re +from data import Text +from metadata import MetadataElement + +log = logging.getLogger(__name__) + +class GmapDB( Text ): + """ + A GMAP DB for indexes + """ + MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True ) + MetadataElement( name="basesize", default="12", desc="The basesize for offsetscomp", visible=True, readonly=True ) + MetadataElement( name="kmers", default=[''], desc="The kmer sizes for indexes", visible=True, no_value=[''], readonly=True ) + MetadataElement( name="map_dir", desc="The maps directory", default='unknown', set_in_upload=True, readonly=True ) + MetadataElement( name="maps", default=[''], desc="The names of maps stored for this gmap gmapdb", visible=True, no_value=[''], readonly=True ) + MetadataElement( name="snps", default=[''], desc="The names of SNP indexes stored for this gmapdb", visible=True, no_value=[''], readonly=True ) + MetadataElement( name="cmet", default=False, desc="Has a cmet index", visible=True, readonly=True ) + MetadataElement( name="atoi", default=False, desc="Has a atoi index", visible=True, readonly=True ) + + file_ext = 'gmapdb' + is_binary = True + composite_type = 'auto_primary_file' + allow_datatype_change = False + + def generate_primary_file( self, dataset = None ): + """ + This is called only at upload to write the html file + cannot rename the datasets here 
- they come with the default unfortunately + """ + return 'AutoGenerated Primary File for Composite Dataset' + + def regenerate_primary_file(self,dataset): + """ + cannot do this until we are setting metadata + """ + bn = dataset.metadata.db_name + log.info( "GmapDB regenerate_primary_file %s" % (bn)) + rval = ['GMAPDB %s

GMAPDB %s

cmet %s
atoi %s

Maps:

%s' % name) + rval.append( '

' ) + f = file(dataset.file_name,'w') + f.write("\n".join( rval )) + f.write('\n') + f.close() + + def set_peek( self, dataset, is_multi_byte=False ): + log.info( "GmapDB set_peek %s" % (dataset)) + if not dataset.dataset.purged: + dataset.peek = "GMAPDB index %s\n cmet %s\n atoi %s\n maps %s" % ( dataset.metadata.db_name,dataset.metadata.cmet,dataset.metadata.atoi,dataset.metadata.maps ) + dataset.blurb = "GMAPDB %s" % ( dataset.metadata.db_name ) + else: + dataset.peek = 'file does not exist' + dataset.blurb = 'file purged from disk' + def display_peek( self, dataset ): + try: + return dataset.peek + except: + return "GMAP index file" + def sniff( self, filename ): + return False + def set_meta( self, dataset, overwrite = True, **kwd ): + """ + Expecting: + extra_files_path//db_name>.ref3 + extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp + extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions + extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs + index maps: + extra_files_path/db_name/db_name.maps/*.iit + """ + log.info( "GmapDB set_meta %s %s" % (dataset,dataset.extra_files_path)) + pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?' 
+ efp = dataset.extra_files_path + flist = os.listdir(efp) + for i,fname in enumerate(flist): + log.info( "GmapDB set_meta %s %s" % (i,fname)) + fpath = os.path.join(efp,fname) + if os.path.isdir(fpath): + ilist = os.listdir(fpath) + kmers = {'':'default'} # HACK '' empty key added so user has default choice when selecting kmer from metadata + for j,iname in enumerate(ilist): + log.info( "GmapDB set_meta file %s %s" % (j,iname)) + ipath = os.path.join(fpath,iname) + if os.path.isdir(ipath): # find maps + dataset.metadata.map_dir = iname + for mapfile in os.listdir(ipath): + mapname = mapfile.replace('.iit','') + log.info( "GmapDB set_meta map %s %s" % (mapname,mapfile)) + dataset.metadata.maps.append(mapname) + else: + m = re.match(pat,iname) + if m: + log.info( "GmapDB set_meta m %s %s " % (iname, m)) + assert len(m.groups()) == 10 + dataset.metadata.db_name = fname + if m.groups()[2] == 'ref': + if m.groups()[-1] != None: + dataset.metadata.snps.append(m.groups()[-1]) + else: + if m.groups()[-3] != None: + k = int(m.groups()[-3]) + kmers[k] = k + if m.groups()[-4] != None: + dataset.metadata.basesize = int( m.groups()[-4]) + elif m.groups()[3] == 'met': + dataset.metadata.cmet = True + elif m.groups()[4] == 'a2i': + dataset.metadata.atoi = True + dataset.metadata.kmers = kmers.keys() + +## class IntervalIndexTree( Text ): +## """ +## A GMAP Interval Index Tree Map +## created by iit_store +## (/path/to/map)/(mapname).iit +## """ +## MetadataElement( name="map_name", desc="The map name for this index set", default='unknown', set_in_upload=True, readonly=False ) +## file_ext = 'iit' +## is_binary = True +## composite_type = 'auto_primary_file' +## allow_datatype_change = False +## +## class IntervalAnnotation(data.Text): +## """ +## Class describing a GMAP Interval format: +## >label coords optional_tag +## optional_annotation (which may be zero, one, or multiple lines) +## The coords should be of the form: +## chr:position +## chr:startposition..endposition +## 
""" +## file_ext = 'gmapannotation' +## +## class SpliceSiteAnnotation(IntervalAnnotation): +## file_ext = 'gmapsplicesites' +## """ +## Example: +## >NM_004448.ERBB2.exon1 17:35110090..35110091 donor 6678 +## >NM_004448.ERBB2.exon2 17:35116768..35116769 acceptor 6678 +## >NM_004448.ERBB2.exon2 17:35116920..35116921 donor 1179 +## >NM_004448.ERBB2.exon3 17:35118099..35118100 acceptor 1179 +## >NM_004449.ERG.exon1 21:38955452..38955451 donor 783 +## >NM_004449.ERG.exon2 21:38878740..38878739 acceptor 783 +## >NM_004449.ERG.exon2 21:38878638..38878637 donor 360 +## >NM_004449.ERG.exon3 21:38869542..38869541 acceptor 360 +## """ +## +## class IntronAnnotation(IntervalAnnotation): +## file_ext = 'gmapintrons' +## """ +## Example: +## >NM_004448.ERBB2.intron1 17:35110090..35116769 +## >NM_004448.ERBB2.intron2 17:35116920..35118100 +## >NM_004449.ERG.intron1 21:38955452..38878739 +## >NM_004449.ERG.intron2 21:38878638..38869541 +## """ +## +## class SNPAnnotation(IntervalAnnotation): +## file_ext = 'gmapsnps' +## """ +## Example: +## >rs62211261 21:14379270 CG +## >rs62211262 21:14379281 CG +## """