#!/usr/bin/python
# HG changeset patch
# User gregory-minevich
# Date 1332805033 14400
# Node ID 6bd8660f3a8f5fee6c0a48fa5592bedb00b7816b
# Parent  bc7cc93ef659af8908427ca943b4072e678330a7
# Uploaded
#
# Changeset bc7cc93ef659 -> 6bd8660f3a8f touched:
#   ._checkSnpEffCandidates.py   (binary file changed)
#   ._checkSnpEffCandidates.xml  (binary file changed)
#   checkSnpEffCandidates.py     (new file, Mon Mar 26 19:37:13 2012 -0400)
#   checkSnpEffCandidates.xml    (new file, Mon Mar 26 19:37:13 2012 -0400)
"""Marks up a snpEff output file with matches to a gene candidate list.

Tool wrapper command line (from checkSnpEffCandidates.xml)::

    checkSnpEffCandidates.py -s $snpeff_file -c $candidate_list -o $output

Wrapper requirements: sys, optparse, csv

**What it does:**

Indicates on a SnpEff output file which genes are found in a candidate list
by comparing Gene IDs.

For a description of the snpEff variant annotation and effect prediction
tool:

http://snpeff.sourceforge.net

------

**Input:**

The candidate list should be in a tabular format with two columns: Gene ID
and Gene Description (e.g. C55B7.12 and transcription_factor). The file
should contain no headers.

Useful candidate lists (e.g. transcription factors, genes expressed in
neurons, transgene silencers, chromatin factors) are available on the Hobert
Lab website:

http://biochemistry.hs.columbia.edu/labs/hobert/literature.html

------

**Citation:**

This tool is part of the CloudMap package from the Hobert Lab. If you use
this tool, please cite `Gregory Minevich, Danny Park, Richard J. Poole and
Oliver Hobert. CloudMap: A Cloud-based Pipeline for Analysis of Mutant Genome
Sequences. (2012 In Preparation)`__

    .. __: http://biochemistry.hs.columbia.edu/labs/hobert/literature.html

Correspondence to gm2123@columbia.edu (G.M.) or or38@columbia.edu (O.H.)
"""

import sys
import optparse
import csv
import re

# Zero-based index of the Gene ID column in a snpEff output row.
GENE_ID_COLUMN = 9


def _open_tabular(path, mode):
    """Open *path* for use with the csv module.

    *mode* is 'r' (read) or 'w' (write).  On Python 3 the file is opened in
    text mode with ``newline=''`` as the csv documentation requires; on
    Python 2 the script's original modes ('rU' for reading, 'wb' for
    writing) are used.
    """
    if sys.version_info[0] >= 3:
        return open(path, mode, newline='')
    return open(path, 'rU' if mode == 'r' else 'wb')


def main(argv=None):
    """Parse the command line and annotate the snpEff file.

    argv -- optional list of command-line arguments (without the program
            name); when None, optparse falls back to ``sys.argv[1:]``.

    Exits through ``parser.error`` (usage message, status 2) when any of the
    three options is missing, instead of failing later inside ``open``.
    """
    parser = optparse.OptionParser()
    parser.add_option('-s', '--snpeff_file', dest = 'snpeff_file', action = 'store', type = 'string', default = None, help = "Path to the snpEff file")
    parser.add_option('-c', '--candidate_list', dest = 'candidate_list', action = 'store', type = 'string', default = None, help = "Two column tabular list of candidate Gene ID, Type")
    parser.add_option('-o', '--output', dest = 'output', action = 'store', type = 'string', default = None, help = "Output file name")
    (options, args) = parser.parse_args(argv)

    required = (
        ('--snpeff_file', options.snpeff_file),
        ('--candidate_list', options.candidate_list),
        ('--output', options.output),
    )
    missing = [flag for flag, value in required if not value]
    if missing:
        parser.error("missing required option(s): " + ", ".join(missing))

    candidates = parse_candidate_list(candidate_list = options.candidate_list)
    mark_snpeff_file(snpeff_file = options.snpeff_file, output = options.output, candidates = candidates)


def skip_and_write_headers(writer = None, reader = None, i_file = None):
    """Copy the leading '#' header rows from *reader* to *writer*.

    writer -- csv writer receiving the header rows
    reader -- csv reader built on top of *i_file*
    i_file -- the seekable file object underneath *reader*

    On return, *i_file* has been rewound and the header rows re-read, so the
    next row produced by *reader* is the first non-header row.  An empty
    file, a file made only of header rows, or a blank line ends the header
    scan cleanly.
    """
    # count headers
    header_count = 0
    for row in reader:
        if not (row and row[0].startswith('#')):
            break
        header_count += 1

    # The scan above consumed the first data row as well, so rewind and
    # replay exactly the header rows into the output.
    i_file.seek(0)
    for _ in range(header_count):
        writer.writerow(next(reader))


def parse_candidate_list(candidate_list = ""):
    """Read the two-column, tab-separated candidate file into a dict.

    candidate_list -- path to the candidate file (Gene ID, Type)

    Returns a dict mapping Gene ID (column 0) to Type (column 1); a Gene ID
    that appears more than once keeps its last Type.  Blank lines are
    skipped.  A non-blank row with a single column raises IndexError.
    """
    candidates = {}
    with _open_tabular(candidate_list, 'r') as i_file:
        for row in csv.reader(i_file, delimiter = '\t'):
            if not row:
                # blank line (e.g. trailing newline at end of file)
                continue
            candidates[row[0]] = row[1]

    return candidates


def mark_snpeff_file(snpeff_file = "", output = "", candidates = None, gene_id_column = GENE_ID_COLUMN):
    """Write *snpeff_file* to *output* with one extra trailing column.

    snpeff_file    -- path to the tab-separated snpEff file
    output         -- path of the annotated file to create
    candidates     -- dict of Gene ID -> Type; None is treated as empty
    gene_id_column -- zero-based index of the Gene ID column (default 9)

    Leading '#' header rows are copied unchanged.  Every other non-blank row
    gets the candidate Type appended when its Gene ID is in *candidates*,
    otherwise an empty field.  Rows too short to contain the Gene ID column
    are treated as non-matching; blank rows are passed through untouched.
    """
    if candidates is None:
        candidates = {}

    with _open_tabular(snpeff_file, 'r') as i_file:
        with _open_tabular(output, 'w') as o_file:
            reader = csv.reader(i_file, delimiter = '\t')
            writer = csv.writer(o_file, delimiter = '\t')

            skip_and_write_headers(writer = writer, reader = reader, i_file = i_file)

            for row in reader:
                if row:
                    if len(row) > gene_id_column:
                        gene_type = candidates.get(row[gene_id_column], '')
                    else:
                        gene_type = ''
                    row.append(gene_type)

                writer.writerow(row)


if __name__ == "__main__":
    main()