Mercurial > repos > davidvanzessen > shm_csr
view baseline/script_imgt.py @ 79:98e3fecedd2b draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 01 Sep 2020 16:03:44 -0400 |
parents | ba33b94637ca |
children |
line wrap: on
line source
#import xlrd #avoid dep import argparse import re parser = argparse.ArgumentParser() parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") parser.add_argument("--ref", help="Reference file") parser.add_argument("--output", help="Output file") parser.add_argument("--id", help="ID to be used at the '>>>' line in the output") args = parser.parse_args() print "script_imgt.py" print "input:", args.input print "ref:", args.ref print "output:", args.output print "id:", args.id refdic = dict() with open(args.ref, 'rU') as ref: currentSeq = "" currentId = "" for line in ref: if line.startswith(">"): if currentSeq is not "" and currentId is not "": refdic[currentId[1:]] = currentSeq currentId = line.rstrip() currentSeq = "" else: currentSeq += line.rstrip() refdic[currentId[1:]] = currentSeq print "Have", str(len(refdic)), "reference sequences" vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#, # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", # r"(IGKV[0-3]D?-[0-9]{1,2})", # r"(IGLV[0-9]-[0-9]{1,2})", # r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)", # r"(TRGV[234589])", # r"(TRDV[1-3])"] #vPattern = re.compile(r"|".join(vPattern)) vPattern = re.compile("|".join(vPattern)) def filterGene(s, pattern): if type(s) is not str: return None res = pattern.search(s) if res: return res.group(0) return None currentSeq = "" currentId = "" first=True with open(args.input, 'r') as i: with open(args.output, 'a') as o: o.write(">>>" + args.id + "\n") outputdic = dict() for line in i: if first: first = False continue linesplt = line.split("\t") ref = filterGene(linesplt[1], vPattern) if not ref or not linesplt[2].rstrip(): continue if ref in outputdic: outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] else: outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] #print outputdic for k in outputdic.keys(): if k in refdic: o.write(">>" + k + "\n") o.write(refdic[k] + "\n") for seq in outputdic[k]: #print seq o.write(">" + seq[0] + "\n") o.write(seq[1] + "\n") else: print k + " not in reference, skipping " + k