0
+ − 1 #import xlrd #avoid dep
+ − 2 import argparse
+ − 3 import re
+ − 4
+ − 5 parser = argparse.ArgumentParser()
+ − 6 parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+ − 7 parser.add_argument("--ref", help="Reference file")
+ − 8 parser.add_argument("--output", help="Output file")
+ − 9 parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
+ − 10
+ − 11 args = parser.parse_args()
+ − 12
+ − 13 refdic = dict()
+ − 14 with open(args.ref, 'r') as ref:
+ − 15 currentSeq = ""
+ − 16 currentId = ""
+ − 17 for line in ref:
+ − 18 if line[0] is ">":
+ − 19 if currentSeq is not "" and currentId is not "":
+ − 20 refdic[currentId[1:]] = currentSeq
+ − 21 currentId = line.rstrip()
+ − 22 currentSeq = ""
+ − 23 else:
+ − 24 currentSeq += line.rstrip()
+ − 25 refdic[currentId[1:]] = currentSeq
+ − 26
+ − 27
+ − 28 vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
+ − 29 # r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
+ − 30 # r"(IGKV[0-3]D?-[0-9]{1,2})",
+ − 31 # r"(IGLV[0-9]-[0-9]{1,2})",
+ − 32 # r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
+ − 33 # r"(TRGV[234589])",
+ − 34 # r"(TRDV[1-3])"]
+ − 35
+ − 36 #vPattern = re.compile(r"|".join(vPattern))
+ − 37 vPattern = re.compile("|".join(vPattern))
+ − 38
+ − 39 def filterGene(s, pattern):
+ − 40 if type(s) is not str:
+ − 41 return None
+ − 42 res = pattern.search(s)
+ − 43 if res:
+ − 44 return res.group(0)
+ − 45 return None
+ − 46
+ − 47
+ − 48
+ − 49 currentSeq = ""
+ − 50 currentId = ""
+ − 51 first=True
+ − 52 with open(args.input, 'r') as i:
+ − 53 with open(args.output, 'a') as o:
+ − 54 o.write(">>>" + args.id + "\n")
+ − 55 outputdic = dict()
+ − 56 for line in i:
+ − 57 if first:
+ − 58 first = False
+ − 59 continue
+ − 60 linesplt = line.split("\t")
+ − 61 ref = filterGene(linesplt[1], vPattern)
+ − 62 if not ref or not linesplt[2].rstrip():
+ − 63 continue
+ − 64 if ref in outputdic:
+ − 65 outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+ − 66 else:
+ − 67 outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+ − 68 #print outputdic
+ − 69
+ − 70 for k in outputdic.keys():
+ − 71 if k in refdic:
+ − 72 o.write(">>" + k + "\n")
+ − 73 o.write(refdic[k] + "\n")
+ − 74 for seq in outputdic[k]:
+ − 75 #print seq
+ − 76 o.write(">" + seq[0] + "\n")
+ − 77 o.write(seq[1] + "\n")
+ − 78 else:
+ − 79 print k + " not in reference, skipping " + k