import argparse
from collections import defaultdict

import xlrd
# Zero-based column indexes (as used by xlrd) of the fields read from each sheet.
# NOTE(review): the original help text said the sequence lives in column J,
# but index 8 is column I. The help text below is now derived from these
# constants so it always describes what the code actually reads; confirm
# which column is intended.
gene_column = 6
id_column = 7
seq_column = 8


def column_letter(index):
    """Return the spreadsheet column letter(s) for a zero-based column index.

    Examples: 0 -> "A", 6 -> "G", 25 -> "Z", 26 -> "AA".
    """
    letters = ""
    index += 1
    while index > 0:
        index, remainder = divmod(index - 1, 26)
        letters = chr(ord("A") + remainder) + letters
    return letters


def parse_args(argv=None):
    """Parse the command line options.

    Args:
        argv: Optional list of argument strings; ``None`` means ``sys.argv[1:]``.

    Returns:
        The argparse namespace with ``input``, ``ref`` and ``output`` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input",
        required=True,
        help="Excel input file containing one or more sheets where column {0} "
             "has the gene annotation, {1} has the sequence id and {2} has the "
             "sequence".format(
                 column_letter(gene_column),
                 column_letter(id_column),
                 column_letter(seq_column),
             ),
    )
    parser.add_argument("--ref", required=True, help="Reference file")
    parser.add_argument("--output", required=True, help="Output file")
    return parser.parse_args(argv)


def store_record(references, record_id, chunks):
    """Add one parsed record to ``references`` unless its id or sequence is empty."""
    sequence = "".join(chunks)
    if record_id and sequence:
        references[record_id] = sequence


def read_reference(path):
    """Read a FASTA-style reference file into a dict.

    A line starting with ">" opens a new record; the rest of that line (with
    trailing whitespace stripped) is the key. All following lines up to the
    next ">" line are stripped and concatenated into the value. Records with
    an empty id or an empty sequence are ignored, so an empty file yields an
    empty dict. When an id occurs more than once the last record wins.

    Args:
        path: Path of the reference file.

    Returns:
        A dict mapping record id to sequence.
    """
    references = {}
    current_id = ""
    chunks = []
    with open(path, "r") as handle:
        for line in handle:
            if line.startswith(">"):
                store_record(references, current_id, chunks)
                current_id = line.rstrip()[1:]
                chunks = []
            else:
                chunks.append(line.rstrip())
    # The last record has no following header line to trigger its storage.
    store_record(references, current_id, chunks)
    return references


def has_gene_column(sheet):
    """Tell whether the second row of ``sheet`` holds an "IGHV" gene name.

    Sheets with fewer than two rows, too few columns, or a non-text value in
    the gene column are reported as not having the gene column instead of
    raising.
    """
    if sheet.nrows < 2 or sheet.ncols <= gene_column:
        return False
    value = sheet.cell(1, gene_column).value
    # Numeric/empty cells have no find(); treat them as "no gene here".
    return hasattr(value, "find") and value.find("IGHV") >= 0


def group_rows_by_gene(sheet):
    """Group the rows of ``sheet`` (skipping row 0) by gene annotation.

    Returns:
        A dict mapping gene name (with ">" removed) to a list of
        ``(sequence id, sequence)`` tuples in row order.
    """
    grouped = defaultdict(list)
    for row in range(1, sheet.nrows):
        gene = sheet.cell(row, gene_column).value.replace(">", "")
        seq_id = sheet.cell(row, id_column).value.replace(">", "")
        sequence = sheet.cell(row, seq_column).value
        grouped[gene].append((seq_id, sequence))
    return grouped


def write_sheet(sheet, references, out):
    """Write one sheet to ``out``.

    The output is a ">>>sheet name" line, then for every gene found in
    ``references`` a ">>gene" line, the reference sequence, and the
    ">id" / sequence pairs of that gene. Genes missing from ``references``
    are reported on stdout and skipped.
    """
    out.write(">>>" + sheet.name + "\n")
    grouped = group_rows_by_gene(sheet)
    for gene in grouped:
        if gene not in references:
            print(gene + " not in reference, skipping " + gene)
            continue
        out.write(">>" + gene + "\n")
        out.write(references[gene] + "\n")
        for seq_id, sequence in grouped[gene]:
            out.write(">" + seq_id + "\n")
            out.write(sequence + "\n")


def main(argv=None):
    """Convert every usable sheet of the input workbook and append it to the output file.

    Args:
        argv: Optional list of argument strings; ``None`` means ``sys.argv[1:]``.
    """
    args = parse_args(argv)
    references = read_reference(args.ref)
    # open_workbook's second positional parameter is a log file object, not a
    # file mode, so only the filename is passed.
    with xlrd.open_workbook(args.input) as wb:
        with open(args.output, "a") as out:
            for sheet in wb.sheets():
                if not has_gene_column(sheet):
                    print("Genes not in column " + column_letter(gene_column)
                          + ", skipping sheet " + sheet.name)
                    continue
                write_sheet(sheet, references, out)


if __name__ == "__main__":
    main()