Mercurial > repos > davidvanzessen > shm_csr
diff baseline/script_imgt.py @ 67:ba33b94637ca draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 29 Jan 2019 03:54:09 -0500 |
parents | 8728284105ee |
children |
line wrap: on
line diff
--- a/baseline/script_imgt.py Thu Dec 07 03:44:38 2017 -0500 +++ b/baseline/script_imgt.py Tue Jan 29 03:54:09 2019 -0500 @@ -1,86 +1,86 @@ -#import xlrd #avoid dep -import argparse -import re - -parser = argparse.ArgumentParser() -parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") -parser.add_argument("--ref", help="Reference file") -parser.add_argument("--output", help="Output file") -parser.add_argument("--id", help="ID to be used at the '>>>' line in the output") - -args = parser.parse_args() - -print "script_imgt.py" -print "input:", args.input -print "ref:", args.ref -print "output:", args.output -print "id:", args.id - -refdic = dict() -with open(args.ref, 'rU') as ref: - currentSeq = "" - currentId = "" - for line in ref: - if line.startswith(">"): - if currentSeq is not "" and currentId is not "": - refdic[currentId[1:]] = currentSeq - currentId = line.rstrip() - currentSeq = "" - else: - currentSeq += line.rstrip() - refdic[currentId[1:]] = currentSeq - -print "Have", str(len(refdic)), "reference sequences" - -vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#, -# r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", -# r"(IGKV[0-3]D?-[0-9]{1,2})", -# r"(IGLV[0-9]-[0-9]{1,2})", -# r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)", -# r"(TRGV[234589])", -# r"(TRDV[1-3])"] - -#vPattern = re.compile(r"|".join(vPattern)) -vPattern = re.compile("|".join(vPattern)) - -def filterGene(s, pattern): - if type(s) is not str: - return None - res = pattern.search(s) - if res: - return res.group(0) - return None - - - -currentSeq = "" -currentId = "" -first=True -with open(args.input, 'r') as i: - with open(args.output, 'a') as o: - o.write(">>>" + args.id + "\n") - outputdic = dict() - for line in i: - if first: - first = False - continue - linesplt = line.split("\t") - ref = filterGene(linesplt[1], vPattern) - if not ref or not linesplt[2].rstrip(): - continue - if ref in outputdic: - outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] - else: - outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] - #print outputdic - - for k in outputdic.keys(): - if k in refdic: - o.write(">>" + k + "\n") - o.write(refdic[k] + "\n") - for seq in outputdic[k]: - #print seq - o.write(">" + seq[0] + "\n") - o.write(seq[1] + "\n") - else: - print k + " not in reference, skipping " + k +#import xlrd #avoid dep +import argparse +import re + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence") +parser.add_argument("--ref", help="Reference file") +parser.add_argument("--output", help="Output file") +parser.add_argument("--id", help="ID to be used at the '>>>' line in the output") + +args = parser.parse_args() + +print "script_imgt.py" +print "input:", args.input +print "ref:", args.ref +print "output:", args.output +print "id:", args.id + +refdic = dict() +with open(args.ref, 'rU') as ref: + currentSeq = "" + currentId = "" + for line in ref: + if line.startswith(">"): + if currentSeq is not "" and currentId is not "": + refdic[currentId[1:]] = currentSeq + currentId = line.rstrip() + currentSeq = "" + else: + currentSeq += line.rstrip() + refdic[currentId[1:]] = currentSeq + +print "Have", str(len(refdic)), "reference sequences" + +vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#, +# r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)", +# r"(IGKV[0-3]D?-[0-9]{1,2})", +# r"(IGLV[0-9]-[0-9]{1,2})", +# r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)", +# r"(TRGV[234589])", +# r"(TRDV[1-3])"] + +#vPattern = re.compile(r"|".join(vPattern)) +vPattern = re.compile("|".join(vPattern)) + +def filterGene(s, pattern): + if type(s) is not str: + return None + res = pattern.search(s) + if res: + return res.group(0) + return None + + + +currentSeq = "" +currentId = "" +first=True +with open(args.input, 'r') as i: + with open(args.output, 'a') as o: + o.write(">>>" + args.id + "\n") + outputdic = dict() + for line in i: + if first: + first = False + continue + linesplt = line.split("\t") + ref = filterGene(linesplt[1], vPattern) + if not ref or not linesplt[2].rstrip(): + continue + if ref in outputdic: + outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] + else: + outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())] + #print outputdic + + for k in outputdic.keys(): + if k in refdic: + o.write(">>" + k + "\n") + o.write(refdic[k] + "\n") + for seq in outputdic[k]: + #print seq + o.write(">" + seq[0] + "\n") + o.write(seq[1] + "\n") + else: + print k + " not in reference, skipping " + k