Mercurial > repos > chrisb > gap_all_glycan_tools
diff manipulate/rename_kcf/rename_kcf.py @ 0:89592faa2875 draft
Uploaded
author | chrisb |
---|---|
date | Wed, 23 Mar 2016 14:35:56 -0400 |
parents | |
children |
line wrap: on
line diff
"""Rename the ENTRY line of every glycan in a (multi-entry) KCF file.

Source: manipulate/rename_kcf/rename_kcf.py (changeset 0:89592faa2875).
"""
import itertools

__author__ = "Chris Barnett"
__version__ = "0.3"
__license__ = "MIT"


class id_generator(object):
    """Hand out consecutive integers, starting at ``counterinit``."""

    def __init__(self, counterinit=0):
        """
        :param counterinit: first value returned by :meth:`next`. 0 by default.
        """
        self.generator = itertools.count(counterinit)

    def next(self):
        """
        :return: the next integer in the sequence
        """
        # The builtin next() works on Python 2.6+ and 3; the iterator's own
        # .next() method only exists on Python 2.
        return next(self.generator)


def read_meta_kcf(inputstream, prefix="GLY", counterinit=0):
    """
    Read a kcf file (which may contain multiple kcf entries) and rename each ENTRY.

    Often the ENTRY is too long or is linearcode, and kcf files then are not
    recognised properly and/or are ignored in MCAW and other analysis tools.
    Duplicates are not checked for. Entries are named prefix + x, where x comes
    from a counter that starts at counterinit.

    Any line containing "ENTRY" starts a new entry; a line containing "///"
    terminates the current one. Lines seen before the first ENTRY are ignored.
    Lines between a "///" and the next ENTRY stay attached to the entry that
    was just terminated, so they are preserved in the output. An entry that is
    never terminated (no "///" before the next ENTRY or before end of input) is
    still returned, unchanged, without a terminator being added.

    :param inputstream: the kcf file (any iterable of lines)
    :param prefix: the prefix for the entry. GLY by default. keep it short
    :param counterinit: entries are numbered starting at counterinit. 0 by default.
    :return: a list with one item per glycan, each item being a list of lines
    :raises IOError: if inputstream is None, an empty list or an empty string
    """
    if inputstream is None or inputstream == [] or inputstream == "":
        raise IOError("empty input stream")

    counter = id_generator(counterinit)
    list_of_kcf_paragraphs = []
    kcfpara = None
    # True once kcfpara has been appended to list_of_kcf_paragraphs; prevents
    # storing the same entry twice and lets unterminated entries be flushed.
    stored = False

    for line in inputstream:
        if "ENTRY" in line:
            # Previous entry had no "///": keep it instead of dropping it.
            if kcfpara is not None and not stored:
                list_of_kcf_paragraphs.append(kcfpara)
            # Could strip and split the line and remake it, but it is easier
            # to supplant it.
            newline = "ENTRY " + str(prefix) + str(counter.next()) + " Glycan\n"
            kcfpara = [newline]
            stored = False
        elif kcfpara is None:
            # Nothing to attach this line to yet (including a stray "///").
            continue
        else:
            kcfpara.append(line)
            if "///" in line and not stored:
                list_of_kcf_paragraphs.append(kcfpara)
                stored = True

    # Sometimes the final kcf entry has no "///"; make sure it is kept.
    if kcfpara is not None and not stored:
        list_of_kcf_paragraphs.append(kcfpara)

    # Why a list of lists: it is easier to deal with each glycan as an
    # individual item in the list.
    return list_of_kcf_paragraphs


def flatten_meta_kcf_list(metakcflist):
    """
    Join the per-glycan line lists back into a single string.

    :param metakcflist: a list containing lists of strings
    :return: combined kcfs as a large string for saving to file
    """
    return "".join(itertools.chain.from_iterable(metakcflist))


def main():
    """Command-line entry point: read a kcf file, rename its entries, write the result."""
    from optparse import OptionParser

    usage = "usage: python %prog [options]\n"
    parser = OptionParser(usage=usage)
    parser.add_option("-i", action="store", type="string", dest="i", default="input",
                      help="input kcf file (input)")
    parser.add_option("-o", action="store", type="string", dest="o", default="output",
                      help="output kcf file (output)")
    parser.add_option("-p", action="store", type="string", dest="p", default="GLY",
                      help="prefix for glycan entry name change")
    parser.add_option("-c", action="store", type="int", dest="c", default=0,
                      help="starting number for counter for glycan entry")
    (options, args) = parser.parse_args()

    # Context managers close both files even if the conversion raises.
    with open(options.i, "r") as instream:
        convertedkcf = read_meta_kcf(instream, prefix=options.p, counterinit=options.c)
    with open(options.o, "w") as f:
        f.write(flatten_meta_kcf_list(convertedkcf))


if __name__ == "__main__":
    main()