Mercurial > repos > petr-novak > profrep
diff profrep_masking.py @ 0:a5f1638b73be draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 26 Jun 2019 08:01:42 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/profrep_masking.py Wed Jun 26 08:01:42 2019 -0400 @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +import argparse +from Bio import SeqIO +from Bio.Seq import MutableSeq +from Bio.Alphabet import generic_dna +import sys + + +def main(args): + # Command line arguments + QUERY = args.query + MODE = args.mode + REPEAT_GFF = args.repeat_gff + MASKED = args.output_masked + + repeats_all = get_indices(REPEAT_GFF) + + if MODE == "lowercase": + lower_mask(QUERY, repeats_all, MASKED) + else: + N_mask(QUERY, repeats_all, MASKED) + + +def get_indices(REPEAT_GFF): + ''' + Get indices of repeats from GFF file to mask + ''' + repeats_all = {} + with open(REPEAT_GFF, "r") as repeats_gff: + for line in repeats_gff: + if not line.startswith("#"): + seq_id = line.split("\t")[0] + start_r = line.split("\t")[3] + end_r = line.split("\t")[4] + if seq_id in repeats_all.keys(): + repeats_all[seq_id].append([int(start_r), int(end_r)]) + else: + repeats_all[seq_id] = [[int(start_r), int(end_r)]] + return repeats_all + + +def lower_mask(QUERY, repeats_all, MASKED): + allSeqs = list(SeqIO.parse(QUERY, 'fasta')) + for singleSeq in allSeqs: + mutable = MutableSeq(str(singleSeq.seq), generic_dna) + for index in repeats_all[singleSeq.id]: + for item in range(index[0] - 1, index[1]): + mutable[item] = mutable[item].lower() + singleSeq.seq = mutable + with open(MASKED, "w") as handle: + SeqIO.write(allSeqs, handle, 'fasta') + + +def N_mask(QUERY, repeats_all, MASKED): + allSeqs = list(SeqIO.parse(QUERY, 'fasta')) + for singleSeq in allSeqs: + mutable = MutableSeq(str(singleSeq.seq), generic_dna) + for index in repeats_all[singleSeq.id]: + for item in range(index[0] - 1, index[1]): + mutable[item] = "N" + singleSeq.seq = mutable + with open(MASKED, "w") as handle: + SeqIO.write(allSeqs, handle, 'fasta') + + +if __name__ == "__main__": + + # Command line arguments + parser = argparse.ArgumentParser() + parser.add_argument('-q', + '--query', + type=str, + required=True, + help='query sequence to be processed') + parser.add_argument('-rg', + '--repeat_gff', + type=str, + required=True, + help='query sequence to be processed') + parser.add_argument('-m', + '--mode', + default="lowercase", + choices=['lowercase', 'N'], + help='query sequence to be processed') + parser.add_argument('-o', + '--output_masked', + type=str, + default="output_masked", + help='query sequence to be processed') + + args = parser.parse_args() + main(args)