Mercurial > repos > petr-novak > profrep
view profrep_masking.py @ 0:a5f1638b73be draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 26 Jun 2019 08:01:42 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python3
"""Mask repeat regions of FASTA sequences using intervals from a GFF file.

Regions listed in the GFF file are either converted to lowercase or
replaced with ``N``; the masked sequences are written out as FASTA.
"""
import argparse
import sys


def main(args):
    """Run the masking selected by the parsed command-line arguments.

    Args:
        args: Namespace providing ``query``, ``mode``, ``repeat_gff`` and
            ``output_masked`` attributes.
    """
    repeats_all = get_indices(args.repeat_gff)
    if args.mode == "lowercase":
        lower_mask(args.query, repeats_all, args.output_masked)
    else:
        N_mask(args.query, repeats_all, args.output_masked)


def get_indices(REPEAT_GFF):
    """Collect the repeat intervals to mask from a GFF file.

    Comment lines (starting with ``#``) and blank lines are skipped, and
    reading stops at a ``##FASTA`` directive because everything after it is
    embedded sequence data rather than feature lines.

    Args:
        REPEAT_GFF: Path to the tab-separated GFF file.

    Returns:
        dict mapping each sequence ID (column 1) to a list of
        ``[start, end]`` integer pairs taken from columns 4 and 5, in file
        order.

    Raises:
        ValueError: If a feature line has fewer than five columns or its
            coordinates are not integers.
    """
    repeats_all = {}
    with open(REPEAT_GFF, "r") as repeats_gff:
        for line_number, line in enumerate(repeats_gff, start=1):
            if line.startswith("##FASTA"):
                break
            if line.startswith("#") or not line.strip():
                continue
            # Split once instead of once per extracted column.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 5:
                raise ValueError(
                    "{}: line {} has {} tab-separated columns, "
                    "at least 5 are required".format(
                        REPEAT_GFF, line_number, len(fields)))
            repeats_all.setdefault(fields[0], []).append(
                [int(fields[3]), int(fields[4])])
    return repeats_all


def mask_sequence(sequence, intervals, transform):
    """Return ``sequence`` with the given intervals masked.

    Args:
        sequence: Sequence as a plain string.
        intervals: Iterable of ``(start, end)`` pairs, 1-based and inclusive.
            Pairs may overlap and may be given in any order.
        transform: Callable that receives the substring to mask and returns
            its replacement.

    Returns:
        The masked sequence as a new string.

    Coordinates are clamped to the sequence bounds, so an interval reaching
    past either end masks only the part that exists. Intervals whose end
    precedes their start are ignored.
    """
    length = len(sequence)
    # Convert to 0-based half-open spans and sort them so the sequence can be
    # rebuilt in a single left-to-right pass.
    spans = sorted(
        (max(start - 1, 0), min(end, length)) for start, end in intervals
    )
    pieces = []
    cursor = 0  # everything before this offset is already in ``pieces``
    for begin, stop in spans:
        # Skip the part of the span an earlier span has already covered.
        begin = max(begin, cursor)
        if stop <= begin:
            continue
        pieces.append(sequence[cursor:begin])
        pieces.append(transform(sequence[begin:stop]))
        cursor = stop
    pieces.append(sequence[cursor:])
    return "".join(pieces)


def _to_n(segment):
    """Return a run of ``N`` with the same length as ``segment``."""
    return "N" * len(segment)


def _mask_fasta(QUERY, repeats_all, MASKED, transform):
    """Mask every record of a FASTA file and write the result.

    Args:
        QUERY: Path to the input FASTA file.
        repeats_all: dict of sequence ID -> list of ``[start, end]`` pairs,
            as returned by ``get_indices``.
        MASKED: Path of the FASTA file to write.
        transform: Replacement callable passed on to ``mask_sequence``.
    """
    # Biopython is imported here rather than at module level so that the GFF
    # parsing and interval masking stay importable without it.
    from Bio import SeqIO
    from Bio.Seq import Seq

    # All records are read before the output is opened, so QUERY and MASKED
    # may refer to the same file.
    records = list(SeqIO.parse(QUERY, "fasta"))
    for record in records:
        # Sequences without any repeat in the GFF are written unchanged.
        intervals = repeats_all.get(record.id)
        if intervals:
            record.seq = Seq(
                mask_sequence(str(record.seq), intervals, transform))
    with open(MASKED, "w") as handle:
        SeqIO.write(records, handle, "fasta")


def lower_mask(QUERY, repeats_all, MASKED):
    """Write QUERY to MASKED with the repeat regions converted to lowercase.

    Args:
        QUERY: Path to the input FASTA file.
        repeats_all: dict of sequence ID -> list of ``[start, end]`` pairs.
        MASKED: Path of the FASTA file to write.
    """
    _mask_fasta(QUERY, repeats_all, MASKED, str.lower)


def N_mask(QUERY, repeats_all, MASKED):
    """Write QUERY to MASKED with the repeat regions replaced by ``N``.

    Args:
        QUERY: Path to the input FASTA file.
        repeats_all: dict of sequence ID -> list of ``[start, end]`` pairs.
        MASKED: Path of the FASTA file to write.
    """
    _mask_fasta(QUERY, repeats_all, MASKED, _to_n)


def _build_parser():
    """Create the command-line argument parser."""
    parser = argparse.ArgumentParser(
        description="Mask repeat regions of FASTA sequences listed in a "
                    "GFF file.")
    parser.add_argument('-q', '--query', type=str, required=True,
                        help='FASTA file with the sequence(s) to be masked')
    parser.add_argument('-rg', '--repeat_gff', type=str, required=True,
                        help='GFF file with the repeat regions to mask')
    parser.add_argument('-m', '--mode', default="lowercase",
                        choices=['lowercase', 'N'],
                        help='convert repeats to lowercase or replace them '
                             'with N (default: %(default)s)')
    parser.add_argument('-o', '--output_masked', type=str,
                        default="output_masked",
                        help='output FASTA file with the masked sequence(s) '
                             '(default: %(default)s)')
    return parser


if __name__ == "__main__":
    main(_build_parser().parse_args())